@@ -3,6 +3,7 @@ import createSpinner from 'ora';
3
3
import fs from 'fs-extra' ;
4
4
import parseCategory from '../utilities/parseCategory.js' ;
5
5
import readNDJSON from '../utilities/readNDJSON.js' ;
6
+ import writeNDJSON from '../utilities/writeNDJSON.js' ;
6
7
import removeParentheticals from '../utilities/removeParentheticals.js' ;
7
8
import sortKeys from 'sort-keys' ;
8
9
@@ -107,7 +108,7 @@ function convertEntry({
107
108
wn_domains,
108
109
} ;
109
110
110
- if ( dataSources . CW . htmlInfo ) {
111
+ if ( dataSources . CW ? .htmlInfo ) {
111
112
linguistInfo . analysis = dataSources . CW . htmlInfo ;
112
113
}
113
114
@@ -279,6 +280,65 @@ export default async function dlx2importjson(inputPath, outputPath = `out.json`)
279
280
280
281
}
281
282
283
+ await validateImportJson ( entries , createSpinner ( 'Verifying correct structure of importjson file.' ) . start ( ) ) ;
284
+
285
+
282
286
return entries ;
283
287
284
288
}
289
+
290
/**
 * Find the values that occur more than once in a list.
 *
 * Values are compared using Map key equality, so primitives match by value
 * and objects match by identity.
 *
 * @param {Array} list - The values to inspect.
 * @returns {Array<[*, number[]]>} One `[value, indices]` pair for every value
 *   that appears at least twice, in order of first occurrence. `indices` holds
 *   every position at which that value appears in `list`.
 */
function getDuplicateEntries(list) {
  const positions = new Map();

  list.forEach((entry, index) => {
    if (positions.has(entry)) {
      positions.get(entry).push(index);
    } else {
      positions.set(entry, [index]);
    }
  });

  // Map#entries() yields an iterator, and iterator `.filter()` is only
  // available in runtimes that ship Iterator Helpers. Materialize the pairs
  // into an array first so this works everywhere and callers get a real array.
  return Array.from(positions.entries()).filter(([, indices]) => indices.length > 1);
}
301
+
302
/**
 * Verify that the entries generated in the importjson are consistent with the constraints expected by morphodict.
 *
 * Steps:
 *  1. Fail (without throwing) when two entries share the same slug.
 *  2. Group entries that share a wordform key (the slug, or `head` / `head of formOf`
 *     when there is no slug) and write those groups to `data/brittle-entries.json`.
 *  3. Throw when any group has duplicated target-language keywords; otherwise
 *     warn about the brittle groups (if any) and mark the spinner as succeeded.
 *
 * @param {Array<Object>} entries - The importjson entries to validate.
 * @param {Object} spinner - A started spinner exposing `fail()` and `succeed()`.
 * @returns {Promise<*>} Whatever `spinner.fail()` / `spinner.succeed()` returns.
 * @throws {Error} When some group of entries has non-unique target-language keywords.
 */
async function validateImportJson(entries, spinner) {
  /** in "lexicon_sourcelanguagekeyword" CONSTRAINT "source_kw_text_and_wordform" UNIQUE ("text", "wordform_id")); */
  /** in "lexicon_targetlanguagekeyword" CONSTRAINT "target_kw_text_and_wordform" UNIQUE ("text", "wordform_id")) */
  /** slugs must be unique */

  const slugs = entries.map((entry) => entry.slug).filter((slug) => slug);
  const duplicateSlugs = Array.from(getDuplicateEntries(slugs));

  if (duplicateSlugs.length > 0) {
    console.log(`ERROR: MULTIPLE ENTRIES WITH SAME KEY PRESENT\n${duplicateSlugs.toString()}`);
    // NOTE(review): this path reports failure but does not throw, unlike the
    // keyword check below — confirm callers are meant to continue here.
    return spinner.fail('Importjson will not import correctly into morphodict.');
  }

  // Key used to decide that two entries describe the same wordform.
  const wordformKey = (entry) => {
    if (entry.slug) {
      return entry.slug;
    }
    return entry.formOf ? `${entry.head} of ${entry.formOf}` : entry.head;
  };

  // Pairs of [wordformKey, entriesSharingThatKey] for every key used more than once.
  const groupByWordform = Array.from(getDuplicateEntries(entries.map(wordformKey)))
    .map(([key, indices]) => [key, indices.map((index) => entries[index])]);

  // True when the space-separated words of the group's definitions repeat.
  const targetLanguageKeywordsNotUnique = (group) => {
    const keywords = group.flatMap((entry) => entry.senses.flatMap((sense) => (
      // A sense without any definition contributes no keywords instead of crashing;
      // empty tokens (from consecutive spaces) are not keywords either.
      (sense.semanticDefinition || sense.definition || '').split(' ').filter(Boolean)
    )));
    // NOTE(review): `> 1` requires at least two distinct repeated keywords; a single
    // repeated keyword is not flagged. Kept as in the original — confirm this is intended.
    return Array.from(getDuplicateEntries(keywords)).length > 1;
  };

  const guaranteedToFailConstraints = groupByWordform
    .filter(([, group]) => targetLanguageKeywordsNotUnique(group));

  await writeNDJSON(`data/brittle-entries.json`, groupByWordform);

  if (guaranteedToFailConstraints.length > 0) {
    // Print the wordform keys; stringifying the entry objects would only yield "[object Object]".
    const offendingKeys = guaranteedToFailConstraints.map(([key]) => key).join('\n');
    console.log(`\n\nERROR: SOME ENTRIES HAVE DEFINITIONS WHOSE TargetLanguageKeyword ENTRIES ARE NOT UNIQUE!\n${offendingKeys}`);
    spinner.fail('Importjson will not import correctly into morphodict');
    throw new Error('Importjson validation failed: TargetLanguageKeyword entries are not unique for some wordforms.');
  }

  // Only warn when there actually are brittle groups, and point at the file written above.
  if (groupByWordform.length > 0) {
    console.log([
      'Warning: There are brittle entries in this importjson where small changes in definitions may break the import process',
      'Please see the documentation and the data/brittle-entries.json file',
    ].join('.'));
  }

  return spinner.succeed('Importjson should work with morphodict.');
}
0 commit comments