Skip to content

Commit 26f9a23

Browse files
committed
Added validation process for #126
1 parent 841d2a3 commit 26f9a23

File tree

1 file changed

+61
-1
lines changed

1 file changed

+61
-1
lines changed

lib/convert/dlx2importjson.js

+61-1
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ import createSpinner from 'ora';
33
import fs from 'fs-extra';
44
import parseCategory from '../utilities/parseCategory.js';
55
import readNDJSON from '../utilities/readNDJSON.js';
6+
import writeNDJSON from '../utilities/writeNDJSON.js';
67
import removeParentheticals from '../utilities/removeParentheticals.js';
78
import sortKeys from 'sort-keys';
89

@@ -107,7 +108,7 @@ function convertEntry({
107108
wn_domains,
108109
};
109110

110-
if(dataSources.CW.htmlInfo) {
111+
if(dataSources.CW?.htmlInfo) {
111112
linguistInfo.analysis = dataSources.CW.htmlInfo;
112113
}
113114

@@ -279,6 +280,65 @@ export default async function dlx2importjson(inputPath, outputPath = `out.json`)
279280

280281
}
281282

283+
await validateImportJson(entries, createSpinner('Verifying correct structure of importjson file.').start());
284+
285+
282286
return entries;
283287

284288
}
289+
290+
/**
 * Find the values that occur more than once in a list.
 *
 * Values are compared using Map key equality, so strings compare by value
 * and objects by identity.
 *
 * @param {Array<*>} list - The values to inspect.
 * @returns {Array<[*, number[]]>} One `[value, indexes]` pair for every value
 *   that appears at least twice, where `indexes` are the positions of that
 *   value in `list`. Pairs are ordered by first occurrence.
 */
function getDuplicateEntries(list) {
  const indexesByValue = new Map();

  list.forEach((value, index) => {
    const indexes = indexesByValue.get(value);
    if (indexes) {
      indexes.push(index);
    } else {
      indexesByValue.set(value, [index]);
    }
  });

  // Map#entries() yields an iterator, and `.filter` on an iterator is only
  // available on runtimes that ship iterator helpers. Spreading into an array
  // first keeps this working everywhere; callers that wrap the result in
  // Array.from() are unaffected.
  return [...indexesByValue].filter(([, indexes]) => indexes.length > 1);
}
301+
302+
/**
 * Verify that the entries generated in the importjson are consistent with the constraints expected by morphodict.
 *
 * Constraints this check is written against:
 * - in "lexicon_sourcelanguagekeyword": CONSTRAINT "source_kw_text_and_wordform" UNIQUE ("text", "wordform_id")
 * - in "lexicon_targetlanguagekeyword": CONSTRAINT "target_kw_text_and_wordform" UNIQUE ("text", "wordform_id")
 * - slugs must be unique
 *
 * Side effect: groups of entries that share a wordform key are written to
 * `data/brittle-entries.json` with `writeNDJSON`.
 *
 * @param {Array<Object>} entries - The generated entries. This function reads
 *   `slug`, `head`, `formOf` and `senses[].semanticDefinition` / `senses[].definition`.
 * @param {Object} spinner - Progress spinner; its `fail(text)` and `succeed(text)` methods are called.
 * @returns {Promise<*>} The result of `spinner.fail(...)` when duplicate slugs
 *   are found, otherwise the result of `spinner.succeed(...)`.
 * @throws {Error} When groups of entries sharing a wordform have repeated definition keywords.
 */
async function validateImportJson ( entries, spinner) {
  // Only entries that actually carry a slug take part in the slug uniqueness check.
  const duplicateSlugs = Array.from(getDuplicateEntries(
    entries.map(entry => entry.slug).filter(slug => slug)
  ));

  if (duplicateSlugs.length > 0) {
    console.log(`ERROR: MULTIPLE ENTRIES WITH SAME KEY PRESENT\n${duplicateSlugs.toString()}`);
    return spinner.fail('Importjson will not import correctly into morphodict.');
  }

  // Entries are keyed by slug, falling back to "<head> of <formOf>" or just the head.
  const wordformKey = entry => entry.slug || (entry.formOf ? entry.head+' of '+entry.formOf : entry.head);

  // [key, entries[]] pairs for every key shared by more than one entry.
  const groupByWordform = Array.from(getDuplicateEntries(entries.map(wordformKey)))
    .map(([key, indexes]) => [key, indexes.map(i => entries[i])]);

  // True when the space-separated words of all definitions in the group repeat.
  const targetLanguageKeywordsNotUnique = group => {
    const keywords = group.flatMap(
      entry => entry.senses.flatMap(sense => (sense.semanticDefinition || sense.definition).split(' '))
    );
    // NOTE(review): `> 1` requires at least two distinct repeated keywords; a
    // single repeated keyword is not reported. Confirm whether `> 0` was intended.
    return Array.from(getDuplicateEntries(keywords)).length > 1;
  };

  const guaranteedToFailConstraints = groupByWordform
    .filter(([, group]) => targetLanguageKeywordsNotUnique(group));

  await writeNDJSON(`data/brittle-entries.json`, groupByWordform);

  if (guaranteedToFailConstraints.length > 0) {
    // List the wordform keys: stringifying the groups themselves would print
    // every entry object as "[object Object]".
    const failingKeys = guaranteedToFailConstraints.map(([key]) => key).join('\n');
    console.log(`\n\nERROR: SOME ENTRIES HAVE DEFINITIONS WHOSE TargetLanguageKeyword ENTRIES ARE NOT UNIQUE!\n${failingKeys}`);
    spinner.fail('Importjson will not import correctly into morphodict');
    throw new Error('Importjson validation failed: TargetLanguageKeyword entries are not unique for some wordforms.');
  }

  // Only warn when there really are groups of entries sharing a wordform.
  if (groupByWordform.length > 0) {
    console.log([
      'Warning: There are brittle entries in this importjson where small changes in definitions may break the import process',
      'Please see the documentation and the data/brittle-entries.json file'
    ].join('. '));
  }

  return spinner.succeed('Importjson should work with morphodict.');
}

0 commit comments

Comments
 (0)