Skip to content

Commit 9278ba4

Browse files
authored
merge similar definitions (#100)
1 parent c05372a commit 9278ba4

7 files changed

+583
-63
lines changed

lib/aggregate/index.js

+10-9
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
1-
import createSpinner from 'ora';
2-
import DatabaseIndex from '../utilities/DatabaseIndex.js';
3-
import { fileURLToPath } from 'url';
4-
import parseAnalysis from '../utilities/parseAnalysis.js';
5-
import parseCategory from '../utilities/parseCategory.js';
6-
import readNDJSON from '../utilities/readNDJSON.js';
7-
import { Transducer } from 'hfstol';
8-
import writeNDJSON from '../utilities/writeNDJSON.js';
1+
import aggregateDefinitions from '../utilities/aggregateDefinitions.js';
2+
import createSpinner from 'ora';
3+
import DatabaseIndex from '../utilities/DatabaseIndex.js';
4+
import { fileURLToPath } from 'url';
5+
import parseAnalysis from '../utilities/parseAnalysis.js';
6+
import parseCategory from '../utilities/parseCategory.js';
7+
import readNDJSON from '../utilities/readNDJSON.js';
8+
import { Transducer } from 'hfstol';
9+
import writeNDJSON from '../utilities/writeNDJSON.js';
910

1011
import {
1112
dirname as getDirname,
@@ -249,7 +250,7 @@ function aggregateEntry(entry) {
249250
}
250251

251252
// NOTE: Currently not displaying MD senses for programmatic matches.
252-
// TODO: Use a bag-of-words approach to decide which MD senses to display.
253+
entry.senses = aggregateDefinitions(entry.senses);
253254

254255
return entry;
255256

lib/convert/dlx2importjson.js

+8-54
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
1-
import compare from '../utilities/compare.js';
2-
import createSpinner from 'ora';
3-
import fs from 'fs-extra';
4-
import parseCategory from '../utilities/parseCategory.js';
5-
import readNDJSON from '../utilities/readNDJSON.js';
6-
import sortKeys from 'sort-keys';
1+
import compare from '../utilities/compare.js';
2+
import createSpinner from 'ora';
3+
import fs from 'fs-extra';
4+
import parseCategory from '../utilities/parseCategory.js';
5+
import readNDJSON from '../utilities/readNDJSON.js';
6+
import removeParentheticals from '../utilities/removeParentheticals.js';
7+
import sortKeys from 'sort-keys';
78

89
const instrRegExp = /^by\s+/u;
910
const latinNoteRegExp = /[ei]\.\s*[eg]\./iu;
@@ -12,24 +13,6 @@ const latinNoteRegExp = /[ei]\.\s*[eg]\./iu;
1213
// This prevents matches on things like "house(s)".
1314
const parentheticalRegExp = /(?<parenthetical>\s+\(\s*(?<parenText>.+?)\s*\)\s*)/gu;
1415

15-
// Parenthetical texts that createCoreDefinition leaves in place instead of stripping.
// Membership is an exact string match (Set#has) against the text captured inside the parentheses.
const EnglishPronouns = new Set([
16-
`his/her own`,
17-
`him/herself`,
18-
`it as`,
19-
`it/him`,
20-
`it`,
21-
`of it`,
22-
`of something`,
23-
`on s.t.`,
24-
`s.o. as`,
25-
`s.t.`,
26-
`something`,
27-
`that`,
28-
`them`,
29-
`to it/him`,
30-
`to something`,
31-
]);
32-
3316
const EnglishAbbrevs = [
3417
`s\\.o\\.`,
3518
`s\\.t\\.`,
@@ -143,7 +126,7 @@ function convertEntry({
143126

144127
const isPronoun = pos === `PrA`;
145128
const displayDefinition = original;
146-
const coreDefinition = createCoreDefinition(definition);
129+
const coreDefinition = removeParentheticals(definition);
147130

148131
const semanticDefinition = createSemanticDefinition(definition, {
149132
i,
@@ -191,35 +174,6 @@ function convertEntry({
191174

192175
}
193176

194-
/**
 * Creates the core definition for an entry by stripping parentheticals from a definition.
 * Parentheticals whose text starts with "by" (instrumentals) or is listed in `EnglishPronouns` are kept; all others (including "e.g." and "i.e." parentheticals) are replaced with a single space.
 * @param  {String} definition The definition to strip parentheticals from.
 * @return {String}            The definition with whitespace collapsed, spaces before commas removed, and the ends trimmed.
 */
function createCoreDefinition(definition) {

  let coreDefinition = definition;
  let previous;

  // NOTE: The previous implementation called `exec()` on the global regular expression
  // while reassigning the string being scanned. After a removal, `lastIndex` still pointed
  // into the old (longer) string, so a parenthetical following a removed one was skipped.
  // `replace()` with a callback scans one fixed string per pass and resets `lastIndex` itself.
  // The pass is repeated until nothing changes because each match also consumes the
  // whitespace after it, which can hide an immediately adjacent parenthetical from that pass.
  do {

    previous = coreDefinition;

    coreDefinition = previous.replace(parentheticalRegExp, (parenthetical, wholeGroup, parenText) => {

      const isAllowed = instrRegExp.test(parenText) // allow "by" phrases (instrumentals)
        || EnglishPronouns.has(parenText);          // allow pronouns

      // kept parentheticals are returned unchanged, so a pass that removes nothing ends the loop
      return isAllowed ? parenthetical : ` `;

    });

  } while (coreDefinition !== previous);

  return coreDefinition
    .replace(/\s{2,}/gu, ` `)
    .replace(/\s+,/gu, `,`)
    .trim();

}
222-
223177
function createSemanticDefinition(definition, { isPronoun, literalMeaning, notes, scientificName }) {
224178

225179
let semanticDefinition = definition;

lib/utilities/aggregateDefinitions.js

+109
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
/* eslint-disable
2+
no-param-reassign,
3+
*/
4+
5+
import normalizeDefinition from './normalizeDefinition.js';
6+
7+
const defaultThreshold = 0.8;
8+
9+
/**
 * Compares two definitions and returns the minimum similarity value for the two definitions, between 0 and 1, 0 being completely dissimilar, and 1 being completely similar.
 * Similarity is measured over the sets of distinct whitespace-separated words: a repeated word counts once, and leading / trailing whitespace does not produce a word.
 * Two definitions that both contain no words are treated as identical (1); if only one of them contains no words the result is 0.
 * @param  {String} a The first definition.
 * @param  {String} b The second definition.
 * @return {Number}
 */
function getSimilarity(a, b) {

  // NOTE: Both the shared-word count and the per-definition totals use distinct words.
  // Counting every token in the totals made "the the cat" vs. "the cat" score 0.67
  // even though each word of either definition appears in the other.
  const aWords = new Set(a.split(/\s+/u).filter(Boolean)); // distinct words in definition A
  const bWords = new Set(b.split(/\s+/u).filter(Boolean)); // distinct words in definition B

  // avoid 0 / 0 when a definition has no words
  if (aWords.size === 0 || bWords.size === 0) {
    return aWords.size === bWords.size ? 1 : 0;
  }

  const sharedCount = intersection(aWords, bWords).length; // # of distinct words contained in both definitions
  const aOverlap    = sharedCount / aWords.size;           // % of words in A that are also in B
  const bOverlap    = sharedCount / bWords.size;           // % of words in B that are also in A

  return Math.min(aOverlap, bOverlap);

}

/**
 * Returns the distinct items of `a` that also occur in `b`, in the order they first appear in `a`.
 * @param  {Iterable} a The items to filter.
 * @param  {Iterable} b The items to look for.
 * @return {Array}
 */
function intersection(a, b) {
  const lookup = new Set(b);
  return Array.from(new Set(a)).filter(item => lookup.has(item));
}
28+
29+
/**
 * Given a set of definitions, this function determines which definitions are sufficiently similar that one can be omitted.
 * Sources are visited in order of precedence. The definitions of the first source that has any are all selected as-is; each definition from a later source is compared to the definitions selected so far and, when one of them is similar enough, only its sources are added to that selected definition.
 * @param  {Array}  definitions           An Array of definitions to compare. Each definition must be Object with `sources` and `definition` properties.
 * @param  {Object} [options={}]          An options Object.
 * @param  {Array}  [options.precedence]  An Array of data sources in order of precedence. If omitted, the sources are used in the order they first appear in the definitions.
 * @param  {Number} [options.threshold=0.8] The minimum threshold of matching words for two definitions to be considered the same, as a percentage between 0 and 1.
 * @returns {Array} A new Array containing the selected definition Objects (the same Objects that were passed in). The `sources` Array of a selected definition is extended in place with the sources of the definitions merged into it.
 * @throws {Error} If a definition lists a source that is not in the precedence list.
 */
export default function aggregateDefinitions(definitions, options = {}) {

  // NOTE: The original definitions Array is never altered, and no temporary properties
  // are written to the definition Objects. Only the `sources` Array of a selected
  // definition is changed (when another definition is merged into it).
  let { precedence } = options;

  if (!precedence) {
    precedence = Array.from(new Set(definitions.flatMap(({ sources }) => sources)));
  }

  // normalized text for each definition, keyed by the definition Object
  const normalizedDefinitions = new Map();

  for (const def of definitions) {

    for (const source of def.sources) {
      if (!precedence.includes(source)) {
        throw new Error(`Unrecognized source: ${ source } in ${ def.definition }`);
      }
    }

    normalizedDefinitions.set(def, normalizeDefinition(def.definition));

  }

  const threshold         = options.threshold ?? defaultThreshold;
  const mergedDefinitions = [];

  // NOTE: A definition with several sources shows up once per source in the loop below.
  // It must only be processed the first time; otherwise its sources are appended to the
  // matching definition again on every later pass (or appended to some other selected
  // definition even though it was already selected itself).
  const handledDefinitions = new Set();

  for (const src of precedence) {

    // get definitions for the current source
    const defs = definitions.filter(({ sources }) => sources.includes(src));

    // if there aren't yet any selected definitions, the current ones are added without being compared to each other
    const isFirstBatch = mergedDefinitions.length === 0;

    for (const currentDefinition of defs) {

      if (handledDefinitions.has(currentDefinition)) continue;
      handledDefinitions.add(currentDefinition);

      if (isFirstBatch) {
        mergedDefinitions.push(currentDefinition);
        continue;
      }

      // find the first selected definition that is similar enough to the current one
      const currentNormalized  = normalizedDefinitions.get(currentDefinition);
      const existingDefinition = mergedDefinitions.find(selected => {
        const similarity = getSimilarity(currentNormalized, normalizedDefinitions.get(selected));
        return similarity >= threshold;
      });

      // no match: the definition is distinct, so select it
      if (!existingDefinition) {
        mergedDefinitions.push(currentDefinition);
        continue;
      }

      // match: omit the definition but record where it came from, without duplicating sources
      for (const source of currentDefinition.sources) {
        if (!existingDefinition.sources.includes(source)) {
          existingDefinition.sources.push(source);
        }
      }

    }

  }

  return mergedDefinitions;

}

0 commit comments

Comments
 (0)