|
| 1 | +/* eslint-disable |
| 2 | + no-param-reassign, |
| 3 | +*/ |
| 4 | + |
| 5 | +import normalizeDefinition from './normalizeDefinition.js'; |
| 6 | + |
| 7 | +const defaultThreshold = 0.8; |
| 8 | + |
/**
 * Compares two definitions and returns a similarity score between 0 (no words in common) and 1 (same set of words).
 * The score is the smaller of two ratios: the share of A's distinct words that also occur in B,
 * and the share of B's distinct words that also occur in A.
 * Words are compared as distinct, whitespace-separated tokens, so repeated words and
 * leading/trailing whitespace do not lower the score.
 * @param {String} a The first definition.
 * @param {String} b The second definition.
 * @return {Number} The similarity, between 0 and 1. Two definitions that contain no words at all are treated as identical (1).
 */
function getSimilarity(a, b) {

  const aWords = getUniqueWords(a); // distinct words in definition A
  const bWords = getUniqueWords(b); // distinct words in definition B
  const largest = Math.max(aWords.length, bWords.length);

  // Neither definition has any words: avoid 0 / 0 and report them as identical.
  if (largest === 0) return 1;

  // min(shared / |A|, shared / |B|) is the same as shared / max(|A|, |B|).
  return intersection(aWords, bWords).length / largest;

}

/**
 * Splits a definition on whitespace and returns its distinct, non-empty words in order of first appearance.
 * @param {String} text The definition to split.
 * @return {Array<String>}
 */
function getUniqueWords(text) {
  // filter(Boolean) drops the empty tokens that split() produces for leading/trailing whitespace.
  return [...new Set(text.split(/\s+/u).filter(Boolean))];
}

/**
 * Returns the distinct items of `a` that are also present in `b`, in the order they first appear in `a`.
 * @param {Array} a
 * @param {Array} b
 * @return {Array}
 */
function intersection(a, b) {
  const s = new Set(b);
  return [...new Set(a)].filter(x => s.has(x));
}
| 28 | + |
/**
 * Given a set of definitions, this function determines which definitions are sufficiently similar that one can be omitted.
 * Sources are visited in order of precedence. A definition that is similar to one already kept is dropped,
 * and its sources are added to the kept definition.
 * Definitions from the first source that has any definitions are all kept without being compared to each other.
 *
 * NOTE: The `definitions` Array itself is never altered, but the kept definition objects are the caller's own objects,
 * and their `sources` Arrays are extended in place when another definition is merged into them.
 *
 * @param {Array} definitions An Array of definitions to compare. Each definition must be Object with `sources` and `definition` properties.
 * @param {Object} [options={}] An options Object.
 * @param {Array} [options.precedence] An Array of data sources in order of precedence. If omitted, sources are ordered by their first appearance in `definitions`.
 * @param {Number} [options.threshold=0.8] The minimum threshold of matching words for two definitions to be considered the same, as a percentage between 0 and 1.
 * @returns {Array} The kept definitions, in the order they were selected.
 * @throws {Error} If a definition lists a source that is not included in the precedence list.
 */
export default function aggregateDefinitions(definitions, options = {}) {

  let { precedence } = options;

  if (!precedence) {
    precedence = [...new Set(definitions.flatMap(({ sources }) => sources))];
  }

  const knownSources = new Set(precedence);

  // Normalized text is kept in a Map rather than on the definition objects,
  // so that no temporary property is left behind on the caller's data (even if validation throws).
  const normalized = new Map;

  for (const def of definitions) {

    for (const source of def.sources) {
      if (!knownSources.has(source)) {
        throw new Error(`Unrecognized source: ${ source } in ${ def.definition }`);
      }
    }

    normalized.set(def, normalizeDefinition(def.definition));

  }

  const threshold = options.threshold ?? defaultThreshold;
  const mergedDefinitions = [];

  // Definitions that have already been kept or merged into another definition.
  // A definition with several sources shows up once per source; it must only be handled the first time,
  // otherwise its sources would be merged again (producing duplicates).
  const processed = new Set;

  for (const src of precedence) {

    // if there aren't yet any selected definitions, every definition from this source is kept as-is
    const isSeeding = mergedDefinitions.length === 0;

    for (const currentDefinition of definitions) {

      if (!currentDefinition.sources.includes(src)) continue;
      if (processed.has(currentDefinition)) continue;

      processed.add(currentDefinition);

      if (isSeeding) {
        mergedDefinitions.push(currentDefinition);
        continue;
      }

      // find the first selected definition that is similar enough to the current one
      const currentText = normalized.get(currentDefinition);
      const match = mergedDefinitions.find(existing => getSimilarity(currentText, normalized.get(existing)) >= threshold);

      if (!match) {
        mergedDefinitions.push(currentDefinition);
        continue;
      }

      // merge into the existing definition, skipping sources it already lists
      for (const source of currentDefinition.sources) {
        if (!match.sources.includes(source)) match.sources.push(source);
      }

    }

  }

  return mergedDefinitions;

}
0 commit comments