-
-
Notifications
You must be signed in to change notification settings - Fork 1.1k
[perf] hashing function improvements #3242
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,62 +2,93 @@ | |
// Inspired by https://github.com/garycourt/murmurhash-js | ||
// Ported from https://github.com/aappleby/smhasher/blob/61a0530f28277f2e850bfc39600ce61d02b518de/src/MurmurHash2.cpp#L37-L86 | ||
|
||
export default function murmur2(str: string): string { | ||
const hasTextEncoder = typeof TextEncoder !== 'undefined' | ||
|
||
const encoder = hasTextEncoder ? new TextEncoder() : undefined | ||
|
||
let bufferLength = -1 | ||
let buffer: ArrayBuffer | ||
let uint8View: Uint8Array | ||
let int32View: Int32Array | ||
|
||
function encode(input: string): number { | ||
// Legacy IE11 support | ||
if (hasTextEncoder === false) { | ||
const bytes = [] | ||
for (let i = 0; i < input.length; i++) { | ||
const codePoint = input.charCodeAt(i) | ||
if (codePoint > 0xff) { | ||
bytes.push(codePoint >>> 8) | ||
bytes.push(codePoint & 0xff) | ||
} else { | ||
bytes.push(codePoint) | ||
} | ||
} | ||
uint8View = new Uint8Array(bytes) | ||
int32View = new Int32Array(uint8View.buffer, 0, Math.floor(bytes.length / 4)) | ||
|
||
return bytes.length | ||
} | ||
|
||
if (input.length > bufferLength) { | ||
// bufferLength must be a multiple of 4 to satisfy Int32Array constraints | ||
bufferLength = input.length + (4 - input.length % 4) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This isn't allocating enough space for strings where utf8 length != utf16 length and will just ignore some input at the end? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Right, and there's no way to know the byte length until it has been iterated/encoded. I'll allocate 2x the space to ensure there's enough space for the worst case. |
||
|
||
// buffer.resize() is only available in recent browsers, so we re-allocate | ||
// a new buffer and views | ||
buffer = new ArrayBuffer(bufferLength) | ||
uint8View = new Uint8Array(buffer) | ||
int32View = new Int32Array(buffer) | ||
} | ||
|
||
return encoder!.encodeInto(input, uint8View).written as number; | ||
} | ||
|
||
export default function murmur2(input: string): string { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. While I was exploring the data a bit more I realized that the hashing function is re-hashing the same strings a lot, and that we could speed it up by caching it in a One note before the results, I was testing with a dataset made of a bunch of MUI components copy/pasted. For this benchmark I switched to a new dataset that is directly extracted from the MUI dashboard template, which I think is more realistic and reflects production setups even better. Here are the results, which indicate a substantial improvement for that dataset when caching the murmur2 result. The implementation is very naive, and I found that trying to mutate/update the map with more complex rules actually decreased the gains by a lot, probably because adding bookkeeping on each hash iteration is too much work. For comparison I also added a case for the cached version, but with a dataset with absolutely no duplicate, which means the caching code is pure overhead. Last note, for this particular dataset, unlike the last dataset, SpiderMonkey shows a small/moderate performance decrease from So to sum this up, caching the hash result is a gain if we assume that we're going to see a lot of duplicates following each other, which seems to be the case for MUI cases. I would love to have your opinion on this change. |
||
const length = encode(input); | ||
|
||
// 'm' and 'r' are mixing constants generated offline. | ||
// They're not really 'magic', they just happen to work well. | ||
|
||
// const m = 0x5bd1e995; | ||
// const r = 24; | ||
const m = 0x5bd1e995; | ||
const r = 24; | ||
romgrk marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
// Initialize the hash | ||
|
||
var h = 0 | ||
let h = 0 | ||
|
||
// Mix 4 bytes at a time into the hash | ||
|
||
var k, | ||
i = 0, | ||
len = str.length | ||
for (; len >= 4; ++i, len -= 4) { | ||
k = | ||
(str.charCodeAt(i) & 0xff) | | ||
((str.charCodeAt(++i) & 0xff) << 8) | | ||
((str.charCodeAt(++i) & 0xff) << 16) | | ||
((str.charCodeAt(++i) & 0xff) << 24) | ||
|
||
k = | ||
/* Math.imul(k, m): */ | ||
(k & 0xffff) * 0x5bd1e995 + (((k >>> 16) * 0xe995) << 16) | ||
k ^= /* k >>> r: */ k >>> 24 | ||
|
||
h = | ||
/* Math.imul(k, m): */ | ||
((k & 0xffff) * 0x5bd1e995 + (((k >>> 16) * 0xe995) << 16)) ^ | ||
/* Math.imul(h, m): */ | ||
((h & 0xffff) * 0x5bd1e995 + (((h >>> 16) * 0xe995) << 16)) | ||
let i = 0 | ||
let len = length | ||
for (; len >= 4; i++, len -= 4) { | ||
let k = int32View[i] | ||
|
||
k = Math.imul(k, m) | ||
romgrk marked this conversation as resolved.
Show resolved
Hide resolved
|
||
k ^= k >>> r | ||
k = Math.imul(k, m) | ||
|
||
h = Math.imul(h, m) | ||
h ^= k | ||
} | ||
|
||
// Handle the last few bytes of the input array | ||
|
||
switch (len) { | ||
case 3: | ||
h ^= (str.charCodeAt(i + 2) & 0xff) << 16 | ||
h ^= (uint8View[i * 4 + 2] & 0xff) << 16 | ||
case 2: | ||
h ^= (str.charCodeAt(i + 1) & 0xff) << 8 | ||
h ^= (uint8View[i * 4 + 1] & 0xff) << 8 | ||
case 1: | ||
h ^= str.charCodeAt(i) & 0xff | ||
h = | ||
/* Math.imul(h, m): */ | ||
(h & 0xffff) * 0x5bd1e995 + (((h >>> 16) * 0xe995) << 16) | ||
h ^= uint8View[i * 4] & 0xff | ||
h = Math.imul(h, m) | ||
} | ||
|
||
// Do a few final mixes of the hash to ensure the last few | ||
// bytes are well-incorporated. | ||
|
||
h ^= h >>> 13 | ||
h = | ||
/* Math.imul(h, m): */ | ||
(h & 0xffff) * 0x5bd1e995 + (((h >>> 16) * 0xe995) << 16) | ||
h = Math.imul(h, m) | ||
h = (h ^ (h >>> 15)) >>> 0 | ||
|
||
return ((h ^ (h >>> 15)) >>> 0).toString(36) | ||
return h.toString(36) | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This needs to encode like
TextEncoder
since the hash needs to return the same results if there e.g. isTextEncoder
when a page is server-rendered but not on the clientThere was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Self notes:
utf16 to unicode
unicode to utf8