Skip to content

Commit 0a70bcd

Browse files
authored
Merge pull request #2619 from dathere/replace-ahash-with-foldhash
refactor: replace ahash with faster foldhash
2 parents 678ed31 + 5ba27da commit 0a70bcd

15 files changed

+52 −52 lines changed

Cargo.lock

+11-11
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

+1-1
Original file line number | Diff line number | Diff line change
@@ -70,7 +70,6 @@ inherits = "release"
7070
panic = "abort"
7171

7272
[dependencies]
73-
ahash = "0.8"
7473
arboard = { version = "3.4.1", default-features = false, optional = true }
7574
atoi_simd = "0.16"
7675
base62 = { version = "2.2", optional = true }
@@ -110,6 +109,7 @@ eudex = { version = "0.1", optional = true }
110109
ext-sort = { version = "0.1", default-features = false }
111110
fast-float2 = "0.2"
112111
flate2 = { version = "1", optional = true }
112+
foldhash = "0.1"
113113
file-format = { version = "0.26", features = ["reader"] }
114114
filetime = "0.2"
115115
flexi_logger = { version = "0.29", features = [

src/cmd/cat.rs

+5-5
Original file line number | Diff line number | Diff line change
@@ -192,9 +192,9 @@ impl Args {
192192
// this algorithm is largely inspired by https://github.com/vi/csvcatrow by @vi
193193
// https://github.com/dathere/qsv/issues/527
194194
fn cat_rowskey(&self) -> CliResult<()> {
195-
// ahash is a faster hasher than the default one used by IndexSet and IndexMap
196-
type AhashIndexSet<T> = IndexSet<T, ahash::RandomState>;
197-
type AhashIndexMap<T, T2> = IndexMap<T, T2, ahash::RandomState>;
195+
// foldhash is a faster hasher than the default one used by IndexSet and IndexMap
196+
type FhashIndexSet<T> = IndexSet<T, foldhash::fast::RandomState>;
197+
type FhashIndexMap<T, T2> = IndexMap<T, T2, foldhash::fast::RandomState>;
198198

199199
let Ok(group_kind) = GroupKind::from_str(&self.flag_group) else {
200200
return fail_incorrectusage_clierror!(
@@ -204,7 +204,7 @@ impl Args {
204204
);
205205
};
206206

207-
let mut columns_global: AhashIndexSet<Box<[u8]>> = AhashIndexSet::default();
207+
let mut columns_global: FhashIndexSet<Box<[u8]>> = FhashIndexSet::default();
208208

209209
if group_kind != GroupKind::None {
210210
columns_global.insert(self.flag_group_name.as_bytes().to_vec().into_boxed_slice());
@@ -273,7 +273,7 @@ impl Args {
273273
let mut conf_path;
274274
let mut rdr;
275275
let mut header: &csv::ByteRecord;
276-
let mut columns_of_this_file: AhashIndexMap<Box<[u8]>, usize> = AhashIndexMap::default();
276+
let mut columns_of_this_file: FhashIndexMap<Box<[u8]>, usize> = FhashIndexMap::default();
277277
columns_of_this_file.reserve(num_columns_global);
278278
let mut row: csv::ByteRecord = csv::ByteRecord::with_capacity(500, num_columns_global);
279279

src/cmd/exclude.rs

+3-3
Original file line number | Diff line number | Diff line change
@@ -55,8 +55,8 @@ Common options:
5555

5656
use std::{collections::hash_map::Entry, fs, io, str};
5757

58-
use ahash::AHashMap;
5958
use byteorder::{BigEndian, WriteBytesExt};
59+
use foldhash::{HashMap, HashMapExt};
6060
use serde::Deserialize;
6161

6262
use crate::{
@@ -185,14 +185,14 @@ impl Args {
185185
#[allow(dead_code)]
186186
struct ValueIndex<R> {
187187
// This maps tuples of values to corresponding rows.
188-
values: AHashMap<Vec<ByteString>, Vec<usize>>,
188+
values: HashMap<Vec<ByteString>, Vec<usize>>,
189189
idx: Indexed<R, io::Cursor<Vec<u8>>>,
190190
num_rows: usize,
191191
}
192192

193193
impl<R: io::Read + io::Seek> ValueIndex<R> {
194194
fn new(mut rdr: csv::Reader<R>, sel: &Selection, casei: bool) -> CliResult<ValueIndex<R>> {
195-
let mut val_idx = AHashMap::with_capacity(10000);
195+
let mut val_idx = HashMap::with_capacity(10000);
196196
let mut row_idx = io::Cursor::new(Vec::with_capacity(8 * 10000));
197197
let (mut rowi, mut count) = (0_usize, 0_usize);
198198

src/cmd/fill.rs

+5-5
Original file line number | Diff line number | Diff line change
@@ -53,7 +53,7 @@ Common options:
5353

5454
use std::{io, iter, ops};
5555

56-
use ahash::AHashMap;
56+
use foldhash::{HashMap, HashMapExt};
5757
use serde::Deserialize;
5858

5959
use crate::{
@@ -144,8 +144,8 @@ impl ops::Deref for ByteRecord {
144144
}
145145

146146
type GroupKey = Option<ByteRecord>;
147-
type GroupBuffer = AHashMap<GroupKey, Vec<ByteRecord>>;
148-
type Grouper = AHashMap<GroupKey, GroupValues>;
147+
type GroupBuffer = HashMap<GroupKey, Vec<ByteRecord>>;
148+
type Grouper = HashMap<GroupKey, GroupValues>;
149149
type GroupKeySelection = Option<Selection>;
150150

151151
trait GroupKeyConstructor {
@@ -163,14 +163,14 @@ impl GroupKeyConstructor for GroupKeySelection {
163163

164164
#[derive(Debug)]
165165
struct GroupValues {
166-
map: AHashMap<usize, ByteString>,
166+
map: HashMap<usize, ByteString>,
167167
default: Option<ByteString>,
168168
}
169169

170170
impl GroupValues {
171171
fn new(default: Option<ByteString>) -> Self {
172172
Self {
173-
map: AHashMap::new(),
173+
map: HashMap::new(),
174174
default,
175175
}
176176
}

src/cmd/geocode.rs

+1-1
Original file line number | Diff line number | Diff line change
@@ -368,9 +368,9 @@ use std::{
368368
path::{Path, PathBuf},
369369
};
370370

371-
use ahash::RandomState;
372371
use cached::{SizedCache, proc_macro::cached};
373372
use dynfmt2::Format;
373+
use foldhash::fast::RandomState;
374374
use geosuggest_core::{
375375
CitiesRecord, CountryRecord, Engine,
376376
storage::{self, IndexStorage},

src/cmd/join.rs

+3-3
Original file line number | Diff line number | Diff line change
@@ -92,8 +92,8 @@ Common options:
9292

9393
use std::{collections::hash_map::Entry, fmt, io, iter::repeat_n, mem::swap, str};
9494

95-
use ahash::AHashMap;
9695
use byteorder::{BigEndian, WriteBytesExt};
96+
use foldhash::{HashMap, HashMapExt};
9797
use serde::Deserialize;
9898

9999
use crate::{
@@ -463,7 +463,7 @@ impl Args {
463463

464464
struct ValueIndex<R> {
465465
// This maps tuples of values to corresponding rows.
466-
values: AHashMap<Vec<ByteString>, Vec<usize>>,
466+
values: HashMap<Vec<ByteString>, Vec<usize>>,
467467
idx: Indexed<R, io::Cursor<Vec<u8>>>,
468468
num_rows: usize,
469469
}
@@ -503,7 +503,7 @@ impl<R: io::Read + io::Seek> ValueIndex<R> {
503503
zerosi: bool,
504504
nulls: bool,
505505
) -> CliResult<ValueIndex<R>> {
506-
let mut val_idx = AHashMap::with_capacity(20_000);
506+
let mut val_idx = HashMap::with_capacity(20_000);
507507
let mut row_idx = io::Cursor::new(Vec::with_capacity(8 * 20_000));
508508
let (mut rowi, mut count) = (0_usize, 0_usize);
509509

src/cmd/partition.rs

+2-2
Original file line number | Diff line number | Diff line change
@@ -62,7 +62,7 @@ use std::{
6262
path::Path,
6363
};
6464

65-
use ahash::AHashMap;
65+
use foldhash::{HashMap, HashMapExt};
6666
use regex::Regex;
6767
use serde::Deserialize;
6868

@@ -124,7 +124,7 @@ impl Args {
124124
let key_col = self.key_column(&rconfig, &headers)?;
125125
let mut r#gen = WriterGenerator::new(self.flag_filename.clone());
126126

127-
let mut writers: AHashMap<Vec<u8>, BoxedWriter> = AHashMap::new();
127+
let mut writers: HashMap<Vec<u8>, BoxedWriter> = HashMap::new();
128128
let mut row = csv::ByteRecord::new();
129129
while rdr.read_byte_record(&mut row)? {
130130
// Decide what file to put this in.

src/cmd/pseudo.rs

+3-3
Original file line number | Diff line number | Diff line change
@@ -66,8 +66,8 @@ Common options:
6666
Must be a single character. (default: ,)
6767
"#;
6868

69-
use ahash::AHashMap;
7069
use dynfmt2::Format;
70+
use foldhash::{HashMap, HashMapExt};
7171
use serde::Deserialize;
7272

7373
use crate::{
@@ -90,8 +90,8 @@ struct Args {
9090
flag_delimiter: Option<Delimiter>,
9191
}
9292

93-
type Values = AHashMap<String, String>;
94-
type ValuesNum = AHashMap<String, u64>;
93+
type Values = HashMap<String, String>;
94+
type ValuesNum = HashMap<String, u64>;
9595

9696
pub fn run(argv: &[&str]) -> CliResult<()> {
9797
let args: Args = util::get_args(USAGE, argv)?;

src/cmd/safenames.rs

+1-1
Original file line number | Diff line number | Diff line change
@@ -101,7 +101,7 @@ Common options:
101101

102102
use std::collections::HashMap;
103103

104-
use ahash::RandomState;
104+
use foldhash::fast::RandomState;
105105
use serde::{Deserialize, Serialize};
106106

107107
use crate::{

src/cmd/schema.rs

+7-7
Original file line number | Diff line number | Diff line change
@@ -80,8 +80,8 @@ Common options:
8080

8181
use std::{fs::File, io::Write, path::Path};
8282

83-
use ahash::{AHashMap, AHashSet};
8483
use csv::ByteRecord;
84+
use foldhash::{HashMap, HashMapExt, HashSet};
8585
use grex::RegExpBuilder;
8686
use itertools::Itertools;
8787
use log::{debug, error, info, warn};
@@ -486,7 +486,7 @@ fn build_low_cardinality_column_selector_arg(
486486
fn get_unique_values(
487487
args: &util::SchemaArgs,
488488
column_select_arg: &str,
489-
) -> CliResult<AHashMap<String, Vec<String>>> {
489+
) -> CliResult<HashMap<String, Vec<String>>> {
490490
// prepare arg for invoking cmd::frequency
491491
let freq_args = crate::cmd::frequency::Args {
492492
arg_input: args.arg_input.clone(),
@@ -530,8 +530,8 @@ fn get_unique_values(
530530
fn construct_map_of_unique_values(
531531
freq_csv_fields: &ByteRecord,
532532
frequency_tables: &[Frequencies<Vec<u8>>],
533-
) -> CliResult<AHashMap<String, Vec<String>>> {
534-
let mut unique_values_map: AHashMap<String, Vec<String>> = AHashMap::new();
533+
) -> CliResult<HashMap<String, Vec<String>>> {
534+
let mut unique_values_map: HashMap<String, Vec<String>> = HashMap::new();
535535
let mut unique_values = Vec::with_capacity(freq_csv_fields.len());
536536
// iterate through fields and gather unique values for each field
537537
for (i, header_byte_slice) in freq_csv_fields.iter().enumerate() {
@@ -592,7 +592,7 @@ fn get_required_fields(properties_map: &Map<String, Value>) -> Vec<Value> {
592592
fn generate_string_patterns(
593593
args: &util::SchemaArgs,
594594
properties_map: &Map<String, Value>,
595-
) -> CliResult<AHashMap<String, String>> {
595+
) -> CliResult<HashMap<String, String>> {
596596
let rconfig = Config::new(args.arg_input.as_ref())
597597
.delimiter(args.flag_delimiter)
598598
.no_headers(args.flag_no_headers)
@@ -603,7 +603,7 @@ fn generate_string_patterns(
603603
let headers = rdr.byte_headers()?.clone();
604604
let sel = rconfig.selection(&headers)?;
605605

606-
let mut pattern_map: AHashMap<String, String> = AHashMap::new();
606+
let mut pattern_map: HashMap<String, String> = HashMap::new();
607607

608608
// return empty pattern map when:
609609
// * no columns are selected
@@ -615,7 +615,7 @@ fn generate_string_patterns(
615615
}
616616

617617
// Map each Header to its unique Set of values
618-
let mut unique_values_map: AHashMap<String, AHashSet<String>> = AHashMap::new();
618+
let mut unique_values_map: HashMap<String, HashSet<String>> = HashMap::new();
619619

620620
#[allow(unused_assignments)]
621621
let mut record = csv::ByteRecord::new();

src/cmd/template.rs

+2-2
Original file line number | Diff line number | Diff line change
@@ -128,7 +128,7 @@ use std::{
128128
},
129129
};
130130

131-
use ahash::{HashMap, HashMapExt};
131+
use foldhash::{HashMap, HashMapExt};
132132
#[cfg(any(feature = "feature_capable", feature = "lite"))]
133133
use indicatif::{ProgressBar, ProgressDrawTarget};
134134
use minijinja::{Environment, Value, value::ValueKind};
@@ -852,7 +852,7 @@ fn register_lookup(
852852
let row_len = lookup_table.headers.len();
853853
for record in rdr.records().flatten() {
854854
let mut row_data: HashMap<String, String> =
855-
HashMap::with_capacity_and_hasher(row_len, ahash::RandomState::new());
855+
HashMap::with_capacity_and_hasher(row_len, foldhash::fast::RandomState::default());
856856

857857
// Store all fields for this row
858858
for (header, value) in lookup_table.headers.iter().zip(record.iter()) {

src/cmd/validate.rs

+1-1
Original file line number | Diff line number | Diff line change
@@ -206,8 +206,8 @@ use std::{
206206
},
207207
};
208208

209-
use ahash::{HashSet, HashSetExt};
210209
use csv::ByteRecord;
210+
use foldhash::{HashSet, HashSetExt};
211211
use indicatif::HumanCount;
212212
#[cfg(any(feature = "feature_capable", feature = "lite"))]
213213
use indicatif::{ProgressBar, ProgressDrawTarget};

0 commit comments

Comments
 (0)