Skip to content

Commit 4aa2195

Browse files
committed
Initial commit. Not finished.
0 parents  commit 4aa2195

11 files changed

+287
-0
lines changed

.gitignore

+9
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
.*.swp
2+
doc
3+
tags
4+
examples/ss10pusa.csv
5+
build
6+
target
7+
Cargo.lock
8+
scratch*
9+
bench_large/huge

.travis.yml

+12
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
# Travis CI configuration: build, test and document the crate on each listed
# Rust toolchain.
language: rust
rust:
  - 1.0.0
  - beta
  - nightly
script:
  - cargo build --verbose
  - cargo test --verbose
  - cargo doc
  # Benchmarks are only run on the nightly toolchain.
  - if [ "$TRAVIS_RUST_VERSION" = "nightly" ]; then
      cargo bench --verbose;
    fi

COPYING

+3
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
This project is dual-licensed under the Unlicense and MIT licenses.
2+
3+
You may use this code under the terms of either license.

Cargo.toml

+17
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
[package]
2+
name = "aho-corasick"
3+
version = "0.1.0" #:version
4+
authors = ["Andrew Gallant <[email protected]>"]
5+
description = "Fast multiple substring searching with finite state machines."
6+
documentation = "http://burntsushi.net/rustdoc/aho-corasick/"
7+
homepage = "https://github.com/BurntSushi/aho-corasick"
8+
repository = "https://github.com/BurntSushi/aho-corasick"
9+
readme = "README.md"
10+
keywords = ["string", "search", "text", "aho", "corasick"]
11+
license = "Unlicense/MIT"
12+
13+
[lib]
14+
name = "aho_corasick"
15+
16+
[dependencies]
17+
memchr = "0.1.*"

LICENSE-MIT

+21
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
The MIT License (MIT)
2+
3+
Copyright (c) 2015 Andrew Gallant
4+
5+
Permission is hereby granted, free of charge, to any person obtaining a copy
6+
of this software and associated documentation files (the "Software"), to deal
7+
in the Software without restriction, including without limitation the rights
8+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9+
copies of the Software, and to permit persons to whom the Software is
10+
furnished to do so, subject to the following conditions:
11+
12+
The above copyright notice and this permission notice shall be included in
13+
all copies or substantial portions of the Software.
14+
15+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21+
THE SOFTWARE.

Makefile

+14
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
# None of these targets produce a file of the same name.
.PHONY: all ctags docs push

all:
	echo Nothing to do...

# Regenerate the tags file using the Rust definitions in ctags.rust.
ctags:
	ctags --recurse --options=ctags.rust --languages=Rust

# Build the API docs and upload them.
# NOTE(review): in-dir, fix-perms and rscp are not defined in this repository;
# presumably they are helper commands on the author's machine.
docs:
	cargo doc
	in-dir ./target/doc fix-perms
	rscp ./target/doc/* gopher:~/www/burntsushi.net/rustdoc/

# Push master to both remotes.
push:
	git push origin master
	git push github master

README.md

+31
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
**UNDER DEVELOPMENT**
2+
3+
This crate provides a fast implementation of the
4+
[Aho-Corasick](http://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_string_matching_algorithm)
5+
algorithm. Its intended use case is for fast substring matching, particularly
6+
when matching multiple substrings in a search text. This is achieved by
7+
compiling the substrings into a finite state machine.
8+
9+
This implementation provides optimal algorithmic time complexity. Construction
10+
of the finite state machine is `O(p)` where `p` is the length of the substrings
11+
concatenated. Matching against search text is `O(n + p + m)`, where `n` is
12+
the length of the search text and `m` is the number of matches.
13+
14+
[![Build status](https://api.travis-ci.org/BurntSushi/aho-corasick.png)](https://travis-ci.org/BurntSushi/aho-corasick)
15+
[![](http://meritbadge.herokuapp.com/aho-corasick)](https://crates.io/crates/aho-corasick)
16+
17+
Dual-licensed under MIT or the [UNLICENSE](http://unlicense.org).
18+
19+
20+
### Documentation
21+
22+
[http://burntsushi.net/rustdoc/aho-corasick/](http://burntsushi.net/rustdoc/aho-corasick/).
23+
24+
25+
### Alternatives
26+
27+
Aho-Corasick is useful for matching multiple substrings against many long
28+
strings. If your long string is fixed, then you might consider building a
29+
[suffix array](https://github.com/BurntSushi/suffix)
30+
of the search text (which takes `O(n)` time). Matches can then be found in
31+
`O(p log n)` time.

UNLICENSE

+24
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
This is free and unencumbered software released into the public domain.
2+
3+
Anyone is free to copy, modify, publish, use, compile, sell, or
4+
distribute this software, either in source code form or as a compiled
5+
binary, for any purpose, commercial or non-commercial, and by any
6+
means.
7+
8+
In jurisdictions that recognize copyright laws, the author or authors
9+
of this software dedicate any and all copyright interest in the
10+
software to the public domain. We make this dedication for the benefit
11+
of the public at large and to the detriment of our heirs and
12+
successors. We intend this dedication to be an overt act of
13+
relinquishment in perpetuity of all present and future rights to this
14+
software under copyright law.
15+
16+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
19+
IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
20+
OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21+
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22+
OTHER DEALINGS IN THE SOFTWARE.
23+
24+
For more information, please refer to <http://unlicense.org/>

ctags.rust

+11
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
--langdef=Rust
2+
--langmap=Rust:.rs
3+
--regex-Rust=/^[ \t]*(#\[[^\]]\][ \t]*)*(pub[ \t]+)?(extern[ \t]+)?("[^"]+"[ \t]+)?(unsafe[ \t]+)?fn[ \t]+([a-zA-Z0-9_]+)/\6/f,functions,function definitions/
4+
--regex-Rust=/^[ \t]*(pub[ \t]+)?type[ \t]+([a-zA-Z0-9_]+)/\2/T,types,type definitions/
5+
--regex-Rust=/^[ \t]*(pub[ \t]+)?enum[ \t]+([a-zA-Z0-9_]+)/\2/g,enum,enumeration names/
6+
--regex-Rust=/^[ \t]*(pub[ \t]+)?struct[ \t]+([a-zA-Z0-9_]+)/\2/s,structure names/
7+
--regex-Rust=/^[ \t]*(pub[ \t]+)?mod[ \t]+([a-zA-Z0-9_]+)/\2/m,modules,module names/
8+
--regex-Rust=/^[ \t]*(pub[ \t]+)?static[ \t]+([a-zA-Z0-9_]+)/\2/c,consts,static constants/
9+
--regex-Rust=/^[ \t]*(pub[ \t]+)?trait[ \t]+([a-zA-Z0-9_]+)/\2/t,traits,traits/
10+
--regex-Rust=/^[ \t]*(pub[ \t]+)?impl([ \t\n]+<.*>)?[ \t]+([a-zA-Z0-9_]+)/\3/i,impls,trait implementations/
11+
--regex-Rust=/^[ \t]*macro_rules![ \t]+([a-zA-Z0-9_]+)/\1/d,macros,macro definitions/

session.vim

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
au BufWritePost *.rs silent!make ctags > /dev/null 2>&1

src/lib.rs

+144
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
1+
/*!
2+
A fast implementation of the Aho-Corasick string search algorithm.
3+
*/
4+
5+
use std::fmt;
6+
7+
/// Collects the patterns from which an `Automaton` is built.
///
/// Patterns are added one at a time with `add`; `build` then consumes the
/// builder and produces the automaton. `Builder::default()` is equivalent to
/// `Builder::new()`: both start with no patterns.
#[derive(Clone, Debug, Default)]
pub struct Builder {
    /// Patterns in the order they were added.
    pats: Vec<String>,
}
11+
12+
impl Builder {
13+
pub fn new() -> Builder {
14+
Builder { pats: vec![] }
15+
}
16+
17+
pub fn add<S: Into<String>>(mut self, s: S) -> Builder {
18+
self.pats.push(s.into());
19+
self
20+
}
21+
22+
pub fn build(self) -> Automaton {
23+
Automaton::new(self.pats)
24+
}
25+
}
26+
27+
/// Index of a pattern in `Automaton::pats`.
type PatIdx = usize;
/// Index of a state in `Automaton::states`.
type StateIdx = usize;

/// A trie of patterns stored as a flat table of states.
///
/// `states[0]` is a placeholder that is never entered: the value `0` in a
/// `goto` table means "no transition". The root of the trie is `states[1]`.
#[derive(Clone)]
pub struct Automaton {
    /// The patterns this automaton was built from.
    pats: Vec<String>,
    /// All states; see the type-level docs for the meaning of indices 0 and 1.
    states: Vec<State>,
}

/// A single node of the automaton.
#[derive(Clone)]
struct State {
    /// Indices of the patterns that end at this state.
    out: Vec<PatIdx>,
    /// Failure transition target. `State::new` sets it to 1 (the root).
    // NOTE(review): nothing else writes this field yet; presumably the
    // failure links are still to be computed.
    fail: StateIdx,
    /// Transition table with one entry per byte value; `0` means no
    /// transition.
    goto: Vec<StateIdx>, // indexed by alphabet
}
42+
43+
impl Automaton {
44+
fn new(pats: Vec<String>) -> Automaton {
45+
Automaton {
46+
pats: vec![], // filled in later, avoid wrath of borrow checker
47+
states: vec![State::new()],
48+
}.build(pats)
49+
}
50+
51+
fn build(mut self, pats: Vec<String>) -> Automaton {
52+
let rooti = self.add_state(State::new());
53+
for (pati, pat) in pats.iter().enumerate() {
54+
let mut previ = rooti;
55+
for &b in pat.as_bytes() {
56+
if let Some(si) = self.states[previ].goto(b) {
57+
previ = si;
58+
} else {
59+
let nexti = self.add_state(State::new());
60+
self.states[previ].goto[b as usize] = nexti;
61+
previ = nexti;
62+
}
63+
}
64+
self.states[previ].out.push(pati);
65+
}
66+
for v in &mut self.states[rooti].goto {
67+
if *v == 0 {
68+
*v = 1;
69+
}
70+
}
71+
self.pats = pats;
72+
self
73+
}
74+
75+
fn add_state(&mut self, state: State) -> StateIdx {
76+
let i = self.states.len();
77+
self.states.push(state);
78+
i
79+
}
80+
}
81+
82+
impl State {
83+
fn new() -> State {
84+
State {
85+
out: vec![],
86+
fail: 1,
87+
goto: vec![0; 256],
88+
}
89+
}
90+
91+
fn goto(&self, b: u8) -> Option<StateIdx> {
92+
let i = self.goto[b as usize];
93+
if i == 0 { None } else { Some(i) }
94+
}
95+
96+
}
97+
98+
impl fmt::Debug for Automaton {
99+
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
100+
use std::iter::repeat;
101+
102+
try!(writeln!(f, "{}", repeat('-').take(79).collect::<String>()));
103+
try!(writeln!(f, "Patterns: {:?}", self.pats));
104+
for (i, state) in self.states.iter().enumerate().skip(1) {
105+
try!(writeln!(f, "{:3}: {}", i, state.debug(i == 1)));
106+
}
107+
write!(f, "{}", repeat('-').take(79).collect::<String>())
108+
}
109+
}
110+
111+
impl State {
112+
fn debug(&self, root: bool) -> String {
113+
format!("State {{ out: {:?}, fail: {:?}, goto: {{{}}} }}",
114+
self.out, self.fail, self.dense_goto_string(root))
115+
}
116+
117+
fn dense_goto_string(&self, root: bool) -> String {
118+
use std::char::from_u32;
119+
120+
let mut goto = vec![];
121+
for (i, &state) in self.goto.iter().enumerate() {
122+
if (!root && state == 0) || (root && state == 1) { continue; }
123+
goto.push(format!("{} => {}", from_u32(i as u32).unwrap(), state));
124+
}
125+
goto.connect(", ")
126+
}
127+
}
128+
129+
impl fmt::Debug for State {
130+
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
131+
write!(f, "{}", self.debug(false))
132+
}
133+
}
134+
135+
#[cfg(test)]
mod tests {
    use super::Builder;

    /// Smoke test: builds a small automaton and dumps it. Run with
    /// `cargo test -- --nocapture` to see the output.
    #[test]
    fn scratch() {
        let aut = Builder::new().add("he").add("she").build();
        let dump = format!("{:?}", aut);
        println!("{}", dump);
        // The dump lists the patterns in insertion order.
        assert!(dump.contains("Patterns: [\"he\", \"she\"]"));
    }
}

0 commit comments

Comments
 (0)