Skip to content

Commit 2019df4

Browse files
first version of Forest (#1)
1 parent 379972e commit 2019df4

File tree

22 files changed

+928
-88
lines changed

22 files changed

+928
-88
lines changed

Cargo.toml

+8-4
Original file line numberDiff line numberDiff line change
@@ -14,11 +14,15 @@ keywords = ["ara", "php", "programming-language", "parser"]
1414
categories = ["compilers", "development-tools::build-utils"]
1515

1616
[dependencies]
17-
ara_parser = { version = "0.6.0" }
17+
ara_parser = { version = "0.6.5" }
1818
ara_source = { version = "0.2.0" }
19-
ara_reporting = { version = "0.6.0" }
20-
serde = { version = "1.0.152", features = ["derive"] }
21-
num_cpus = { version = "1.15.0"}
19+
ara_reporting = { version = "0.6.1" }
20+
num_cpus = { version = "1.15.0" }
21+
rustc-hash = { version = "1.1.0" }
22+
walkdir = { version = "2.3.2" }
23+
bincode = { version = "2.0.0-rc.2" }
24+
log = { version = "0.4.17" }
25+
simplelog = { version = "0.12.0" }
2226

2327
[profile.release]
2428
opt-level = 3

src/config.rs

+60-32
Original file line numberDiff line numberDiff line change
@@ -1,42 +1,70 @@
1-
use std::hash::Hasher;
1+
use std::path::PathBuf;
22

3+
use crate::logger::Logger;
4+
5+
#[derive(Debug)]
36
pub struct Config {
4-
pub root: String,
5-
pub source: String,
6-
pub definitions: Vec<String>,
7-
pub cache: String,
7+
pub root: PathBuf,
8+
pub source: PathBuf,
9+
pub definitions: Vec<PathBuf>,
10+
pub cache: Option<PathBuf>,
811
pub threads: usize,
9-
pub hasher: Box<dyn Hasher>,
12+
pub logger: Option<Logger>,
1013
}
1114

1215
impl Config {
13-
pub fn new<R, S, D, C>(
14-
root: R,
15-
source: S,
16-
definitions: Vec<D>,
17-
cache: Option<C>,
18-
threads: Option<usize>,
19-
hasher: Option<Box<dyn Hasher>>,
20-
) -> Self
21-
where
22-
R: Into<String>,
23-
S: Into<String>,
24-
D: Into<String>,
25-
C: Into<String>,
26-
{
27-
let root = root.into();
28-
let cache = cache
29-
.map(|c| c.into())
30-
.unwrap_or_else(|| format!("{}/.cache", root));
31-
16+
pub fn new<R: Into<String>>(root: R) -> Self {
3217
Self {
33-
root,
34-
source: source.into(),
35-
definitions: definitions.into_iter().map(|d| d.into()).collect(),
36-
cache,
37-
threads: threads.unwrap_or_else(num_cpus::get),
38-
hasher: hasher
39-
.unwrap_or_else(|| Box::new(std::collections::hash_map::DefaultHasher::new())),
18+
root: PathBuf::from(root.into()),
19+
source: PathBuf::from(String::default()),
20+
definitions: Vec::new(),
21+
cache: None,
22+
threads: num_cpus::get(),
23+
logger: None,
4024
}
4125
}
26+
27+
#[must_use]
28+
pub fn with_source<S: Into<String>>(mut self, source: S) -> Self {
29+
self.source = PathBuf::from(source.into());
30+
31+
self
32+
}
33+
34+
#[must_use]
35+
pub fn with_definitions<D: Into<String>>(mut self, definitions: Vec<D>) -> Self {
36+
self.definitions = definitions
37+
.into_iter()
38+
.map(|definition| PathBuf::from(definition.into()))
39+
.collect();
40+
41+
self
42+
}
43+
44+
#[must_use]
45+
pub fn with_cache_directory<C: Into<String>>(mut self, cache_dir: C) -> Self {
46+
let path = PathBuf::from(cache_dir.into());
47+
48+
if path.is_relative() {
49+
self.cache = Some(self.root.join(path));
50+
} else {
51+
self.cache = Some(path);
52+
}
53+
54+
self
55+
}
56+
57+
#[must_use]
58+
pub fn with_threads(mut self, threads: usize) -> Self {
59+
self.threads = threads;
60+
61+
self
62+
}
63+
64+
#[must_use]
65+
pub fn with_logger(mut self, logger: Logger) -> Self {
66+
self.logger = Some(logger);
67+
68+
self
69+
}
4270
}

src/error.rs

+61
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
use ara_reporting::issue::Issue;
2+
use ara_reporting::Report;
3+
4+
#[derive(Debug)]
5+
pub enum Error {
6+
EncodeError(String),
7+
DecodeError(String),
8+
InvalidPath(String),
9+
IoError(std::io::Error),
10+
ParseError(Box<Report>),
11+
LogError(log::SetLoggerError),
12+
}
13+
14+
impl From<walkdir::Error> for Error {
15+
fn from(error: walkdir::Error) -> Self {
16+
Error::IoError(error.into())
17+
}
18+
}
19+
20+
impl From<std::io::Error> for Error {
21+
fn from(error: std::io::Error) -> Self {
22+
Error::IoError(error)
23+
}
24+
}
25+
26+
impl From<Error> for Report {
27+
fn from(error: Error) -> Self {
28+
Report::new().with_issue(Issue::from_string(error.to_string()))
29+
}
30+
}
31+
32+
impl From<bincode::error::EncodeError> for Error {
33+
fn from(error: bincode::error::EncodeError) -> Self {
34+
Error::EncodeError(error.to_string())
35+
}
36+
}
37+
38+
impl From<bincode::error::DecodeError> for Error {
39+
fn from(error: bincode::error::DecodeError) -> Self {
40+
Error::DecodeError(error.to_string())
41+
}
42+
}
43+
44+
impl From<log::SetLoggerError> for Error {
45+
fn from(error: log::SetLoggerError) -> Self {
46+
Error::LogError(error)
47+
}
48+
}
49+
50+
impl std::fmt::Display for Error {
51+
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
52+
match self {
53+
Error::IoError(error) => write!(f, "io error: {error}"),
54+
Error::InvalidPath(message) => write!(f, "invalid source: {message}"),
55+
Error::EncodeError(message) => write!(f, "encode error: {message}"),
56+
Error::DecodeError(message) => write!(f, "decode error: {message}"),
57+
Error::ParseError(report) => write!(f, "parse error: {report}"),
58+
Error::LogError(error) => write!(f, "log error: {error}"),
59+
}
60+
}
61+
}

src/hash.rs

+21
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
use std::hash::Hasher;
2+
3+
pub trait ContentHasher: Send + Sync {
4+
fn hash(&self, content: &str) -> u64;
5+
}
6+
7+
pub struct FxHasher;
8+
9+
impl FxHasher {
10+
pub fn new() -> Self {
11+
Self
12+
}
13+
}
14+
15+
impl ContentHasher for FxHasher {
16+
fn hash(&self, content: &str) -> u64 {
17+
let mut hasher = rustc_hash::FxHasher::default();
18+
hasher.write(content.as_bytes());
19+
hasher.finish()
20+
}
21+
}

src/lib.rs

+117-52
Original file line numberDiff line numberDiff line change
@@ -1,74 +1,139 @@
1-
use ara_parser::parser;
1+
use std::fs;
2+
use std::path::PathBuf;
3+
use std::thread;
4+
5+
use ara_parser::tree::Tree;
26
use ara_parser::tree::TreeMap;
3-
use ara_source::loader;
7+
use ara_reporting::Report;
8+
use ara_source::source::Source;
49
use ara_source::SourceMap;
510

611
use crate::config::Config;
12+
use crate::error::Error;
13+
use crate::hash::FxHasher;
14+
use crate::serializer::BincodeSerializer;
15+
use crate::source::SourceFilesCollector;
16+
use crate::tree::TreeBuilder;
717

818
pub mod config;
19+
pub mod error;
20+
pub(crate) mod hash;
21+
pub mod logger;
22+
pub(crate) mod serializer;
23+
pub mod source;
24+
pub(crate) mod tree;
25+
26+
pub(crate) const ARA_SOURCE_EXTENSION: &str = "ara";
27+
pub(crate) const ARA_DEFINITION_EXTENSION: &str = "d.ara";
28+
pub(crate) const ARA_CACHED_SOURCE_EXTENSION: &str = "ara.cache";
929

30+
#[derive(Debug)]
1031
pub struct Forest {
1132
pub source: SourceMap,
1233
pub tree: TreeMap,
1334
}
1435

15-
pub struct Parser {
16-
pub config: Config,
36+
impl Forest {
37+
pub fn new(source: SourceMap, tree: TreeMap) -> Self {
38+
Self { source, tree }
39+
}
40+
}
41+
42+
pub struct Parser<'a> {
43+
pub config: &'a Config,
44+
tree_builder: TreeBuilder<'a>,
1745
}
1846

19-
impl Parser {
20-
pub fn new(config: Config) -> Self {
21-
Self { config }
47+
impl<'a> Parser<'a> {
48+
pub fn new(config: &'a Config) -> Self {
49+
let tree_builder = TreeBuilder::new(
50+
config,
51+
Box::new(FxHasher::new()),
52+
Box::new(BincodeSerializer::new()),
53+
);
54+
55+
Self {
56+
config,
57+
tree_builder,
58+
}
2259
}
2360

24-
pub fn parse(&self) -> Result<Forest, String> {
25-
let mut threads = Vec::with_capacity(self.config.threads);
26-
27-
let source_map = loader::load_directories(&self.config.root, {
28-
let mut directories = self.config.definitions.clone();
29-
directories.push(self.config.source.clone());
30-
31-
directories
32-
})
33-
.expect("Failed to load source map");
34-
35-
// split the sources into N chunks, where N is the number of threads
36-
let chunk_size = source_map.sources.len() / self.config.threads;
37-
let chunks: Vec<Vec<ara_source::source::Source>> = source_map
38-
.sources
39-
.chunks(chunk_size)
40-
.map(|chunk| chunk.to_vec())
41-
.collect();
42-
43-
for chunk in chunks {
44-
threads.push(std::thread::spawn(move || {
45-
let map = SourceMap::new(chunk);
46-
parser::parse_map(&map)
47-
}));
61+
pub fn parse(&self) -> Result<Forest, Box<Report>> {
62+
self.init_logger().map_err(|error| Box::new(error.into()))?;
63+
64+
let (sources, trees) =
65+
thread::scope(|scope| -> Result<(Vec<Source>, Vec<Tree>), Box<Report>> {
66+
self.create_cache_dir()
67+
.map_err(|error| Box::new(error.into()))?;
68+
69+
let files = SourceFilesCollector::new(self.config)
70+
.collect()
71+
.map_err(|error| Box::new(error.into()))?;
72+
73+
if files.is_empty() {
74+
return Ok((Vec::new(), Vec::new()));
75+
}
76+
77+
let threads_count = self.threads_count(files.len());
78+
let chunks = files
79+
.chunks(files.len() / threads_count)
80+
.map(Vec::from)
81+
.collect::<Vec<Vec<PathBuf>>>();
82+
83+
let mut threads = Vec::with_capacity(threads_count);
84+
for chunk in chunks.into_iter() {
85+
threads.push(scope.spawn(
86+
move || -> Result<Vec<(Source, Tree)>, Box<Report>> {
87+
let mut source_tree = Vec::with_capacity(chunk.len());
88+
for source_path in chunk {
89+
let (source, tree) = self
90+
.tree_builder
91+
.build(&source_path)
92+
.map_err(|error| match error {
93+
Error::ParseError(report) => report,
94+
_ => Box::new(error.into()),
95+
})?;
96+
source_tree.push((source, tree));
97+
}
98+
99+
Ok(source_tree)
100+
},
101+
));
102+
}
103+
104+
let mut result = Vec::new();
105+
for handle in threads {
106+
result.extend(handle.join().unwrap()?);
107+
}
108+
let (sources, trees) = result.into_iter().unzip();
109+
110+
Ok((sources, trees))
111+
})?;
112+
113+
Ok(Forest::new(SourceMap::new(sources), TreeMap::new(trees)))
114+
}
115+
116+
fn threads_count(&self, files_len: usize) -> usize {
117+
if self.config.threads > files_len {
118+
files_len
119+
} else {
120+
self.config.threads
48121
}
122+
}
123+
124+
fn create_cache_dir(&self) -> Result<(), Error> {
125+
if self.config.cache.is_some() {
126+
fs::create_dir_all(self.config.cache.as_ref().unwrap())?;
127+
}
128+
129+
Ok(())
130+
}
49131

50-
let mut results = vec![];
51-
for thread in threads {
52-
results.push(thread.join().unwrap());
132+
fn init_logger(&self) -> Result<(), Error> {
133+
if self.config.logger.is_some() {
134+
self.config.logger.as_ref().unwrap().init()?
53135
}
54136

55-
todo!("
56-
the implementation above is just a placeholder
57-
58-
the idea is to:
59-
1. load the source map
60-
2. split the source map into N chunks, where N is the number of threads
61-
3. spawn N threads, each of which parses a chunk of the source map
62-
4. in each thread, iterate over the sources in the chunk and:
63-
first we need to check if the source is present in the cache, if yes, load the cached tree,
64-
and check if the hash of the source matches the hash of the cached tree, if yes, return the cached tree,
65-
otherwise, parse the source and save the tree to the cache
66-
if the source is not present in the cache, parse the source and save the tree to the cache.
67-
If the parser failed, return the report immediately and do not continue
68-
5. join the threads and collect the results
69-
If any of the threads failed, return the report immediately and do not continue
70-
6. merge the results into a single forest
71-
7. return the forest
72-
");
137+
Ok(())
73138
}
74139
}

0 commit comments

Comments
 (0)