Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 16 additions & 4 deletions core/parser/src/lexer/cursor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -162,16 +162,28 @@ impl<R: ReadChar> Cursor<R> {
/// It also stops when the next character is not ASCII or when there is no next character.
///
/// Note that all characters up until the stop character are added to the buffer, including the character right before.
pub(super) fn take_while_ascii_pred<F>(&mut self, buf: &mut Vec<u8>, pred: &F) -> io::Result<()>
#[allow(clippy::cast_possible_truncation)]
#[inline]
pub(super) fn take_while_ascii_pred<'a, F>(
&mut self,
buf: &'a mut [u8],
pred: &F,
) -> io::Result<&'a [u8]>
where
F: Fn(char) -> bool,
{
let mut count = 0;
loop {
if !self.next_is_ascii_pred(pred)? {
return Ok(());
return Ok(&buf[..count]);
} else if let Some(byte) = self.next_char()? {
#[allow(clippy::cast_possible_truncation)]
buf.push(byte as u8);
buf[count] = byte as u8;
count += 1;
} else if count >= buf.len() {
return Err(Error::new(
ErrorKind::UnexpectedEof,
"Unexpected end of buffer while taking characters",
));
} else {
// next_is_pred will return false if the next value is None so the None case should already be handled.
unreachable!();
Expand Down
64 changes: 50 additions & 14 deletions core/parser/src/lexer/regex.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
use crate::lexer::{Cursor, Error, Token, TokenKind, Tokenizer};
use crate::source::ReadChar;
use bitflags::bitflags;
use boa_ast::{Position, PositionGroup};
use boa_interner::{Interner, Sym};
use boa_ast::PositionGroup;
use boa_interner::Interner;
use regress::{Flags, Regex};
use std::fmt::{Display, Write};
use std::str::{self, FromStr};
Expand Down Expand Up @@ -114,13 +114,17 @@ impl<R> Tokenizer<R> for RegexLiteral {
}
}

let mut flags = Vec::new();
let mut flags: [u8; 8] = [0; 8];
let flags_start = cursor.pos();
cursor.take_while_ascii_pred(&mut flags, &char::is_alphabetic)?;
let flags_slice = cursor.take_while_ascii_pred(&mut flags, &char::is_alphabetic)?;

// SAFETY: We have already checked that the bytes are valid UTF-8.
let flags_str = unsafe { str::from_utf8_unchecked(flags.as_slice()) };
// TODO: Change this to if err() then convert flags_slice to str
let flags_string = match RegExpFlags::from_bytes(flags_slice) {
Err(message) => return Err(Error::Syntax(message.into(), flags_start)),
Ok(regex_flags) => regex_flags.to_string(),
};

let flags_str = flags_string.as_str();
let mut body_utf16 = Vec::new();

// We convert the body to UTF-16 since it may contain code points that are not valid UTF-8.
Expand Down Expand Up @@ -149,7 +153,7 @@ impl<R> Tokenizer<R> for RegexLiteral {
Ok(Token::new_by_position_group(
TokenKind::regular_expression_literal(
interner.get_or_intern(body_utf16.as_slice()),
parse_regex_flags(flags_str, flags_start, interner)?,
interner.get_or_intern(flags_str.to_string().as_str()),
),
start_pos,
cursor.pos_group(),
Expand Down Expand Up @@ -189,6 +193,45 @@ bitflags! {
}
}

impl RegExpFlags {
    /// Builds a flag set from a slice of ASCII flag bytes.
    ///
    /// Each recognised byte (`g`, `i`, `m`, `s`, `u`, `y`, `d`, `v`) switches on the
    /// corresponding flag. NUL bytes (`0x00`) are skipped rather than rejected —
    /// presumably so that zero-filled padding of a fixed-size buffer is tolerated
    /// (TODO confirm against callers).
    ///
    /// # Errors
    ///
    /// Returns a human-readable message when:
    /// - a byte is not a known flag,
    /// - the same flag occurs more than once, or
    /// - both the `u` and `v` flags are present.
    fn from_bytes(bytes: &[u8]) -> Result<Self, String> {
        let mut seen = Self::default();

        for &byte in bytes {
            let flag = match byte {
                // Padding byte: carries no flag, move on to the next one.
                0x00 => continue,
                b'd' => Self::HAS_INDICES,
                b'g' => Self::GLOBAL,
                b'i' => Self::IGNORE_CASE,
                b'm' => Self::MULTILINE,
                b's' => Self::DOT_ALL,
                b'u' => Self::UNICODE,
                b'v' => Self::UNICODE_SETS,
                b'y' => Self::STICKY,
                unknown => {
                    return Err(format!(
                        "invalid regular expression flag {}",
                        char::from(unknown)
                    ));
                }
            };

            // A flag may appear at most once in a literal.
            if seen.contains(flag) {
                return Err(format!(
                    "repeated regular expression flag {}",
                    char::from(byte)
                ));
            }
            seen.insert(flag);
        }

        // The two Unicode modes are mutually exclusive.
        let has_unicode = seen.contains(Self::UNICODE);
        let has_unicode_sets = seen.contains(Self::UNICODE_SETS);
        if has_unicode && has_unicode_sets {
            return Err("cannot use both 'u' and 'v' flags".into());
        }

        Ok(seen)
    }
}

impl FromStr for RegExpFlags {
type Err = String;

Expand Down Expand Up @@ -224,13 +267,6 @@ impl FromStr for RegExpFlags {
}
}

fn parse_regex_flags(s: &str, start: Position, interner: &mut Interner) -> Result<Sym, Error> {
match RegExpFlags::from_str(s) {
Err(message) => Err(Error::Syntax(message.into(), start)),
Ok(flags) => Ok(interner.get_or_intern(flags.to_string().as_str())),
}
}

impl Display for RegExpFlags {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
if self.contains(Self::HAS_INDICES) {
Expand Down
25 changes: 13 additions & 12 deletions core/parser/src/lexer/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -866,45 +866,46 @@ fn addition_no_spaces_e_number() {
fn take_while_ascii_pred_simple() {
let mut cur = Cursor::from(&b"abcdefghijk"[..]);

let mut buf: Vec<u8> = Vec::new();
let mut buf: [u8; 8] = [0; 8];

cur.take_while_ascii_pred(&mut buf, &|c| c == 'a' || c == 'b' || c == 'c')
let slice = cur
.take_while_ascii_pred(&mut buf, &|c| c == 'a' || c == 'b' || c == 'c')
.unwrap();

assert_eq!(str::from_utf8(buf.as_slice()).unwrap(), "abc");
assert_eq!(str::from_utf8(slice).unwrap(), "abc");
}

#[test]
fn take_while_ascii_pred_immediate_stop() {
let mut cur = Cursor::from(&b"abcdefghijk"[..]);

let mut buf: Vec<u8> = Vec::new();
let mut buf: [u8; 8] = [0; 8];

cur.take_while_ascii_pred(&mut buf, &|_| false).unwrap();
let slice = cur.take_while_ascii_pred(&mut buf, &|_| false).unwrap();

assert_eq!(str::from_utf8(buf.as_slice()).unwrap(), "");
assert_eq!(str::from_utf8(slice).unwrap(), "");
}

#[test]
fn take_while_ascii_pred_entire_str() {
let mut cur = Cursor::from(&b"abcdefghijk"[..]);

let mut buf: Vec<u8> = Vec::new();
let mut buf: [u8; 11] = [0; 11];

cur.take_while_ascii_pred(&mut buf, &|_| true).unwrap();
let slice = cur.take_while_ascii_pred(&mut buf, &|_| true).unwrap();

assert_eq!(str::from_utf8(buf.as_slice()).unwrap(), "abcdefghijk");
assert_eq!(str::from_utf8(slice).unwrap(), "abcdefghijk");
}

#[test]
fn take_while_ascii_pred_non_ascii_stop() {
let mut cur = Cursor::from("abcde😀fghijk".as_bytes());

let mut buf: Vec<u8> = Vec::new();
let mut buf: [u8; 12] = [0; 12];

cur.take_while_ascii_pred(&mut buf, &|_| true).unwrap();
let slice = cur.take_while_ascii_pred(&mut buf, &|_| true).unwrap();

assert_eq!(str::from_utf8(buf.as_slice()).unwrap(), "abcde");
assert_eq!(str::from_utf8(slice).unwrap(), "abcde");
}

#[test]
Expand Down
Loading