70 changes: 63 additions & 7 deletions src-tauri/src/commands/tokens.rs
@@ -14,7 +14,9 @@ use tree_sitter_highlight::{HighlightConfiguration, HighlightEvent, Highlighter}

#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct Token {
/// Zero-based character offset where the token starts.
pub start: usize,
/// Zero-based character offset where the token ends (exclusive).
pub end: usize,
pub token_type: String,
pub class_name: String,
@@ -460,7 +462,10 @@ pub fn tokenize_content(content: &str, language: &str) -> Result<Vec<Token>> {
.highlight(&config, content.as_bytes(), None, |_| None)?
.collect::<Result<Vec<_>, _>>()?;

let mut tokens = Vec::new();
// Collect raw byte-based tokens first. We'll normalize offsets to
// character positions after the highlight walk so the frontend can
// skip costly byte-to-char conversions.
let mut raw_tokens: Vec<(usize, usize, String, String)> = Vec::new();
let mut current_highlight: Option<usize> = None;

for event in highlights {
@@ -473,12 +478,7 @@ pub fn tokenize_content(content: &str, language: &str) -> Result<Vec<Token>> {
// Skip whitespace-only tokens
let text = &content[start..end];
if !text.trim().is_empty() {
tokens.push(Token {
start,
end,
token_type: token_type.to_string(),
class_name: class_name.to_string(),
});
raw_tokens.push((start, end, token_type.to_string(), class_name.to_string()));
}
}
}
@@ -491,9 +491,65 @@ pub fn tokenize_content(content: &str, language: &str) -> Result<Vec<Token>> {
}
}

if raw_tokens.is_empty() {
return Ok(Vec::new());
}

// For ASCII content, byte and character offsets align, so we can
// return early without additional work.
if content.is_ascii() {
return Ok(raw_tokens
.into_iter()
.filter(|(start, end, _, _)| end > start)
.map(|(start, end, token_type, class_name)| Token {
start,
end,
token_type,
class_name,
})
.collect());
}

// Build a lookup table of byte offsets to character indices so we can
// translate tree-sitter's byte ranges into char-based offsets. Using a
// sorted Vec keeps memory overhead predictable and allows binary search
// per token without scanning the full string repeatedly.
let mut char_starts: Vec<usize> = content.char_indices().map(|(idx, _)| idx).collect();
char_starts.push(content.len());

let tokens = raw_tokens
.into_iter()
.filter_map(|(start_byte, end_byte, token_type, class_name)| {
if end_byte <= start_byte {
return None;
}

let start = byte_to_char_offset(start_byte, &char_starts);
let end = byte_to_char_offset(end_byte, &char_starts);

if end <= start {
return None;
}

Some(Token {
start,
end,
token_type,
class_name,
})
})
.collect();

Ok(tokens)
}

fn byte_to_char_offset(byte_offset: usize, char_starts: &[usize]) -> usize {
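// `char_starts` holds the byte offset of every character plus a trailing
// sentinel at `content.len()`. An exact hit returns that character's index;
// a miss (a byte offset inside a multi-byte character) rounds forward to the
// next character boundary. For "héllo", char_starts = [0, 1, 3, 4, 5, 6], so
// byte 3 maps to char index 2 and byte 2 (inside the 'é') also maps to 2.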
match char_starts.binary_search(&byte_offset) {
Ok(idx) => idx,
Err(idx) => idx,
}
}

#[cfg(test)]
mod tests {
use super::*;
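Since `tokenize_content` now emits character offsets, the frontend can use the returned ranges directly instead of translating bytes to characters. A minimal consumer-side sketch, assuming the function is exposed as a Tauri command under the same name with `content`/`language` arguments and that the serialized field names match the Rust struct; the command registration and the Tauri major version are not shown in this diff:

import { invoke } from "@tauri-apps/api/core"; // Tauri 2.x path; 1.x uses "@tauri-apps/api/tauri"

// Mirrors the Rust `Token` struct: start/end are Unicode character offsets,
// with `end` exclusive.
interface Token {
  start: number;
  end: number;
  token_type: string;
  class_name: string;
}

// Hypothetical helper: fetch tokens for a buffer from the backend.
async function fetchTokens(content: string, language: string): Promise<Token[]> {
  return invoke<Token[]>("tokenize_content", { content, language });
}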
191 changes: 90 additions & 101 deletions src/features/editor/stores/view-store.ts
@@ -17,108 +17,75 @@ interface EditorViewState {
};
}

// Helper function to convert buffer tokens to line tokens
// Handles conversion from byte offsets (from tree-sitter) to character positions
// Helper function to convert buffer tokens to per-line tokens. The incoming
// token offsets are character-based, so we only need to map them onto the
// correct line ranges.
function convertToLineTokens(
content: string,
lines: string[],
tokens: Array<{ start: number; end: number; class_name: string }>,
): Map<number, LineToken[]> {
const lines = content.split("\n");
const tokensByLine = new Map<number, LineToken[]>();

if (tokens.length === 0) {
if (tokens.length === 0 || lines.length === 0) {
return tokensByLine;
}

// Build a byte-to-character mapping for proper UTF-8 handling
const encoder = new TextEncoder();
let byteOffset = 0;
let charOffset = 0;
const byteToChar = new Map<number, number>();

for (let i = 0; i < content.length; i++) {
byteToChar.set(byteOffset, charOffset);
const char = content[i];
const charBytes = encoder.encode(char).length;
byteOffset += charBytes;
charOffset++;
// Precompute the starting character offset for each line so we can place
// tokens without repeatedly scanning the entire content.
const lineStartOffsets: number[] = new Array(lines.length);
let runningOffset = 0;
for (let i = 0; i < lines.length; i++) {
lineStartOffsets[i] = runningOffset;
runningOffset += lines[i].length;
if (i < lines.length - 1) {
runningOffset += 1; // Account for the newline character between lines
}
}
byteToChar.set(byteOffset, charOffset); // End position

// Convert byte offsets to character offsets
const charTokens = tokens
.map((token) => {
// Find closest byte positions if exact match not found
let start = byteToChar.get(token.start);
let end = byteToChar.get(token.end);

// If exact byte position not found, find the closest character position
if (start === undefined) {
// Find the largest byte offset that's <= token.start
let closestByte = 0;
for (const [byte, char] of byteToChar.entries()) {
if (byte <= token.start && byte > closestByte) {
closestByte = byte;
start = char;
}
}
if (start === undefined) start = 0;
}

if (end === undefined) {
// Find the smallest byte offset that's >= token.end
let closestChar = content.length;
for (const [byte, char] of byteToChar.entries()) {
if (byte >= token.end && char < closestChar) {
closestChar = char;
end = char;
}
}
if (end === undefined) end = content.length;
}
let lineIndex = 0;
const lastLineIndex = lines.length - 1;

return { start, end, class_name: token.class_name };
})
.filter((token) => {
// Keep tokens that are valid for the current content
return (
token.start >= 0 &&
token.end <= content.length &&
token.start < token.end &&
token.end - token.start < 10000 // Allow large tokens but skip absurdly large ones
);
});
for (const token of tokens) {
const { start, end, class_name } = token;
if (end <= start) continue;

let currentCharOffset = 0;
// Advance the line pointer until the token start falls within the current
// line's range. Tokens arrive in order, so we never need to move backwards.
while (lineIndex + 1 < lines.length && start >= lineStartOffsets[lineIndex + 1]) {
lineIndex++;
}

for (let lineNumber = 0; lineNumber < lines.length; lineNumber++) {
const lineLength = lines[lineNumber].length;
const lineStart = currentCharOffset;
const lineEnd = currentCharOffset + lineLength;
const lineTokens: LineToken[] = [];
let currentLine = lineIndex;

// Find tokens that overlap with this line
for (const token of charTokens) {
if (token.start >= lineEnd) break;
if (token.end <= lineStart) continue;
while (currentLine < lines.length) {
const lineStart = lineStartOffsets[currentLine];
const lineLength = lines[currentLine].length;
const lineEnd = lineStart + lineLength;

const tokenStartInLine = Math.max(0, token.start - lineStart);
const tokenEndInLine = Math.min(lineLength, token.end - lineStart);
const startInLine = Math.max(0, start - lineStart);
const endInLine = Math.min(lineLength, end - lineStart);

if (tokenStartInLine < tokenEndInLine) {
if (startInLine < endInLine) {
let lineTokens = tokensByLine.get(currentLine);
if (!lineTokens) {
lineTokens = [];
tokensByLine.set(currentLine, lineTokens);
}
lineTokens.push({
startColumn: tokenStartInLine,
endColumn: tokenEndInLine,
className: token.class_name,
startColumn: startInLine,
endColumn: endInLine,
className: class_name,
});
}
}

if (lineTokens.length > 0) {
tokensByLine.set(lineNumber, lineTokens);
if (end <= lineEnd || currentLine === lastLineIndex) {
break;
}

currentLine++;
}

currentCharOffset += lineLength + 1; // +1 for newline
lineIndex = currentLine;
}

return tokensByLine;
Expand All @@ -141,7 +108,7 @@ export const useEditorViewStore = createSelectors(
getLineTokens: () => {
const activeBuffer = useBufferStore.getState().actions.getActiveBuffer();
if (!activeBuffer) return new Map();
return convertToLineTokens(activeBuffer.content, activeBuffer.tokens);
return convertToLineTokens(activeBuffer.content.split("\n"), activeBuffer.tokens);
},

getContent: () => {
@@ -155,26 +122,48 @@
);

// Subscribe to buffer changes and update computed values
useBufferStore.subscribe((state) => {
const activeBuffer = state.actions.getActiveBuffer();
if (activeBuffer) {
// Always recalculate line tokens when content or tokens change
// The token filtering in convertToLineTokens handles stale tokens gracefully
const lineTokens = convertToLineTokens(activeBuffer.content, activeBuffer.tokens);
console.log(
"[EditorViewStore] Buffer tokens:",
activeBuffer.tokens.length,
"Line tokens:",
lineTokens.size,
);
{
let lastBufferId: string | null = null;
let lastContent: string | null = null;
let lastTokensRef: Array<{ start: number; end: number; class_name: string }> | null = null;
let cachedLines: string[] = [""];

useBufferStore.subscribe((state) => {
const activeBuffer = state.actions.getActiveBuffer();

if (!activeBuffer) {
lastBufferId = null;
lastContent = null;
lastTokensRef = null;
cachedLines = [""];
useEditorViewStore.setState({
lines: cachedLines,
lineTokens: new Map(),
});
return;
}

const contentChanged = activeBuffer.content !== lastContent;
const tokensChanged = activeBuffer.tokens !== lastTokensRef;
const bufferSwitched = activeBuffer.id !== lastBufferId;
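// These are deliberately cheap identity checks: the buffer store is assumed
// to replace `content` and the `tokens` array wholesale whenever they change,
// so `!==` is enough to detect an update without a deep comparison.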

if (!contentChanged && !tokensChanged && !bufferSwitched) {
return;
}

if (contentChanged || bufferSwitched) {
cachedLines = activeBuffer.content.split("\n");
lastContent = activeBuffer.content;
}

const lineTokens = convertToLineTokens(cachedLines, activeBuffer.tokens);

useEditorViewStore.setState({
lines: activeBuffer.content.split("\n"),
lines: cachedLines,
lineTokens,
});
} else {
useEditorViewStore.setState({
lines: [""],
lineTokens: new Map(),
});
}
});

lastBufferId = activeBuffer.id;
lastTokensRef = activeBuffer.tokens;
});
}
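To make the per-line mapping concrete, here is a small worked example of `convertToLineTokens` with hypothetical values (the line contents and token are illustrative, not taken from this diff). It shows a single character-offset token that spans a line break being split into one `LineToken` per line:

// Two lines whose joined content is "const a = `multi\nline`;" (23 chars).
const lines = ["const a = `multi", "line`;"];
// One token covering the template literal, character indices 10..22 (end exclusive).
const tokens = [{ start: 10, end: 22, class_name: "string" }];

const byLine = convertToLineTokens(lines, tokens);
// byLine.get(0) -> [{ startColumn: 10, endColumn: 16, className: "string" }]
// byLine.get(1) -> [{ startColumn: 0, endColumn: 5, className: "string" }]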
57 changes: 57 additions & 0 deletions src/features/editor/utils/large-file.ts
@@ -0,0 +1,57 @@
export const LARGE_FILE_CHAR_THRESHOLD = 300_000;
export const LARGE_FILE_LINE_THRESHOLD = 4_000;

const NEWLINE_CHAR_CODE = 10; // "\n"

export function countLinesUpTo(content: string, limit: number): number {
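// Early-exit line counter: stop scanning as soon as the count exceeds the
// cap, since callers only need to know whether a threshold was crossed,
// not the exact line count of a very large buffer.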
if (!content) return 0;

let lines = 1;
const cappedLimit = Math.max(limit, 1);

for (let i = 0; i < content.length && lines <= cappedLimit; i++) {
if (content.charCodeAt(i) === NEWLINE_CHAR_CODE) {
lines++;
if (lines > cappedLimit) {
return lines;
}
}
}

return lines;
}

export function isLargeFile(
content: string,
charThreshold: number = LARGE_FILE_CHAR_THRESHOLD,
lineThreshold: number = LARGE_FILE_LINE_THRESHOLD,
): boolean {
if (!content) return false;
if (content.length >= charThreshold) return true;
return countLinesUpTo(content, lineThreshold) > lineThreshold;
}

export function getLargeFileMeta(content: string): {
isLarge: boolean;
approxLineCount: number;
} {
if (!content) {
return { isLarge: false, approxLineCount: 0 };
}

const approxLineCount = countLinesUpTo(content, LARGE_FILE_LINE_THRESHOLD + 1);
const isLarge =
content.length >= LARGE_FILE_CHAR_THRESHOLD || approxLineCount > LARGE_FILE_LINE_THRESHOLD;

return { isLarge, approxLineCount };
}

export async function waitForIdle(timeoutMs = 200): Promise<void> {
if (typeof requestIdleCallback === "function") {
return new Promise((resolve) => {
requestIdleCallback(() => resolve(), { timeout: timeoutMs });
});
}

return new Promise((resolve) => setTimeout(resolve, 0));
}
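A sketch of how these helpers might be wired into the editor: tokenize small buffers immediately, but defer heavy highlighting for large buffers until the main thread is idle. The `tokenize` callback and the call site are hypothetical stand-ins, not APIs defined in this diff:

import { getLargeFileMeta, waitForIdle } from "./large-file";

// Hypothetical call site: wait for an idle slice before doing the heavy work
// on buffers that cross the large-file thresholds.
export async function scheduleHighlight(
  content: string,
  tokenize: (content: string) => Promise<void>,
): Promise<void> {
  const { isLarge, approxLineCount } = getLargeFileMeta(content);

  if (!isLarge) {
    await tokenize(content);
    return;
  }

  console.warn(`Large buffer (~${approxLineCount}+ lines); deferring highlight`);
  await waitForIdle(200);
  await tokenize(content);
}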