70 changes: 63 additions & 7 deletions src-tauri/src/commands/tokens.rs
@@ -14,7 +14,9 @@ use tree_sitter_highlight::{HighlightConfiguration, HighlightEvent, Highlighter}

#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct Token {
/// Zero-based character offset where the token starts.
pub start: usize,
/// Zero-based character offset where the token ends (exclusive).
pub end: usize,
pub token_type: String,
pub class_name: String,
@@ -460,7 +462,10 @@ pub fn tokenize_content(content: &str, language: &str) -> Result<Vec<Token>> {
.highlight(&config, content.as_bytes(), None, |_| None)?
.collect::<Result<Vec<_>, _>>()?;

let mut tokens = Vec::new();
// Collect raw byte-based tokens first. We'll normalize offsets to
// character positions after the highlight walk so the frontend can
// skip costly byte-to-char conversions.
let mut raw_tokens: Vec<(usize, usize, String, String)> = Vec::new();
let mut current_highlight: Option<usize> = None;

for event in highlights {
@@ -473,12 +478,7 @@ pub fn tokenize_content(content: &str, language: &str) -> Result<Vec<Token>> {
// Skip whitespace-only tokens
let text = &content[start..end];
if !text.trim().is_empty() {
tokens.push(Token {
start,
end,
token_type: token_type.to_string(),
class_name: class_name.to_string(),
});
raw_tokens.push((start, end, token_type.to_string(), class_name.to_string()));
}
}
}
@@ -491,9 +491,65 @@ pub fn tokenize_content(content: &str, language: &str) -> Result<Vec<Token>> {
}
}

if raw_tokens.is_empty() {
return Ok(Vec::new());
}

// For ASCII content, byte and character offsets align, so we can
// return early without additional work.
if content.is_ascii() {
return Ok(raw_tokens
.into_iter()
.filter(|(start, end, _, _)| end > start)
.map(|(start, end, token_type, class_name)| Token {
start,
end,
token_type,
class_name,
})
.collect());
}

// Build a lookup table of byte offsets to character indices so we can
// translate tree-sitter's byte ranges into char-based offsets. Using a
// sorted Vec keeps memory overhead predictable and allows binary search
// per token without scanning the full string repeatedly.
let mut char_starts: Vec<usize> = content.char_indices().map(|(idx, _)| idx).collect();
char_starts.push(content.len());

let tokens = raw_tokens
.into_iter()
.filter_map(|(start_byte, end_byte, token_type, class_name)| {
if end_byte <= start_byte {
return None;
}

let start = byte_to_char_offset(start_byte, &char_starts);
let end = byte_to_char_offset(end_byte, &char_starts);

if end <= start {
return None;
}

Some(Token {
start,
end,
token_type,
class_name,
})
})
.collect();

Ok(tokens)
}

fn byte_to_char_offset(byte_offset: usize, char_starts: &[usize]) -> usize {
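// `char_starts` holds the byte offset of every character plus a trailing
// sentinel at `content.len()`. An exact hit returns that character's index;
// a miss (a byte offset inside a multi-byte character) rounds forward to the
// next character boundary. For "héllo", char_starts = [0, 1, 3, 4, 5, 6], so
// byte 3 maps to char index 2 and byte 2 (inside the 'é') also maps to 2.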
match char_starts.binary_search(&byte_offset) {
Ok(idx) => idx,
Err(idx) => idx,
}
}

#[cfg(test)]
mod tests {
use super::*;
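Since `tokenize_content` now emits character offsets, the frontend can use the returned ranges directly instead of translating bytes to characters. A minimal consumer-side sketch, assuming the function is exposed as a Tauri command under the same name with `content`/`language` arguments and that the serialized field names match the Rust struct; the command registration and the Tauri major version are not shown in this diff:

import { invoke } from "@tauri-apps/api/core"; // Tauri 2.x path; 1.x uses "@tauri-apps/api/tauri"

// Mirrors the Rust `Token` struct: start/end are Unicode character offsets,
// with `end` exclusive.
interface Token {
  start: number;
  end: number;
  token_type: string;
  class_name: string;
}

// Hypothetical helper: fetch tokens for a buffer from the backend.
async function fetchTokens(content: string, language: string): Promise<Token[]> {
  return invoke<Token[]>("tokenize_content", { content, language });
}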
191 changes: 90 additions & 101 deletions src/features/editor/stores/view-store.ts
@@ -17,108 +17,75 @@ interface EditorViewState {
};
}

// Helper function to convert buffer tokens to line tokens
// Handles conversion from byte offsets (from tree-sitter) to character positions
// Helper function to convert buffer tokens to per-line tokens. The incoming
// token offsets are character-based, so we only need to map them onto the
// correct line ranges.
function convertToLineTokens(
content: string,
lines: string[],
tokens: Array<{ start: number; end: number; class_name: string }>,
): Map<number, LineToken[]> {
const lines = content.split("\n");
const tokensByLine = new Map<number, LineToken[]>();

if (tokens.length === 0) {
if (tokens.length === 0 || lines.length === 0) {
return tokensByLine;
}

// Build a byte-to-character mapping for proper UTF-8 handling
const encoder = new TextEncoder();
let byteOffset = 0;
let charOffset = 0;
const byteToChar = new Map<number, number>();

for (let i = 0; i < content.length; i++) {
byteToChar.set(byteOffset, charOffset);
const char = content[i];
const charBytes = encoder.encode(char).length;
byteOffset += charBytes;
charOffset++;
// Precompute the starting character offset for each line so we can place
// tokens without repeatedly scanning the entire content.
const lineStartOffsets: number[] = new Array(lines.length);
let runningOffset = 0;
for (let i = 0; i < lines.length; i++) {
lineStartOffsets[i] = runningOffset;
runningOffset += lines[i].length;
if (i < lines.length - 1) {
runningOffset += 1; // Account for the newline character between lines
}
}
byteToChar.set(byteOffset, charOffset); // End position

// Convert byte offsets to character offsets
const charTokens = tokens
.map((token) => {
// Find closest byte positions if exact match not found
let start = byteToChar.get(token.start);
let end = byteToChar.get(token.end);

// If exact byte position not found, find the closest character position
if (start === undefined) {
// Find the largest byte offset that's <= token.start
let closestByte = 0;
for (const [byte, char] of byteToChar.entries()) {
if (byte <= token.start && byte > closestByte) {
closestByte = byte;
start = char;
}
}
if (start === undefined) start = 0;
}

if (end === undefined) {
// Find the smallest byte offset that's >= token.end
let closestChar = content.length;
for (const [byte, char] of byteToChar.entries()) {
if (byte >= token.end && char < closestChar) {
closestChar = char;
end = char;
}
}
if (end === undefined) end = content.length;
}
let lineIndex = 0;
const lastLineIndex = lines.length - 1;

return { start, end, class_name: token.class_name };
})
.filter((token) => {
// Keep tokens that are valid for the current content
return (
token.start >= 0 &&
token.end <= content.length &&
token.start < token.end &&
token.end - token.start < 10000 // Allow large tokens but skip absurdly large ones
);
});
for (const token of tokens) {
const { start, end, class_name } = token;
if (end <= start) continue;

let currentCharOffset = 0;
// Advance the line pointer until the token start falls within the current
// line's range. Tokens arrive in order, so we never need to move backwards.
while (lineIndex + 1 < lines.length && start >= lineStartOffsets[lineIndex + 1]) {
lineIndex++;
}

for (let lineNumber = 0; lineNumber < lines.length; lineNumber++) {
const lineLength = lines[lineNumber].length;
const lineStart = currentCharOffset;
const lineEnd = currentCharOffset + lineLength;
const lineTokens: LineToken[] = [];
let currentLine = lineIndex;

// Find tokens that overlap with this line
for (const token of charTokens) {
if (token.start >= lineEnd) break;
if (token.end <= lineStart) continue;
while (currentLine < lines.length) {
const lineStart = lineStartOffsets[currentLine];
const lineLength = lines[currentLine].length;
const lineEnd = lineStart + lineLength;

const tokenStartInLine = Math.max(0, token.start - lineStart);
const tokenEndInLine = Math.min(lineLength, token.end - lineStart);
const startInLine = Math.max(0, start - lineStart);
const endInLine = Math.min(lineLength, end - lineStart);

if (tokenStartInLine < tokenEndInLine) {
if (startInLine < endInLine) {
let lineTokens = tokensByLine.get(currentLine);
if (!lineTokens) {
lineTokens = [];
tokensByLine.set(currentLine, lineTokens);
}
lineTokens.push({
startColumn: tokenStartInLine,
endColumn: tokenEndInLine,
className: token.class_name,
startColumn: startInLine,
endColumn: endInLine,
className: class_name,
});
}
}

if (lineTokens.length > 0) {
tokensByLine.set(lineNumber, lineTokens);
if (end <= lineEnd || currentLine === lastLineIndex) {
break;
}

currentLine++;
}

currentCharOffset += lineLength + 1; // +1 for newline
lineIndex = currentLine;
}

return tokensByLine;
Expand All @@ -141,7 +108,7 @@ export const useEditorViewStore = createSelectors(
getLineTokens: () => {
const activeBuffer = useBufferStore.getState().actions.getActiveBuffer();
if (!activeBuffer) return new Map();
return convertToLineTokens(activeBuffer.content, activeBuffer.tokens);
return convertToLineTokens(activeBuffer.content.split("\n"), activeBuffer.tokens);
},

getContent: () => {
@@ -155,26 +122,48 @@
);

// Subscribe to buffer changes and update computed values
useBufferStore.subscribe((state) => {
const activeBuffer = state.actions.getActiveBuffer();
if (activeBuffer) {
// Always recalculate line tokens when content or tokens change
// The token filtering in convertToLineTokens handles stale tokens gracefully
const lineTokens = convertToLineTokens(activeBuffer.content, activeBuffer.tokens);
console.log(
"[EditorViewStore] Buffer tokens:",
activeBuffer.tokens.length,
"Line tokens:",
lineTokens.size,
);
{
let lastBufferId: string | null = null;
let lastContent: string | null = null;
let lastTokensRef: Array<{ start: number; end: number; class_name: string }> | null = null;
let cachedLines: string[] = [""];

useBufferStore.subscribe((state) => {
const activeBuffer = state.actions.getActiveBuffer();

if (!activeBuffer) {
lastBufferId = null;
lastContent = null;
lastTokensRef = null;
cachedLines = [""];
useEditorViewStore.setState({
lines: cachedLines,
lineTokens: new Map(),
});
return;
}

const contentChanged = activeBuffer.content !== lastContent;
const tokensChanged = activeBuffer.tokens !== lastTokensRef;
const bufferSwitched = activeBuffer.id !== lastBufferId;
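// These are deliberately cheap identity checks: the buffer store is assumed
// to replace `content` and the `tokens` array wholesale whenever they change,
// so `!==` is enough to detect an update without a deep comparison.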

if (!contentChanged && !tokensChanged && !bufferSwitched) {
return;
}

if (contentChanged || bufferSwitched) {
cachedLines = activeBuffer.content.split("\n");
lastContent = activeBuffer.content;
}

const lineTokens = convertToLineTokens(cachedLines, activeBuffer.tokens);

useEditorViewStore.setState({
lines: activeBuffer.content.split("\n"),
lines: cachedLines,
lineTokens,
});
} else {
useEditorViewStore.setState({
lines: [""],
lineTokens: new Map(),
});
}
});

lastBufferId = activeBuffer.id;
lastTokensRef = activeBuffer.tokens;
});
}
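To make the per-line mapping concrete, here is a small worked example of `convertToLineTokens` with hypothetical values (the line contents and token are illustrative, not taken from this diff). It shows a single character-offset token that spans a line break being split into one `LineToken` per line:

// Two lines whose joined content is "const a = `multi\nline`;" (23 chars).
const lines = ["const a = `multi", "line`;"];
// One token covering the template literal, character indices 10..22 (end exclusive).
const tokens = [{ start: 10, end: 22, class_name: "string" }];

const byLine = convertToLineTokens(lines, tokens);
// byLine.get(0) -> [{ startColumn: 10, endColumn: 16, className: "string" }]
// byLine.get(1) -> [{ startColumn: 0, endColumn: 5, className: "string" }]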
57 changes: 57 additions & 0 deletions src/features/editor/utils/large-file.ts
@@ -0,0 +1,57 @@
export const LARGE_FILE_CHAR_THRESHOLD = 300_000;
export const LARGE_FILE_LINE_THRESHOLD = 4_000;

const NEWLINE_CHAR_CODE = 10; // "\n"

export function countLinesUpTo(content: string, limit: number): number {
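// Early-exit line counter: stop scanning as soon as the count exceeds the
// cap, since callers only need to know whether a threshold was crossed,
// not the exact line count of a very large buffer.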
if (!content) return 0;

let lines = 1;
const cappedLimit = Math.max(limit, 1);

for (let i = 0; i < content.length && lines <= cappedLimit; i++) {
if (content.charCodeAt(i) === NEWLINE_CHAR_CODE) {
lines++;
if (lines > cappedLimit) {
return lines;
}
}
}

return lines;
}

export function isLargeFile(
content: string,
charThreshold: number = LARGE_FILE_CHAR_THRESHOLD,
lineThreshold: number = LARGE_FILE_LINE_THRESHOLD,
): boolean {
if (!content) return false;
if (content.length >= charThreshold) return true;
return countLinesUpTo(content, lineThreshold) > lineThreshold;
}

export function getLargeFileMeta(content: string): {
isLarge: boolean;
approxLineCount: number;
} {
if (!content) {
return { isLarge: false, approxLineCount: 0 };
}

const approxLineCount = countLinesUpTo(content, LARGE_FILE_LINE_THRESHOLD + 1);
const isLarge =
content.length >= LARGE_FILE_CHAR_THRESHOLD || approxLineCount > LARGE_FILE_LINE_THRESHOLD;

return { isLarge, approxLineCount };
}

export async function waitForIdle(timeoutMs = 200): Promise<void> {
if (typeof requestIdleCallback === "function") {
return new Promise((resolve) => {
requestIdleCallback(() => resolve(), { timeout: timeoutMs });
});
}

return new Promise((resolve) => setTimeout(resolve, 0));
}
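A sketch of how these helpers might be wired into the editor: tokenize small buffers immediately, but defer heavy highlighting for large buffers until the main thread is idle. The `tokenize` callback and the call site are hypothetical stand-ins, not APIs defined in this diff:

import { getLargeFileMeta, waitForIdle } from "./large-file";

// Hypothetical call site: wait for an idle slice before doing the heavy work
// on buffers that cross the large-file thresholds.
export async function scheduleHighlight(
  content: string,
  tokenize: (content: string) => Promise<void>,
): Promise<void> {
  const { isLarge, approxLineCount } = getLargeFileMeta(content);

  if (!isLarge) {
    await tokenize(content);
    return;
  }

  console.warn(`Large buffer (~${approxLineCount}+ lines); deferring highlight`);
  await waitForIdle(200);
  await tokenize(content);
}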