-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdebug_chunks.py
More file actions
40 lines (32 loc) · 1.14 KB
/
debug_chunks.py
File metadata and controls
40 lines (32 loc) · 1.14 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
"""
Check how the PDF is being chunked
"""
import sys
sys.path.insert(0, 'src')
from utils.document_loader import DocumentLoader
from utils.text_chunker import SmartChunker
# Load the document and print a short summary of what the loader returned.
# The path is relative, so it is resolved against the current working
# directory at the time the script is run.
loader = DocumentLoader()
doc = loader.load_document('src/data/notes/oops concepts.pdf')
# NOTE(review): the code below indexes `doc` as a mapping with 'text',
# 'success' and 'metadata' keys, and calls .get() on doc['metadata'] --
# confirm against DocumentLoader.load_document's actual return value.
print("Document loaded:")
print(f"- Length: {len(doc['text'])} characters")
print(f"- Success: {doc['success']}")
# Fall back to 'N/A' when the metadata carries no 'filename' entry.
print(f"- Filename: {doc['metadata'].get('filename', 'N/A')}")
# Split the loaded text into chunks and preview the first few.
# NOTE(review): chunk_size/chunk_overlap are presumably measured in
# characters -- confirm against SmartChunker.
chunker = SmartChunker(chunk_size=1000, chunk_overlap=200)
chunks = chunker.chunk_text(doc['text'], doc['metadata'])
print(f"\nTotal chunks created: {len(chunks)}")
print("\nFirst 5 chunks:")
# Each chunk is indexed as a mapping with 'chunk_id' and 'text' keys.
for c in chunks[:5]:
    print(f"\nChunk {c['chunk_id']}:")
    print(f"  Length: {len(c['text'])} chars")
    # Show only the first 100 characters; the trailing "..." is always
    # printed, even when the chunk text is shorter than that.
    print(f"  Preview: {c['text'][:100]}...")
# Collect (chunk_id, text) pairs for every chunk whose text mentions
# "interface"; the match is case-insensitive because the text is lowercased
# before the substring test.
interface_chunks = [(c['chunk_id'], c['text']) for c in chunks if 'interface' in c['text'].lower()]
print(f"\n{'='*80}")
print(f"Chunks containing 'interface': {len(interface_chunks)}")
print(f"{'='*80}")
# Dump at most the first three matches, each truncated to 500 characters.
for cid, text in interface_chunks[:3]:
    print(f"\n--- Chunk {cid} ---")
    print(text[:500])
    print("...")