Skip to content

Fixed MD links attached to the right span instead of the whole sentence #174

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 46 additions & 4 deletions pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py
Original file line number Diff line number Diff line change
Expand Up @@ -351,14 +351,56 @@ def resolve_links(links, span):
None or a string representing the link in MD format.
"""
bbox = pymupdf.Rect(span["bbox"]) # span bbox
# a link should overlap at least 70% of the span
text = span["text"].strip()

# Calculate the average width per character
bbox_width = bbox.width
text_length = len(text)
avg_char_width = bbox_width / text_length if text_length > 0 else 0

for link in links:
print(link)
hot = link["from"] # the hot area of the link
middle = (hot.tl + hot.br) / 2 # middle point of hot area
middle = (hot.tl + hot.br) / 2 # middle point of the hot area

# Check if the middle point of the link intersects with the span's bounding box
if not middle in bbox:
continue # does not touch the bbox
text = f'[{span["text"].strip()}]({link["uri"]})'
return text

# Calculate the character indices corresponding to the link's bounding box
link_x_min = hot.tl.x - bbox.x0 # adjust for bbox origin
link_x_max = hot.br.x - bbox.x0 # adjust for bbox origin

# Calculate character indices
char_index_min = int(link_x_min / avg_char_width)
char_index_max = int(link_x_max / avg_char_width)

# Ensure the indices are within the bounds of the text
char_index_min = max(0, char_index_min)
char_index_max = min(text_length, char_index_max)

# Handle some corner cases.
# - Single letter link
# - Check if the link starts at the very beginning.
if char_index_min == char_index_max == 0:
return f'[{span["text"].strip()}]({link["uri"]})'
if char_index_min != 0:
# Adjust char_index_min to ensure it lands exactly at the beginning of the link text
while char_index_min < char_index_max and (char_index_min > 0 and text[char_index_min-1] != ' '):
char_index_min += 1

# Adjust char_index_max to ensure it captures the entire link text and stops at the next space
while char_index_max < text_length and text[char_index_max] != ' ':
char_index_max -= 1

# Extract the substring corresponding to the link
link_text = text[char_index_min:char_index_max].strip()
# Format the text as a Markdown link
markdown_link = f'[{link_text}]({link["uri"]})'
# Replace the link text in the original text with the Markdown link
surrounding_text = text[:char_index_min] + markdown_link + text[char_index_max:]
return surrounding_text # Return the text with the Markdown link included
return None

def save_image(page, rect, i):
"""Optionally render the rect part of a page.
Expand Down