Skip to content

Commit dac2a73

Browse files
committed
Support code block in clipper
1 parent be1f35f commit dac2a73

File tree

3 files changed

+142
-4
lines changed

3 files changed

+142
-4
lines changed

examples/parse_code.ipynb

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": null,
6+
"metadata": {},
7+
"outputs": [],
8+
"source": [
9+
"from bs4 import BeautifulSoup, NavigableString\n",
10+
"\n",
11+
"html_doc = \"\"\"\n",
12+
" <pre\n",
13+
" style=\"\n",
14+
" box-sizing: border-box;\n",
15+
" font-family: 'Roboto Mono', sfmono-regular,\n",
16+
" consolas, 'liberation mono', menlo, courier,\n",
17+
" monospace;\n",
18+
" background: rgb(32, 33, 35) none repeat scroll\n",
19+
" 0% 0% / auto padding-box border-box;\n",
20+
" color: rgb(255, 255, 255);\n",
21+
" overflow-x: auto;\n",
22+
" border-bottom-left-radius: 4px;\n",
23+
" border-bottom-right-radius: 4px;\n",
24+
" margin: 0px;\n",
25+
" min-height: 44px;\n",
26+
" padding: 12px 16px;\n",
27+
" font-size: 15px;\n",
28+
" line-height: 24px;\n",
29+
" border-top-left-radius: 4px;\n",
30+
" border-top-right-radius: 4px;\n",
31+
" \"\n",
32+
" ><code style=\"box-sizing:border-box;font-family:&quot;Roboto Mono&quot;, sfmono-regular, consolas, &quot;liberation mono&quot;, menlo, courier, monospace;white-space:pre;\">\n",
33+
" <code style=\"box-sizing:border-box;float:left;font-family:&quot;Roboto Mono&quot;, sfmono-regular, consolas, &quot;liberation mono&quot;, menlo, courier, monospace;padding-right:16px;\"><span style=\"box-sizing:border-box;color:rgb(110, 110, 128);\">1\n",
34+
"</span><span style=\"box-sizing:border-box;color:rgb(110, 110, 128);\">2\n",
35+
"</span><span style=\"box-sizing:border-box;color:rgb(110, 110, 128);\">3\n",
36+
"</span><span style=\"box-sizing:border-box;color:rgb(110, 110, 128);\">4\n",
37+
"</span><span style=\"box-sizing:border-box;color:rgb(110, 110, 128);\">5\n",
38+
"</span><span style=\"box-sizing:border-box;color:rgb(110, 110, 128);\">6\n",
39+
"</span><span style=\"box-sizing:border-box;color:rgb(110, 110, 128);\">7\n",
40+
"</span><span style=\"box-sizing:border-box;color:rgb(110, 110, 128);\">8\n",
41+
"</span><span style=\"box-sizing:border-box;color:rgb(110, 110, 128);\">9\n",
42+
"</span><span style=\"box-sizing:border-box;color:rgb(110, 110, 128);\">10\n",
43+
"</span><span style=\"box-sizing:border-box;color:rgb(110, 110, 128);\">11\n",
44+
"</span><span style=\"box-sizing:border-box;color:rgb(110, 110, 128);\">12\n",
45+
"</span></code>\n",
46+
"<span style=\"box-sizing:border-box;color:rgba(255, 255, 255, 0.5);\"># Note: you need to be using OpenAI Python v0.27.0 for the code below to work</span><span style=\"box-sizing:border-box;\"\n",
47+
"/><span style=\"box-sizing:border-box;\"/><span style=\"box-sizing:border-box;color:rgb(46, 149, 211);\">import</span><span style=\"box-sizing:border-box;\"> openai\n",
48+
"</span>\n",
49+
"openai.ChatCompletion.create(\n",
50+
"<span style=\"box-sizing:border-box;\"> model=</span><span style=\"box-sizing:border-box;color:rgb(0, 166, 125);\">\"gpt-3.5-turbo\"</span><span style=\"box-sizing:border-box;\">,\n",
51+
"</span> messages=[\n",
52+
"<span style=\"box-sizing:border-box;\"> {</span><span style=\"box-sizing:border-box;color:rgb(0, 166, 125);\">\"role\"</span><span style=\"box-sizing:border-box;\">: </span><span style=\"box-sizing:border-box;color:rgb(0, 166, 125);\">\"system\"</span><span style=\"box-sizing:border-box;\">, </span><span style=\"box-sizing:border-box;color:rgb(0, 166, 125);\">\"content\"</span><span style=\"box-sizing:border-box;\">: </span><span style=\"box-sizing:border-box;color:rgb(0, 166, 125);\">\"You are a helpful assistant.\"</span><span style=\"box-sizing:border-box;\">},\n",
53+
"</span><span style=\"box-sizing:border-box;\"> {</span><span style=\"box-sizing:border-box;color:rgb(0, 166, 125);\">\"role\"</span><span style=\"box-sizing:border-box;\">: </span><span style=\"box-sizing:border-box;color:rgb(0, 166, 125);\">\"user\"</span><span style=\"box-sizing:border-box;\">, </span><span style=\"box-sizing:border-box;color:rgb(0, 166, 125);\">\"content\"</span><span style=\"box-sizing:border-box;\">: </span><span style=\"box-sizing:border-box;color:rgb(0, 166, 125);\">\"Who won the world series in 2020?\"</span><span style=\"box-sizing:border-box;\">},\n",
54+
"</span><span style=\"box-sizing:border-box;\"> {</span><span style=\"box-sizing:border-box;color:rgb(0, 166, 125);\">\"role\"</span><span style=\"box-sizing:border-box;\">: </span><span style=\"box-sizing:border-box;color:rgb(0, 166, 125);\">\"assistant\"</span><span style=\"box-sizing:border-box;\">, </span><span style=\"box-sizing:border-box;color:rgb(0, 166, 125);\">\"content\"</span><span style=\"box-sizing:border-box;\">: </span><span style=\"box-sizing:border-box;color:rgb(0, 166, 125);\">\"The Los Angeles Dodgers won the World Series in 2020.\"</span><span style=\"box-sizing:border-box;\">},\n",
55+
"</span><span style=\"box-sizing:border-box;\"> {</span><span style=\"box-sizing:border-box;color:rgb(0, 166, 125);\">\"role\"</span><span style=\"box-sizing:border-box;\">: </span><span style=\"box-sizing:border-box;color:rgb(0, 166, 125);\">\"user\"</span><span style=\"box-sizing:border-box;\">, </span><span style=\"box-sizing:border-box;color:rgb(0, 166, 125);\">\"content\"</span><span style=\"box-sizing:border-box;\">: </span><span style=\"box-sizing:border-box;color:rgb(0, 166, 125);\">\"Where was it played?\"</span><span style=\"box-sizing:border-box;\">}\n",
56+
"</span> ]\n",
57+
")</code></pre>\n",
58+
"\"\"\"\n",
59+
"soup = BeautifulSoup(html_doc, 'html.parser')\n",
60+
"\n",
61+
"# 找到所有的<pre>标签\n",
62+
"pre_tags = soup.find_all('pre')\n",
63+
"\n",
64+
"for pre in pre_tags:\n",
65+
" # 在每个<pre>标签中找到<code>标签\n",
66+
" code_tags = pre.find_all('code')\n",
67+
" \n",
68+
" for code in code_tags:\n",
69+
" # 检查<code>标签是否包含行号,这里假设行号是在<span>标签中的数字\n",
70+
" span_tags = code.find_all('span')\n",
71+
" \n",
72+
" for span in span_tags:\n",
73+
" if span.string and span.string.strip().isdigit():\n",
74+
" # 如果是行号,则删除这个<span>标签\n",
75+
" span.decompose()\n",
76+
"\n",
77+
"# 这时,soup中的HTML已经没有行号了\n",
78+
"print(soup.prettify())\n"
79+
]
80+
}
81+
],
82+
"metadata": {
83+
"kernelspec": {
84+
"display_name": "notion",
85+
"language": "python",
86+
"name": "python3"
87+
},
88+
"language_info": {
89+
"codemirror_mode": {
90+
"name": "ipython",
91+
"version": 3
92+
},
93+
"file_extension": ".py",
94+
"mimetype": "text/x-python",
95+
"name": "python",
96+
"nbconvert_exporter": "python",
97+
"pygments_lexer": "ipython3",
98+
"version": "3.11.2"
99+
},
100+
"orig_nbformat": 4
101+
},
102+
"nbformat": 4,
103+
"nbformat_minor": 2
104+
}

html2notion/translate/html2json_base.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -294,15 +294,19 @@ def get_color(styles: dict, attrs):
294294
color = attrs['color']
295295
if not color:
296296
return "default"
297+
# The color may also be RGBA (example below); only the first three numbers (r, g, b) are used and the trailing alpha is ignored
298+
# rgba(174, 174, 188, 0.2)
297299
if color.startswith("rgb"):
298-
r, g, b = [int(x.strip()) for x in color[4:-1].split(",")]
300+
color_values = [int(x.strip()) for x in re.findall(r'\d+', color)]
301+
if len(color_values) >= 3:
302+
r, g, b = color_values[:3]
303+
return Html2JsonBase._closest_color(r, g, b)
299304
# Check if color is in hexadecimal format
300305
elif re.match(r'^#(?:[0-9a-fA-F]{3}){1,2}$', color):
301306
r, g, b = Html2JsonBase._hex_to_rgb(color)
302-
else:
303-
return "default"
307+
return Html2JsonBase._closest_color(r, g, b)
304308

305-
return Html2JsonBase._closest_color(r, g, b)
309+
return "default"
306310

307311
def convert_paragraph(self, soup):
308312
json_obj = {

html2notion/translate/html2json_clipper.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,9 @@ def get_block_type(self, element):
5959
return Block.BULLETED_LIST.value
6060
elif tag_name == 'p':
6161
return Block.PARAGRAPH.value
62+
elif element.name == 'pre' and element.code:
63+
return Block.CODE.value
64+
6265
return Block.FAIL.value
6366

6467
def convert_children(self, soup):
@@ -80,4 +83,31 @@ def convert_children(self, soup):
8083
logger.warning(f"Unknown cnvert {element}, {block_type}")
8184
return
8285

86+
# <pre><code><code>line number</code>... code content ...</code></pre>
87+
def convert_code(self, soup):
    """Build a ``code`` block JSON object from a ``<pre>`` element.

    Expected markup:
    ``<pre><code><code>line number</code>... code content ...</code></pre>``

    The first ``<code>`` found under ``soup`` is treated as the container.
    Any direct child that is itself a ``<code>`` tag (the line-number
    column) is skipped; every other child is passed to
    ``self.generate_inline_obj`` and the collected results are combined
    with ``self.merge_rich_text``.

    Returns:
        The block dict (``object``/``type``/``code`` keys, language fixed
        to ``"plain text"``), or ``None`` after logging an error when no
        ``<code>`` tag is found.
    """
    outer_code = soup.code
    if not outer_code:
        logger.error(f'No code tag found in {soup}')
        return

    if isinstance(outer_code, Tag):
        nodes = list(outer_code.children)
    else:
        nodes = [outer_code]

    fragments = []
    for node in nodes:
        # A nested <code> directly under the outer one holds line numbers.
        holds_line_numbers = isinstance(node, Tag) and node.name == "code"
        if holds_line_numbers:
            logger.debug(f'Skip line number')
            continue
        inline_objs = self.generate_inline_obj(node)
        if inline_objs:
            fragments.extend(inline_objs)

    return {
        "object": "block",
        "type": "code",
        "code": {
            "rich_text": self.merge_rich_text(fragments),
            "language": "plain text",
        },
    }
111+
112+
83113
Html2JsonBase.register(YinXiangClipper_Type, Html2JsonYinXiang)

0 commit comments

Comments
 (0)