-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathconvert_from_old_metalist.py
64 lines (51 loc) · 1.94 KB
/
convert_from_old_metalist.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import json
import os
import sqlite3
from contextlib import closing

from bs4 import BeautifulSoup

from metalist.utils.crud import get_database_path
from metalist import config
from metalist.utils.initialize import initialize_database, set_db_version
def main():
    """Convert the old cleartext metalist database into a fresh database.

    Reads every row of the ``items`` table in ``metalist.cleartext.db``
    (looked up in the current working directory), parses column 1 of each
    row as JSON, passes the parsed item through ``cleanup`` and inserts the
    re-serialized item, keyed by its ``id`` field, into a newly created
    ``items`` table in the database file located at
    ``get_database_path(config.default_db_name)``. Any file already present
    at that path is deleted before the new database is created.

    Progress is reported on stdout. Both connections are closed on exit,
    including when an error interrupts the conversion.

    Raises:
        FileNotFoundError: if ``metalist.cleartext.db`` does not exist in
            the current working directory.
    """
    source_path = "metalist.cleartext.db"  # must be in local dir

    print('conversion...')

    # sqlite3.connect() would silently create an empty file for a missing
    # source, and the failure would only surface after the target database
    # had already been deleted. Fail before touching anything instead.
    if not os.path.exists(source_path):
        raise FileNotFoundError(
            f'{source_path} not found in the current directory'
        )

    with closing(sqlite3.connect(source_path)) as source_db:
        # NOTE(review): the database is initialized and versioned here, and
        # then the file at db_path is removed just below. If these helpers
        # write to that same file their work is discarded -- confirm that
        # this order is intended.
        initialize_database(config.default_db_name)
        set_db_version(config.default_db_name)
        db_path = get_database_path(config.default_db_name)
        if os.path.exists(db_path):
            os.remove(db_path)

        with closing(sqlite3.connect(db_path)) as target_db:
            target_db.execute(
                'CREATE TABLE IF NOT EXISTS items (id INTEGER PRIMARY KEY, value TEXT NOT NULL);'
            )

            rows = source_db.execute(
                'SELECT * from items ORDER BY key DESC'
            ).fetchall()

            count = 0
            for row in rows:
                item = json.loads(row[1])
                # Capture the id before cleanup() mutates the item.
                item_id = item['id']
                cleanup(item)
                value = json.dumps(item)
                target_db.execute(
                    'INSERT INTO items (id, value) VALUES (?, ?)',
                    (item_id, value),
                )
                count += 1
            print(f'done creating {count} rows')
            target_db.commit()

            # Read the rows back to confirm what was actually stored.
            rows = target_db.execute(
                'SELECT * from items ORDER BY id DESC'
            ).fetchall()
            print(f'confirm {len(rows)} rows')
def cleanup(item):
    """Normalize an item's subitem HTML and drop its timestamp, in place.

    For every entry in ``item['subitems']`` the ``data`` string is parsed as
    HTML, each ``<div spellcheck="false">`` wrapper is replaced by its own
    children, and a single trailing ``<br/>`` is stripped from the
    re-serialized markup before it is stored back under ``data``.
    Afterwards the ``timestamp`` key is removed from ``item`` if present.

    Args:
        item: dict with a ``subitems`` iterable of dicts, each holding an
            HTML string under ``data``.

    Returns:
        None. ``item`` and its subitems are modified directly.
    """
    trailing_break = "<br/>"

    for sub in item['subitems']:
        tree = BeautifulSoup(sub['data'], 'html.parser')

        # Unwrap the matching divs: keep their contents, drop the tag.
        for wrapper in tree.find_all('div', attrs={'spellcheck': 'false'}):
            wrapper.unwrap()

        markup = str(tree)
        if markup.endswith(trailing_break):
            markup = markup[:-len(trailing_break)]
        sub['data'] = markup

    item.pop('timestamp', None)
# Run the conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()