-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathPivotal-Embedding.py
82 lines (64 loc) · 3.05 KB
/
Pivotal-Embedding.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import uuid
import re
import json
import argparse
from instant_clip_tokenizer import Tokenizer
def clean_text(text):
    """Return *text* with every '"', ',', '.', ';' and '*' character removed.

    All other characters, including parentheses, are left as they are.
    """
    unwanted_symbols = re.compile(r'[",.;*]')
    return unwanted_symbols.sub('', text)
def process_text(input_file_path):
    """Build one embedding entry per usable name in a text file.

    Every line is stripped and passed through ``clean_text``. Lines that have
    no words left afterwards are skipped; each remaining line produces one
    entry dict.

    Args:
        input_file_path: Path of a text file with one name per line. The file
            is read as UTF-8.

    Returns:
        A list of dicts in file order. In each dict, ``placeholder`` is the
        cleaned name, ``token_count`` is the length of
        ``Tokenizer().encode(name)``, ``initial_embedding_text`` is the first
        whitespace-separated word of the name, and ``uuid`` is a freshly
        generated random UUID string. The remaining keys hold fixed values.
    """
    tokenizer = Tokenizer()
    entries = []
    # Read as UTF-8 explicitly so non-ASCII names do not depend on the
    # platform's locale default encoding.
    with open(input_file_path, 'r', encoding='utf-8') as file:
        for line in file:
            name = clean_text(line.strip())
            words = name.split()
            if not words:
                # Covers empty lines and lines that are only whitespace once
                # the symbols are removed (e.g. ", ."), which would otherwise
                # fail when taking the first word.
                continue
            entries.append({
                "__version": 0,
                "uuid": str(uuid.uuid4()),
                "model_name": "",
                "placeholder": name,
                "train": True,
                "stop_training_after": None,
                "stop_training_after_unit": "NEVER",
                "token_count": len(tokenizer.encode(name)),
                # name is already cleaned, so its first word needs no
                # second clean_text pass.
                "initial_embedding_text": words[0],
            })
    return entries
def main():
    """Command-line entry point: generate embedding entries and write them as JSON.

    Command-line arguments:
        -s / --source: input text file with names (required).
        -d / --destination: JSON file to write (required).
        -lp / --loadpreset: optional JSON preset file. When given, its
            "additional_embeddings" value is replaced with the generated
            entries and the rest of the preset is written back unchanged.

    Without a preset, the output is a JSON object that contains only the
    "additional_embeddings" key.
    """
    parser = argparse.ArgumentParser(description='Process text from an input file and generate a formatted output.')
    parser.add_argument('-s', '--source', required=True, help='Input .txt file with names')
    parser.add_argument('-d', '--destination', required=True, help='Output .json file to write formatted content')
    parser.add_argument('-lp', '--loadpreset', required=False, help='Input .json preset file to load')
    args = parser.parse_args()

    # Process the text from the input file.
    additional_embeddings = process_text(args.source)

    if args.loadpreset:
        # JSON text is UTF-8 by specification, so do not rely on the locale
        # default encoding when loading the preset.
        with open(args.loadpreset, 'r', encoding='utf-8') as preset_file:
            output_data = json.load(preset_file)
        if not isinstance(output_data, dict):
            # Item assignment below needs a JSON object; report anything else
            # as a usage error instead of an opaque TypeError.
            parser.error(f"preset '{args.loadpreset}' must contain a JSON object at the top level")
        # Replace the preset's "additional_embeddings" value with the new entries.
        output_data["additional_embeddings"] = additional_embeddings
        message = f"The formatted output has been written to '{args.destination}' using preset '{args.loadpreset}'"
    else:
        # Create a new JSON structure if no preset is provided.
        output_data = {
            "additional_embeddings": additional_embeddings
        }
        message = f"The formatted output has been written to '{args.destination}'"

    # Single write path shared by both cases.
    with open(args.destination, 'w', encoding='utf-8') as output_file:
        json.dump(output_data, output_file, indent=4)
    print(message)
# Run the command-line entry point only when this file is executed as a script.
if __name__ == "__main__":
    main()