Skip to content
Merged
Changes from all commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
364ec66
cifar10 dataset class
AvindaShamal Dec 6, 2024
397b8c2
Addressed review comments
AvindaShamal Dec 10, 2024
6fc5669
temporary
AvindaShamal Dec 11, 2024
7196a77
deleted test folder
AvindaShamal Dec 11, 2024
7491940
Deleted __init__.py
AvindaShamal Dec 11, 2024
6bd5e7e
removed unwanted spaces
AvindaShamal Dec 11, 2024
c64da2c
Merge branch 'main' of https://github.com/AvindaShamal/intern-ml
AvindaShamal Dec 11, 2024
39c4b80
solved all issues
AvindaShamal Dec 13, 2024
a8e2f2a
Updated settings.json
AvindaShamal Dec 13, 2024
3a39d40
Added CustomImageDataset for Hugging Face datasets and packaged the p…
AvindaShamal Dec 14, 2024
4ba0a28
Merge branch 'main' of https://github.com/AvindaShamal/intern-ml
AvindaShamal Dec 14, 2024
345e1b0
removed huggingface files and egg-info
AvindaShamal Dec 16, 2024
c037338
deleted egg-info files
AvindaShamal Dec 16, 2024
cd169da
modified .gitignore
AvindaShamal Dec 16, 2024
a76aa9e
modified .gitignore
AvindaShamal Dec 16, 2024
877cd3e
Merge branch 'fcodelabs:main' into main
AvindaShamal Dec 16, 2024
5e8e71a
Merge branch 'fcodelabs:main' into main
AvindaShamal Dec 17, 2024
00520d6
Create python-app.yml
AvindaShamal Dec 17, 2024
fd383b6
Update python-app.yml
AvindaShamal Dec 17, 2024
254dd16
Update python-app.yml
AvindaShamal Dec 18, 2024
c6ccc9a
Update python-app.yml
AvindaShamal Dec 18, 2024
4cb6508
Create python1-app.yml
AvindaShamal Dec 18, 2024
37e42be
Remove python1-app.yml
AvindaShamal Dec 18, 2024
1b83feb
Create test12.yml
AvindaShamal Dec 18, 2024
2c62366
Update python-app.yml
AvindaShamal Dec 18, 2024
834ce38
Update __init__.py
AvindaShamal Dec 18, 2024
435fcc2
Update __init__.py
AvindaShamal Dec 18, 2024
243f68a
Merge branch 'fcodelabs:main' into main
AvindaShamal Dec 19, 2024
bea0872
Update GitHub Actions workflow
AvindaShamal Dec 19, 2024
7c12ee7
Remove Python application workflow from GitHub Actions
AvindaShamal Dec 20, 2024
5a86d12
Merge branch 'fcodelabs:main' into main
AvindaShamal Jan 2, 2025
a2b40f0
Merge branch 'fcodelabs:main' into main
AvindaShamal Jan 2, 2025
5742770
Merge branch 'fcodelabs:main' into main
AvindaShamal Jan 3, 2025
a2058e0
Merge branch 'fcodelabs:main' into main
AvindaShamal Jan 8, 2025
27304e7
Add dataset preprocessing script for OCR data handling
AvindaShamal Jan 9, 2025
3554784
Add functionality to push OCR dataset
AvindaShamal Jan 10, 2025
cbd99a0
Add dataset information to the OCR dataset
AvindaShamal Jan 16, 2025
7ea42cd
transfer the dataset from private profile to organization
AvindaShamal Jan 16, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
102 changes: 102 additions & 0 deletions avinda_shamal/src/preprocessing/dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
import json
from PIL import Image as PILImage
from datasets import (
Image,
Dataset,
Features,
Value,
Sequence,
Array2D,
DatasetInfo,
SplitDict,
Split,
DatasetDict,
load_from_disk,
)

# Fields stored for every annotated text region: a 4x2 float32 box,
# the transcribed text and an integer class label.
_ANNOTATION_FIELDS = {
    "box": Array2D(dtype="float32", shape=(4, 2)),
    "text": Value("string"),
    "label": Value("int32"),
}

# Per-record schema: the image, its dimensions and its annotation list.
_RECORD_FEATURES = Features(
    {
        "image": Image(),
        "height": Value("int32"),
        "width": Value("int32"),
        "annotations": Sequence(_ANNOTATION_FIELDS),
    }
)

# Splits declared in the dataset metadata.
_DECLARED_SPLITS = SplitDict(
    {
        "train": Split(name="train"),
        "test": Split("test"),
    }
)

# Metadata (description, license, homepage, schema, splits) for the OCR dataset.
dataset_info = DatasetInfo(
    description=(
        "This dataset contains OCR data for text detection and recognition tasks. "
        "Each image has annotated bounding boxes, labels, and corresponding text."
    ),
    citation="",
    license="MIT License",
    homepage="https://github.com/fcodelabs/intern-ml",
    features=_RECORD_FEATURES,
    dataset_name="WildReceipt",
    splits=_DECLARED_SPLITS,
)


def _reshape_annotation(annotation):
    """Return one annotation with its flat 8-value box regrouped into 4 pairs."""
    box = annotation["box"]
    return {
        # Consecutive values (0,1), (2,3), (4,5), (6,7) become the four rows.
        "box": [[box[i], box[i + 1]] for i in range(0, 8, 2)],
        "text": annotation["text"],
        "label": annotation["label"],
    }


def walk_through_json(file_name):
    """Load an annotation JSON file and reformat it into dataset records.

    The file must contain a JSON list. Every item is read for the keys
    ``file_name``, ``height``, ``width`` and ``annotations``; each annotation
    is read for ``box`` (at least 8 values), ``text`` and ``label``.

    Args:
        file_name: Path of the JSON annotation file.

    Returns:
        A list of dicts with the keys ``image`` (the image at ``file_name``
        converted to RGB), ``height``, ``width`` and ``annotations``. Items
        that cannot be processed are reported on stdout and skipped.

    Raises:
        OSError: If ``file_name`` cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # JSON text is UTF-8; do not depend on the platform's locale encoding.
    with open(file_name, "r", encoding="utf-8") as fi:
        items = json.load(fi)

    # parse and reformat the data
    data = []
    for item in items:
        try:
            annotations = [
                _reshape_annotation(annotation)
                for annotation in item["annotations"]
            ]
            # convert() returns a new image, so the source file handle can be
            # released as soon as the conversion is done.
            with PILImage.open(item["file_name"]) as source_image:
                rgb_image = source_image.convert("RGB")
            data.append(
                {
                    "image": rgb_image,
                    "height": item["height"],
                    "width": item["width"],
                    "annotations": annotations,
                }
            )
        except Exception as e:
            # Deliberate best-effort: one bad item must not abort the whole
            # run. The item may be the thing that is malformed, so look the
            # name up defensively instead of indexing it again.
            if isinstance(item, dict):
                item_name = item.get("file_name", "<unknown>")
            else:
                item_name = "<unknown>"
            print(f"Error processing item {item_name}: {e}")
    return data


# Build the dataset: parse both annotation files (paths are relative to the
# current working directory) into lists of records.
train_data = walk_through_json("train.json")
test_data = walk_through_json("test.json")
# Materialize each split using the schema declared in dataset_info.
train_dataset = Dataset.from_list(train_data, features=dataset_info.features)
test_dataset = Dataset.from_list(test_data, features=dataset_info.features)
# Group the two splits under their split names.
dataset = DatasetDict(
{
"train": train_dataset,
"test": test_dataset,
}
)
# NOTE(review): this sets an attribute on the DatasetDict container itself;
# confirm that save_to_disk / push_to_hub actually persist it. The metadata
# may need to be attached to each split's Dataset instead. TODO verify.
dataset.info = dataset_info

# save the dataset locally, in the "ocr_dataset" directory
dataset.save_to_disk("ocr_dataset")
print("Dataset Created Successfully")

# push to the hub: the copy just written to disk is reloaded and uploaded
# to the "fcodelabs/WildReceipt-OCR" repository
loaded_dataset = load_from_disk("ocr_dataset")
loaded_dataset.push_to_hub(repo_id="fcodelabs/WildReceipt-OCR")
print(loaded_dataset)
Loading