Skip to content

Commit

Permalink
Alpaca dataset: add support for the JSONL format (#18)
Browse files Browse the repository at this point in the history
  • Loading branch information
linziyi96 authored Aug 4, 2023
1 parent 94e345c commit 89425cd
Showing 1 changed file with 20 additions and 1 deletion.
21 changes: 20 additions & 1 deletion accessory/data/alpaca.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import copy
import torchvision.transforms as transforms
import numpy as np
import os


try:
Expand Down Expand Up @@ -60,7 +61,25 @@ def __init__(self, config_path, transform=transform_train, max_words=30, image_w
print(self.config)
group_ann = {}
for meta_path, meta_type in self.config['META']:
meta_l = json.load(open(meta_path))
meta_ext = os.path.splitext(meta_path)[-1]
if meta_ext == ".json":
with open(meta_path) as f:
meta_l = json.load(f)
elif meta_ext == ".jsonl":
meta_l = []
with open(meta_path) as f:
for i, line in enumerate(f):
try:
meta_l.append(json.loads(line))
except json.decoder.JSONDecodeError as e:
print(f"Error decoding the following jsonl line ({i}):\n{line.rstrip()}", force=True)
raise e
else:
raise NotImplementedError(
f"Unknown meta file extension: \"{meta_ext}\". Currently, .json and .jsonl files are supported. "
"If you are using a supported format, please set the file extension so that the proper parsing "
"routine can be called."
)
if meta_type not in group_ann:
group_ann[meta_type] = []
print(f"{meta_path}, type{meta_type}: len {len(meta_l)}")
Expand Down

0 comments on commit 89425cd

Please sign in to comment.