Skip to content

Commit

Permalink
Add document_id to row index mapper (#103)
Browse files Browse the repository at this point in the history
  • Loading branch information
lovit committed Oct 5, 2020
1 parent 362d984 commit f1d4ad0
Showing 1 changed file with 8 additions and 6 deletions.
14 changes: 8 additions & 6 deletions Korpora/korpus_modu.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import json
import os
from dataclasses import dataclass
from glob import glob
from tqdm import tqdm
Expand Down Expand Up @@ -29,19 +30,20 @@
정확한 라이센스는 확인 중 입니다."""


class ModuKorpus(Korpus):
    """Korpus subclass that registers this module's description and license.

    Construction only forwards the module-level ``description`` and
    ``license`` texts to the ``Korpus`` base initializer.
    """

    def __init__(self, root_dir=None, force_download=False):
        """Initialize the base ``Korpus`` with the module description/license.

        Args:
            root_dir: Accepted for interface compatibility; not read here.
            force_download: Accepted for interface compatibility; not read
                here.
        """
        # NOTE(review): neither argument is used in this body — confirm
        # whether loading/downloading is intentionally left to other classes.
        super().__init__(description, license)


class ModuNewsKorpus(Korpus):
    """Korpus built from Modu news files, with document-id/row lookups.

    Attributes:
        train: ``ModuNewsData`` wrapping ``load_modu_news(paths, load_light)``.
        row_to_documentid: ``document_id`` of every item in ``train``, in
            iteration (row) order.
        documentid_to_row: Mapping from ``document_id`` to its row index in
            ``row_to_documentid``. If an id occurs more than once, the last
            row wins.
    """

    def __init__(self, root_dir_or_paths, load_light=True, force_download=False):
        """Resolve the input files, load them, and build the id/row indexes.

        Args:
            root_dir_or_paths: One of
                * a ``str`` naming an existing directory — files matching
                  ``N*RW*.json`` directly inside it are used;
                * any other ``str`` — treated as a glob wildcard pattern;
                * a non-``str`` value — passed to ``load_modu_news`` as-is
                  (presumably an iterable of file paths; verify against
                  callers).
            load_light: Forwarded unchanged to ``load_modu_news``.
            force_download: Accepted for interface compatibility; not read in
                this body.
        """
        super().__init__(description, license)

        if isinstance(root_dir_or_paths, str):
            if os.path.isdir(root_dir_or_paths):
                pattern = f'{root_dir_or_paths}/N*RW*.json'
            else:
                # Not a directory: interpret the string itself as a wildcard.
                pattern = root_dir_or_paths
            # Glob exactly once; sorting keeps the file order deterministic.
            paths = sorted(glob(pattern))
        else:
            paths = root_dir_or_paths

        self.train = ModuNewsData(load_modu_news(paths, load_light))

        # Two-way lookup between row position in ``train`` and document_id.
        self.row_to_documentid = [news.document_id for news in self.train]
        self.documentid_to_row = {
            document_id: idx
            for idx, document_id in enumerate(self.row_to_documentid)
        }


class ModuNewsData(KorpusData):
Expand Down

0 comments on commit f1d4ad0

Please sign in to comment.