prepare_kbp37.py (1753B)
"""Prepare the supervised KBP37 dataset.

Run as a script, this module fetches the KBP37 archive from GitHub (pinned to
a fixed commit), then serializes its train/valid/test splits through the
shared ``gbure.data.preprocessing`` helpers.
"""

from typing import Dict, Iterable, Tuple
import argparse
import pathlib

import tqdm

from gbure.utils import DATA_PATH
import gbure.data.prepare_semeval
import gbure.data.preprocessing as preprocessing

# Directory under DATA_PATH used for both the extracted archive and the output.
DATASET_PATH: pathlib.Path = DATA_PATH / "KBP37"
# Commit of the zhangdongxu/kbp37 repository referenced by DOWNLOAD_URL.
COMMIT_ID: str = "7d88486ad632a9c6e9fe6adbc2468049e89bc11d"
# Sub-directory of DATASET_PATH from which the split files are read.
DIRECTORY_NAME: str = f"kbp37-{COMMIT_ID}"
ARCHIVE_NAME: str = "kbp37_data.zip"
ARCHIVE_SHA512: str = "f6661df79d327a34ad4198f0405d7c06e05af4d9aab3723282c02270394c6511df41a330118d2be89f390b9d36c1eb2a32800db01a3d4387b990493befb011ac"
DOWNLOAD_URL: str = f"https://github.com/zhangdongxu/kbp37/archive/{COMMIT_ID}.zip"

# Passed as the second argument of read_data for each split
# (presumably the expected number of samples -- confirm against read_data).
TRAIN_SIZE: int = 15917
VALID_SIZE: int = 1724
TEST_SIZE: int = 3405
# Relation label forwarded to serialize_dataset as ``unknown_relation``.
UNKNOWN_RELATION: str = "no_relation"


def read_splits() -> Dict[str, Iterable[Tuple[str, str, str, str, str]]]:
    """Read the three KBP37 split files with the SemEval reader.

    Returns:
        A dict with keys ``"train"``, ``"valid"`` and ``"test"`` (in that
        order), each mapped to whatever
        ``gbure.data.prepare_semeval.read_data`` returns for the matching
        file (``train.txt``, ``dev.txt``, ``test.txt``) and size constant.
    """
    root = DATASET_PATH / DIRECTORY_NAME
    # (split name, file inside the extracted archive, size handed to read_data)
    layout = (
        ("train", "train.txt", TRAIN_SIZE),
        ("valid", "dev.txt", VALID_SIZE),
        ("test", "test.txt", TEST_SIZE),
    )
    reader = gbure.data.prepare_semeval.read_data
    return {split: reader(root / filename, size) for split, filename, size in layout}


if __name__ == "__main__":
    parser: argparse.ArgumentParser = preprocessing.base_argument_parser("Prepare the supervised KBP37 dataset.")
    args: argparse.Namespace = parser.parse_args()
    name: str = preprocessing.dataset_name(args)

    # Fetch the archive first: the split files are read from its contents.
    preprocessing.get_zip_data(
        DATASET_PATH,
        DIRECTORY_NAME,
        ARCHIVE_NAME,
        ARCHIVE_SHA512,
        DOWNLOAD_URL,
    )

    # Evaluate the arguments in the same order as the call that consumes them.
    output_path = DATASET_PATH / name
    splits = read_splits()
    extra_options = preprocessing.args_to_serialize(args)
    preprocessing.serialize_dataset(
        supervision="supervised",
        path=output_path,
        splits=splits,
        unknown_relation=UNKNOWN_RELATION,
        **extra_options,
    )