gbure

Graph-based approaches to unsupervised relation extraction, evaluated as a few-shot problem
git clone https://esimon.eu/repos/gbure.git
Log | Files | Refs | README | LICENSE

prepare_kbp37.py (1753B)


      1 from typing import Dict, Iterable, Tuple
      2 import argparse
      3 import pathlib
      4 
      5 import tqdm
      6 
      7 from gbure.utils import DATA_PATH
      8 import gbure.data.prepare_semeval
      9 import gbure.data.preprocessing as preprocessing
     10 
     11 DATASET_PATH: pathlib.Path = DATA_PATH / "KBP37"
     12 COMMIT_ID: str = "7d88486ad632a9c6e9fe6adbc2468049e89bc11d"
     13 DIRECTORY_NAME: str = f"kbp37-{COMMIT_ID}"
     14 ARCHIVE_NAME: str = "kbp37_data.zip"
     15 ARCHIVE_SHA512: str = "f6661df79d327a34ad4198f0405d7c06e05af4d9aab3723282c02270394c6511df41a330118d2be89f390b9d36c1eb2a32800db01a3d4387b990493befb011ac"
     16 DOWNLOAD_URL: str = f"https://github.com/zhangdongxu/kbp37/archive/{COMMIT_ID}.zip"
     17 
     18 TRAIN_SIZE: int = 15917
     19 VALID_SIZE: int = 1724
     20 TEST_SIZE: int = 3405
     21 UNKNOWN_RELATION: str = "no_relation"
     22 
     23 
     24 def read_splits() -> Dict[str, Iterable[Tuple[str, str, str, str, str]]]:
     25     return {"train": gbure.data.prepare_semeval.read_data(DATASET_PATH / DIRECTORY_NAME / "train.txt", TRAIN_SIZE),
     26             "valid": gbure.data.prepare_semeval.read_data(DATASET_PATH / DIRECTORY_NAME / "dev.txt", VALID_SIZE),
     27             "test": gbure.data.prepare_semeval.read_data(DATASET_PATH / DIRECTORY_NAME / "test.txt", TEST_SIZE)}
     28 
     29 
     30 if __name__ == "__main__":
     31     parser: argparse.ArgumentParser = preprocessing.base_argument_parser("Prepare the supervised KBP37 dataset.")
     32     args: argparse.Namespace = parser.parse_args()
     33     name: str = preprocessing.dataset_name(args)
     34 
     35     preprocessing.get_zip_data(DATASET_PATH, DIRECTORY_NAME, ARCHIVE_NAME, ARCHIVE_SHA512, DOWNLOAD_URL)
     36     preprocessing.serialize_dataset(
     37             supervision="supervised",
     38             path=DATASET_PATH / name,
     39             splits=read_splits(),
     40             unknown_relation=UNKNOWN_RELATION,
     41             **preprocessing.args_to_serialize(args))