From 5574f2afda7b8b464172e31e9999001ca90f497f Mon Sep 17 00:00:00 2001 From: ckxz105 Date: Thu, 16 Apr 2020 13:46:11 -0700 Subject: [PATCH 001/278] add cache support with redis for embedding, ensure TSNE generate same results on same input --- kgtk/cli/text_embedding.py | 302 +++++++++++++++--------- kgtk/cli/text_embedding_README.md | 13 + kgtk/cli/text_embedding_requirement.txt | 3 +- 3 files changed, 203 insertions(+), 115 deletions(-) diff --git a/kgtk/cli/text_embedding.py b/kgtk/cli/text_embedding.py index 5e81a7422..2118da76d 100644 --- a/kgtk/cli/text_embedding.py +++ b/kgtk/cli/text_embedding.py @@ -22,10 +22,11 @@ class EmbeddingVector: - def __init__(self, model_name=None, query_server=None): + def __init__(self, model_name=None, query_server=None, cache_config:dict={}): from sentence_transformers import SentenceTransformer, SentencesDataset, LoggingHandler, losses, models # type: ignore import logging import re + self._logger = logging.getLogger(__name__) from collections import defaultdict if model_name is None: model_name = 'bert-base-nli-mean-tokens' @@ -41,11 +42,25 @@ def __init__(self, model_name=None, query_server=None): else: self.model_name = model_name self.model = SentenceTransformer(model_name) - if query_server is None: - self.wikidata_server = "https://dsbox02.isi.edu:8888/bigdata/namespace/wdq/sparql" + if query_server is None or query_server == "": + self.wikidata_server = "https://query.wikidata.org/sparql" else: self.wikidata_server = query_server - self.q_nodes_descriptions = dict() + use_cache = cache_config.get("use_cache", False) + if use_cache: + import redis + host = cache_config.get("host", "dsbox01.isi.edu") + port = cache_config.get("port", 6379) + self.redis_server = redis.Redis(host=host, port=port, db=0) + try: + _ = self.redis_server.get("foo") + self._logger.debug("Cache server {}:{} connected!".format(host, port)) + except: + self._logger.error("Cache server {}:{} is not able to be connected! 
Will not use cache!".format(host, port)) + self.redis_server = None + else: + self.redis_server = None + self.qnodes_descriptions = dict() self.vectors_map = dict() self.vectors_2D = None self.gt_nodes = set() @@ -56,7 +71,6 @@ def __init__(self, model_name=None, query_server=None): self.metadata = [] self.gt_indexes = set() self.input_format = "" - self._logger = logging.getLogger(__name__) self.token_patern = re.compile(r"(?u)\b\w\w+\b") @staticmethod @@ -78,14 +92,27 @@ def minDistance(word1, word2): return table[-1][-1] - def get_sentences_embedding(self, sentences: typing.List[str]): + def get_sentences_embedding(self, sentences: typing.List[str], qnodes: typing.List[str]): """ transform a list of sentences to embedding vectors """ # if sentences in self.embedding_cache: # return self.embedding_cache[sentences] # else: - sentence_embeddings = self.model.encode(sentences, show_progress_bar=False) + from ast import literal_eval + if self.redis_server is not None: + sentence_embeddings = [] + for each_node, each_sentence in zip(qnodes, sentences): + cache_res = self.redis_server.get(each_node+each_sentence) + if cache_res is not None: + sentence_embeddings.append(literal_eval(cache_res.decode("utf-8"))) + # self._logger.error("{} hit!".format(each_node+each_sentence)) + else: + each_embedding = self.model.encode([each_sentence], show_progress_bar=False) + sentence_embeddings.extend(each_embedding) + self.redis_server.set(each_node+each_sentence, str(each_embedding[0].tolist())) + else: + sentence_embeddings = self.model.encode(sentences, show_progress_bar=False) # self.embedding_cache[sentences] = sentence_embeddings return sentence_embeddings @@ -105,7 +132,7 @@ def send_sparql_query(self, query_body:str): results = qm.query().convert()['results']['bindings'] return results except: - raise ValueError("Sending Sparl query to {} failed!".format(wikidata_server)) + raise ValueError("Sending Sparl query to {} failed!".format(self.wikidata_server)) def get_item_description(self, qnodes: typing.List[str]=None, target_properties:dict={}, gt_label:str=""): """ @@ -117,7 +144,7 @@ def get_item_description(self, qnodes: typing.List[str]=None, target_properties: find_all_properties = True else: find_all_properties = False - + # self._logger.error(str(qnodes)) properties_list = [[] for _ in range(4)] used_p_node_ids = set() names = ["labels", "descriptions", "isa_properties", "has_properties"] @@ -131,97 +158,117 @@ def get_item_description(self, qnodes: typing.List[str]=None, target_properties: elif v == "has_properties": properties_list[3].append(k) - need_find_label = "label" in properties_list[0] - need_find_description = "description" in properties_list[1] - query_qnodes = "" - for each in qnodes: - query_qnodes += "wd:{} ".format(each) - - # this is used to get corresponding labels / descriptions - if need_find_label or need_find_description: - query_body = """ - select ?item ?itemDescription ?itemLabel - where { - values ?item {""" + query_qnodes + """ } - SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". 
} - } - """ - results = self.send_sparql_query(query_body) - for each in results: - each_node = each['item']['value'].split("/")[-1] - if 'itemDescription' in each: - description = each['itemDescription']['value'] - else: - description = "" - if "itemLabel" in each: - label = each['itemLabel']['value'] - # if each_node == self.gt[gt_label]: - # if self.minDistance(label, gt_label) > len(gt_label): - # a = "".join(self.token_patern.findall(label.lower())) - # b = "".join(self.token_patern.findall(gt_label.lower())) - # if a not in b and b not in a: - # self._logger.error("{} with {} --> {} edit distance too larger!!!".format(each_node, label, gt_label)) - else: - label = "" - if need_find_label: - self.candidates[each_node]["label_properties"] = [label] - if need_find_description: - self.candidates[each_node]["description_properties"] = [description] - - # this is used to get corresponding P node labels - query_body2 = "select ?item" - part2 = "" - for name, part in zip(names, properties_list): - for i, each in enumerate(part): - if each not in {"label", "description", "all"}: - used_p_node_ids.add(each) - query_body2 += " ?{}_{}Label".format(name, i) - part2 += """?item wdt:{} ?{}_{}. \n""".format(each, name, i) - query_body2 += """ + sentences_cache_dict = {} + if self.redis_server is not None: + for each_node in qnodes: + cache_res = self.redis_server.get(each_node+str(properties_list)) + if cache_res is not None: + sentences_cache_dict[each_node] = cache_res + # self._logger.error("{} hit!".format(each_node+str(properties_list))) + + if len(sentences_cache_dict) > 0: + qnodes = set(qnodes) - set(sentences_cache_dict.keys()) + + # only need to do query when we still have remained nodes + if len(qnodes) > 0: + need_find_label = "label" in properties_list[0] + need_find_description = "description" in properties_list[1] + query_qnodes = "" + for each in qnodes: + query_qnodes += "wd:{} ".format(each) + + # this is used to get corresponding labels / descriptions + if need_find_label or need_find_description: + query_body = """ + select ?item ?itemDescription ?itemLabel where { - values ?item {""" + query_qnodes + "}" - - query_body2 += part2 + """ + values ?item {""" + query_qnodes + """ } SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". 
} } - """ - results2 = self.send_sparql_query(query_body2) - for each in results2: - node_name = each['item']['value'].split("/")[-1] + """ + results = self.send_sparql_query(query_body) + for each in results: + each_node = each['item']['value'].split("/")[-1] + if 'itemDescription' in each: + description = each['itemDescription']['value'] + else: + description = "" + if "itemLabel" in each: + label = each['itemLabel']['value'] + # if each_node == self.gt[gt_label]: + # if self.minDistance(label, gt_label) > len(gt_label): + # a = "".join(self.token_patern.findall(label.lower())) + # b = "".join(self.token_patern.findall(gt_label.lower())) + # if a not in b and b not in a: + # self._logger.error("{} with {} --> {} edit distance too larger!!!".format(each_node, label, gt_label)) + else: + label = "" + if need_find_label: + self.candidates[each_node]["label_properties"] = [label] + if need_find_description: + self.candidates[each_node]["description_properties"] = [description] + + # this is used to get corresponding P node labels + query_body2 = "select ?item" + part2 = "" for name, part in zip(names, properties_list): - if len(part) > 0: - properties_res = set() - for i in range(len(part)): - property_key = '{}_{}Label'.format(name, i) - if property_key in each: - properties_res.add(each[property_key]['value']) - self.candidates[node_name][name] = properties_res - - # if need get all properties, we need to run extra query - if find_all_properties: - query_body3 = """ - select DISTINCT ?item ?p_entity ?p_entityLabel - where { - values ?item {"""+ query_qnodes + """} - ?item ?p ?o. - FILTER regex(str(?p), "^http://www.wikidata.org/prop/P", "i") - BIND (IRI(REPLACE(STR(?p), "http://www.wikidata.org/prop", "http://www.wikidata.org/entity")) AS ?p_entity) . - SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". } - } + for i, each in enumerate(part): + if each not in {"label", "description", "all"}: + used_p_node_ids.add(each) + query_body2 += " ?{}_{}Label".format(name, i) + part2 += """?item wdt:{} ?{}_{}. \n""".format(each, name, i) + query_body2 += """ + where { + values ?item {""" + query_qnodes + "}" + + query_body2 += part2 + """ + SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". } + } """ - results3 = self.send_sparql_query(query_body3) - for each in results3: + results2 = self.send_sparql_query(query_body2) + for each in results2: node_name = each['item']['value'].split("/")[-1] - p_node_id = each['p_entity']['value'].split("/")[-1] - p_node_label = each['p_entityLabel']['value'] - if p_node_id not in used_p_node_ids: - if "has_properties" in self.candidates[node_name]: - self.candidates[node_name]["has_properties"].add(p_node_label) - else: - self.candidates[node_name]["has_properties"] = set([p_node_label]) + for name, part in zip(names, properties_list): + if len(part) > 0: + properties_res = set() + for i in range(len(part)): + property_key = '{}_{}Label'.format(name, i) + if property_key in each: + properties_res.add(each[property_key]['value']) + self.candidates[node_name][name] = properties_res + + # if need get all properties, we need to run extra query + if find_all_properties: + query_body3 = """ + select DISTINCT ?item ?p_entity ?p_entityLabel + where { + values ?item {"""+ query_qnodes + """} + ?item ?p ?o. + FILTER regex(str(?p), "^http://www.wikidata.org/prop/P", "i") + BIND (IRI(REPLACE(STR(?p), "http://www.wikidata.org/prop", "http://www.wikidata.org/entity")) AS ?p_entity) . 
+                    SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
+                }
+                """
+                results3 = self.send_sparql_query(query_body3)
+                for each in results3:
+                    node_name = each['item']['value'].split("/")[-1]
+                    p_node_id = each['p_entity']['value'].split("/")[-1]
+                    p_node_label = each['p_entityLabel']['value']
+                    if p_node_id not in used_p_node_ids:
+                        if "has_properties" in self.candidates[node_name]:
+                            self.candidates[node_name]["has_properties"].add(p_node_label)
+                        else:
+                            self.candidates[node_name]["has_properties"] = set([p_node_label])
 
         for each_node_id in qnodes:
-            self.candidates[each_node_id]["sentence"] = self.attribute_to_sentence(self.candidates[each_node_id], each_node_id)
+            each_sentence = self.attribute_to_sentence(self.candidates[each_node_id], each_node_id)
+            self.candidates[each_node_id]["sentence"] = each_sentence
+            if self.redis_server is not None:
+                # self._logger.error("Pushed: {}".format(each_node_id+str(properties_list)))
+                self.redis_server.set(each_node_id+str(properties_list), each_sentence)
+
+        for each_node_id, sentence in sentences_cache_dict.items():
+            self.candidates[each_node_id]["sentence"] = sentence
 
     def read_input(self, file_path: str, skip_nodes_set: set=None,
@@ -234,11 +281,12 @@ def read_input(self, file_path: str, skip_nodes_set: set=None,
         from collections import defaultdict
         import pandas as pd # type: ignore
         import numpy as np
+        import math
 
         self.property_labels_dict = property_labels_dict
 
         if input_format == "test_format":
             self.input_format = input_format
-            input_df = pd.read_csv(file_path, dtype=str)
+            input_df = pd.read_csv(file_path)
             candidates = {}
             gt = {}
             count = 0
             if "GT_kg_id" in input_df.columns:
                 gt_column_id = "GT_kg_id"
             elif "kg_id" in input_df.columns:
                 gt_column_id = "kg_id"
             else:
-                raise ValueError("Can't find ground truth id column!")
+                raise ValueError("Can't find ground truth id column! It should be named either `GT_kg_id` or `kg_id`")
 
         for _, each in input_df.iterrows():
-            temp = str(each['candidates']).split("|")
+            if each['candidates'] is np.nan or math.isnan(each['candidates']):
+                temp = []
+            else:
+                temp = str(each['candidates']).split("|")
             to_remove_q = set()
             if each[gt_column_id] is np.nan:
-                self._logger.error("Ignore nan value form {}".format(str(each)))
+                self._logger.warning("Ignore NaN gt value from {}".format(str(each)))
                 each[gt_column_id] = ""
             gt_nodes = each[gt_column_id].split(" ")
             label = str(each["label"])
@@ -273,7 +324,7 @@ def read_input(self, file_path: str, skip_nodes_set: set=None,
                 count += len(temp)
                 self.gt_nodes.add(each[gt_column_id])
                 self.get_item_description(temp, target_properties, label)
-        
+
             self._logger.info("Totally {} rows with {} candidates loaded.".format(str(len(gt)), str(count)))
 
         elif input_format == "kgtk_format":
@@ -356,6 +407,8 @@ def get_real_label_name(self, node):
 
     def attribute_to_sentence(self, v, node_id = None):
         concated_sentence = ""
+        # sort the properties to ensure the sentence is always the same
+        v = {key: sorted(list(value)) for key, value in v.items() if len(value) > 0}
         if "label_properties" in v and len(v["label_properties"]) > 0:
             concated_sentence += self.get_real_label_name(v["label_properties"][0])
         if "description_properties" in v and len(v["description_properties"]) > 0:
@@ -397,11 +450,12 @@ def get_vetors(self, use_cache=True, vector_dump_file=None):
         jobs_count = 0
         counter = 0
         self._logger.info("Now generating embedding vector.")
-
         for q_node, each_item in tqdm(self.candidates.items()):
             # do process for each row(one target)
             sentence = each_item["sentence"]
+            if isinstance(sentence, bytes):
+                sentence = sentence.decode("utf-8")
-            vectors = self.get_sentences_embedding([sentence])
+            vectors = self.get_sentences_embedding([sentence], [q_node])
             self.vectors_map[q_node] = vectors[0]
 
         self._logger.info("Totally used {} seconds.".format(str(time.time() - start_all)))
@@ -492,11 +546,12 @@ def plot_result(self, use_cache=True, vector_dump_file=None,
         #     self.load_vectors(vector_dump_file, "2D")
         # else:
         vectors = list(self.vectors_map.values())
+        self.vectors_map = {k: v for k, v in sorted(self.vectors_map.items(), key=lambda item: item[0], reverse=True)}
        # use tsne to reduce dimension
         if run_TSNE:
             self._logger.warning("Start running TSNE to reduce dimension. 
It will take a long time.") start = time.time() - self.vectors_2D = TSNE(n_components=2).fit_transform(vectors) + self.vectors_2D = TSNE(n_components=2, random_state=0).fit_transform(vectors) # self.dump_vectors(vector_dump_file, "2D") self._logger.info("Totally used {} seconds.".format(time.time() - start)) @@ -506,10 +561,7 @@ def plot_result(self, use_cache=True, vector_dump_file=None, vector_map_keys = list(self.vectors_map.keys()) for each_node in self.gt_nodes: gt_indexes.add(vector_map_keys.index(each_node)) - # load the descriptions if we don't have them - # if len(self.q_nodes_descriptions) == 0: - # for each in self.candidates.values(): - # _ = self.get_item_description(each) + self.metadata.append("Q_nodes\tType\tLabel\tDescription") for i, each in enumerate(self.vectors_map.keys()): label = self.q_node_to_label[each] @@ -551,6 +603,7 @@ def evaluate_result(self): """ for the ground truth nodes, evaluate the average distance to the centroid, the lower the average distance, the better clustering results should be """ + import numpy as np centroid = None gt_nodes_vectors = [] if len(self.gt_indexes) == 0: @@ -559,19 +612,19 @@ def evaluate_result(self): points = self.gt_indexes for i, each in enumerate(self.vectors_map.keys()): # label = self.q_node_to_label[each] - # description = self.q_nodes_descriptions.get(each, "") + # description = self.qnodes_descriptions.get(each, "") if i in points: if centroid is None: - centroid = self.vectors_map[each] + centroid = np.array(self.vectors_map[each]) else: - centroid += self.vectors_map[each] + centroid += np.array(self.vectors_map[each]) gt_nodes_vectors.append(self.vectors_map[each]) centroid = centroid / len(points) distance_sum = 0 for each in gt_nodes_vectors: distance_sum += self.calculate_distance(each, centroid) - self._logger.warning("The average distance for the ground truth nodes to centroid is {}".format(distance_sum / len(points))) + self._logger.info("The average distance for the ground truth nodes to centroid is {}".format(distance_sum / len(points))) @staticmethod def calculate_distance(a, b): @@ -634,7 +687,7 @@ def load_black_list_files(file_path): import gzip import re token_patern = re.compile(r"(?u)\b\w\w+\b") - q_nodes_set = set() + qnodes_set = set() for each_file in file_path: try: # tar.gz file @@ -663,13 +716,13 @@ def load_black_list_files(file_path): each = each.replace("\n", "") for each_part in token_patern.findall(each): if each_part[0] == "Q" and each_part[1:].isnumeric(): - q_nodes_set.add(each_part) + qnodes_set.add(each_part) except Exception as e: _logger.error("Load black list file {} failed!".format(each_file)) _logger.debug(e, exc_info=True) - _logger.info("Totally {} black list nodes loadded.".format(len(q_nodes_set))) - return q_nodes_set + _logger.info("Totally {} black list nodes loadded.".format(len(qnodes_set))) + return qnodes_set def main(**kwargs): @@ -724,6 +777,7 @@ def main(**kwargs): input_uris = kwargs.get("input_uris", []) output_format = kwargs.get("output_format", "kgtk_format") property_labels_files = kwargs.get("property_labels_file_uri", "") + query_server = kwargs.get("query_server") properties = dict() all_property_relate_inputs = [kwargs.get("label_properties", ["label"]), kwargs.get("description_properties", ["description"]), @@ -732,7 +786,10 @@ def main(**kwargs): ] all_required_properties = ["label_properties", "description_properties", "isa_properties", "has_properties"] - + cache_config = {"use_cache": kwargs.get("use_cache", False), + "host": kwargs.get("cache_host", 
"dsbox01.isi.edu"), + "port": kwargs.get("cache_port", 6379) + } for each_property, each_input in zip(all_required_properties, all_property_relate_inputs): for each in each_input: properties[each] = each_property @@ -769,7 +826,7 @@ def main(**kwargs): for each_model_name in all_models_names: for each_input_file in input_uris: _logger.info("Running {} model on {}".format(each_model_name, each_input_file)) - process = EmbeddingVector(each_model_name) + process = EmbeddingVector(each_model_name, query_server=query_server, cache_config=cache_config) process.read_input(file_path=each_input_file, skip_nodes_set=black_list_set, input_format=input_format, target_properties=properties, property_labels_dict=property_labels_dict) @@ -849,6 +906,23 @@ def str2bool(v): parser.add_argument("--run-TSNE", type=str2bool, nargs='?', action='store', default=True, dest="run_TSNE", help="whether to run TSNE or not after the embedding, default is true.") + # cache config + parser.add_argument("--use-cache", type=str2bool, nargs='?', action='store', + default=False, dest="use_cache", + help="whether to use cache to get some embedding vectors quicker, default is False") + parser.add_argument("--cache-host", nargs='?', action='store', + default="dsbox01.isi.edu", dest="cache_host", + help="cache host address, default is `dsbox01.isi.edu`" + ) + parser.add_argument("--cache-port", nargs='?', action='store', + default="6379", dest="cache_port", + help="cache server port, default is `6379`" + ) + # query server + parser.add_argument("--query-server", nargs='?', action='store', + default="", dest="query_server", + help="cache host address, default is https://query.wikidata.org/sparql" + ) def run(**kwargs): diff --git a/kgtk/cli/text_embedding_README.md b/kgtk/cli/text_embedding_README.md index 36e834268..d9e694a17 100644 --- a/kgtk/cli/text_embedding_README.md +++ b/kgtk/cli/text_embedding_README.md @@ -151,6 +151,19 @@ Third column is the embeded vecotrs. This will have embedded vectors values after running TSNE and reduced dimension to 2-dimensions for each Q nodes. This is used for visulization. (for example, you can view it at Google's online tools here: http://projector.tensorflow.org/) 3. Metadata for the generated vectors: This will contains the metadata information for the Q nodes generated from 2 files mentioned above. It will contains the Q node value of each vector, the type (it is a `candidate` or a `ground truth` node), the given label of the Q node and corresponding fetched description information from wikidata. +#### Query / cache related +##### --query-server +You can change the query wikidata server address when the input format is `test_format`. The default is to use wikidata official query server, but it has limit on query time and frequency. Alternatively, you can choose to use dsbox02's one as `https://dsbox02.isi.edu:8888/bigdata/namespace/wdq/sparql` (vpn needed). + +##### --use-cache +If set to be true, the system will try to get the cached results for embedding computations. The default value is False, not to use cache. Basically the cache service is a Redis server. + +##### --cache-host +The host address for the Redis cache service. Default is `dsbox01.isi.edu` + +##### --cache-port +The host port for the Redis cache service. Default is `6379` + #### Usage of vector projector You can apply any of the tsv vector files along with the metadata file to display it on google's tools for further experiment. Step 1: Click the `Load` button on the left side of the web. 
diff --git a/kgtk/cli/text_embedding_requirement.txt b/kgtk/cli/text_embedding_requirement.txt
index 4247dfe7e..5783b492f 100644
--- a/kgtk/cli/text_embedding_requirement.txt
+++ b/kgtk/cli/text_embedding_requirement.txt
@@ -2,4 +2,5 @@ sentence-transformers
 sklearn
 matplotlib
 SPARQLWrapper
-torch
\ No newline at end of file
+torch
+redis
\ No newline at end of file

From 9d36b1b028575eaecd1736cddff5b4eeb32cd851 Mon Sep 17 00:00:00 2001
From: ckxz105
Date: Thu, 16 Apr 2020 23:46:56 -0700
Subject: [PATCH 002/278] bug fix on sentence embedding

---
 kgtk/cli/text_embedding.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/kgtk/cli/text_embedding.py b/kgtk/cli/text_embedding.py
index 2118da76d..94a0c3949 100644
--- a/kgtk/cli/text_embedding.py
+++ b/kgtk/cli/text_embedding.py
@@ -298,10 +298,11 @@ def read_input(self, file_path: str, skip_nodes_set: set=None,
                 raise ValueError("Can't find ground truth id column! It should be named either `GT_kg_id` or `kg_id`")
 
         for _, each in input_df.iterrows():
-            if each['candidates'] is np.nan or math.isnan(each['candidates']):
-                temp = []
-            else:
+            if isinstance(each["candidates"], str):
                 temp = str(each['candidates']).split("|")
+            elif each['candidates'] is np.nan or math.isnan(each['candidates']):
+                temp = []
+
             to_remove_q = set()
             if each[gt_column_id] is np.nan:
                 self._logger.warning("Ignore NaN gt value from {}".format(str(each)))

From 0ae780f900753246f2bb5852f2e6fb43fd7ea20a Mon Sep 17 00:00:00 2001
From: ckxz105
Date: Tue, 21 Apr 2020 15:03:15 -0700
Subject: [PATCH 003/278] bug fix on text embedding

---
 kgtk/cli/text_embedding.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kgtk/cli/text_embedding.py b/kgtk/cli/text_embedding.py
index 94a0c3949..f8c650a9b 100644
--- a/kgtk/cli/text_embedding.py
+++ b/kgtk/cli/text_embedding.py
@@ -546,8 +546,8 @@ def plot_result(self, use_cache=True, vector_dump_file=None,
         #     self._logger.info("Using cached 2D vector file!")
         #     self.load_vectors(vector_dump_file, "2D")
         # else:
-        vectors = list(self.vectors_map.values())
         self.vectors_map = {k: v for k, v in sorted(self.vectors_map.items(), key=lambda item: item[0], reverse=True)}
+        vectors = list(self.vectors_map.values())
         # use tsne to reduce dimension
         if run_TSNE:
             self._logger.warning("Start running TSNE to reduce dimension. 
It will take a long time.") From 8b35fb325e1bc1ea3c1ad87aba5a792486e810c1 Mon Sep 17 00:00:00 2001 From: ckxz105 Date: Tue, 21 Apr 2020 18:03:19 -0700 Subject: [PATCH 004/278] add model name for embedding cache key --- kgtk/cli/text_embedding.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/kgtk/cli/text_embedding.py b/kgtk/cli/text_embedding.py index f8c650a9b..05404a54d 100644 --- a/kgtk/cli/text_embedding.py +++ b/kgtk/cli/text_embedding.py @@ -29,7 +29,7 @@ def __init__(self, model_name=None, query_server=None, cache_config:dict={}): self._logger = logging.getLogger(__name__) from collections import defaultdict if model_name is None: - model_name = 'bert-base-nli-mean-tokens' + self.model_name = 'bert-base-nli-mean-tokens' # xlnet need to be trained before using, we can't use this for now # elif model_name == "xlnet-base-cased": # word_embedding_model = models.XLNet('xlnet-base-cased') @@ -41,7 +41,7 @@ def __init__(self, model_name=None, query_server=None, cache_config:dict={}): # self.model = SentenceTransformer(modules=[word_embedding_model, pooling_model]) else: self.model_name = model_name - self.model = SentenceTransformer(model_name) + self.model = SentenceTransformer(model_name) if query_server is None or query_server == "": self.wikidata_server = "https://query.wikidata.org/sparql" else: @@ -103,14 +103,17 @@ def get_sentences_embedding(self, sentences: typing.List[str], qnodes: typing.Li if self.redis_server is not None: sentence_embeddings = [] for each_node, each_sentence in zip(qnodes, sentences): - cache_res = self.redis_server.get(each_node+each_sentence) + query_cache_key = each_node + each_sentence + if self.model_name != "bert-base-wikipedia-sections-mean-tokens": + query_cache_key += self.model_name + cache_res = self.redis_server.get(query_cache_key) if cache_res is not None: sentence_embeddings.append(literal_eval(cache_res.decode("utf-8"))) # self._logger.error("{} hit!".format(each_node+each_sentence)) else: each_embedding = self.model.encode([each_sentence], show_progress_bar=False) sentence_embeddings.extend(each_embedding) - self.redis_server.set(each_node+each_sentence, str(each_embedding[0].tolist())) + self.redis_server.set(query_cache_key, str(each_embedding[0].tolist())) else: sentence_embeddings = self.model.encode(sentences, show_progress_bar=False) # self.embedding_cache[sentences] = sentence_embeddings From 79c53d383322a0f4eef257395fc505e7ff2254ab Mon Sep 17 00:00:00 2001 From: Divij Bhatia Date: Wed, 22 Apr 2020 23:00:04 -0700 Subject: [PATCH 005/278] added export_neo4j command --- README.md | 1 + kgtk/cli/export_neo4j.py | 257 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 258 insertions(+) create mode 100644 kgtk/cli/export_neo4j.py diff --git a/README.md b/README.md index a8a4dcb58..0f0d09dcc 100644 --- a/README.md +++ b/README.md @@ -35,6 +35,7 @@ https://www.mankier.com/1/mlr * `sort` * `merge_identical_nodes` * `zconcat` +* `export_neo4j` To get an information on how to use each of them, run: `kgtk [TOOL] -h` diff --git a/kgtk/cli/export_neo4j.py b/kgtk/cli/export_neo4j.py new file mode 100644 index 000000000..2b2d6826f --- /dev/null +++ b/kgtk/cli/export_neo4j.py @@ -0,0 +1,257 @@ +import csv +import re +from pathlib import Path + + +class Node: + def __init__(self): + self.properties = None + self.instance_type = [] + + def add_property(self, property_name: str, property_value: str) -> None: + if not self.properties: + self.properties = Properties() + self.properties.add_property(property_name, 
property_value) + + def add_instance_type(self, instance_type): + self.instance_type.append(instance_type) + + def serialize(self, node_id): + if self.properties: + serialized_properties = self.properties.serialize_node_properties() + else: + serialized_properties = "" + serialized_instance_type = ':'.join(self.instance_type) + serialized_node = "CREATE ({}:{} {})".format(node_id, serialized_instance_type, serialized_properties) + return serialized_node + + +class NodeStore: + def __init__(self): + self.directory = dict() + + def get_or_create(self, node_id: str) -> Node: + if node_id not in self.directory: + self.directory[node_id] = Node() + return self.directory[node_id] + + def serialize(self): + for node_id, node in self.directory.items(): + yield node.serialize(node_id) + + +class Relationship: + def __init__(self): + self.name = None + self.src_node = None + self.dst_node = None + self.properties = None + + def add_names(self, name, src_node_id, dst_node_id): + self.name = name + self.src_node = src_node_id + self.dst_node = dst_node_id + + def add_property(self, property_name, property_value): + if not self.properties: + self.properties = Properties() + self.properties.add_property(property_name, property_value) + + def serialize(self): + if self.properties: + serialized_properties = self.properties.serialize_qualifier_properties() + else: + serialized_properties = "" + serialized_relationship = "({})-[:{} {}]->({}),".format(self.src_node, self.name, serialized_properties, self.dst_node) + return serialized_relationship + + +class RelationshipStore: + def __init__(self): + self.directory = dict() + + def get(self, relationship_id) -> Relationship: + return self.directory.get(relationship_id, None) + + def get_or_create(self, relationship_id: str) -> Relationship: + if relationship_id not in self.directory: + self.directory[relationship_id] = Relationship() + return self.directory[relationship_id] + + def serialize(self): + yield "CREATE" + total_relationships = len(self.directory) + i = 0 + for relationship_id, relationship in self.directory.items(): + if i >= total_relationships - 1: + yield relationship.serialize().rstrip(",") + else: + yield relationship.serialize() + i += 1 + + +class Properties: + def __init__(self): + self.property_map = dict() + + def add_property(self, property_name: str, property_value: str) -> None: + if property_name not in self.property_map: + self.property_map[property_name] = list() + self.property_map[property_name].append(property_value) + + def serialize_node_properties(self): + serialized_properties = """{""" + for property_name, property_value in self.property_map.items(): + property_name = clean_string(property_name) + property_value = [clean_string(v) for v in property_value] + if is_property(property_name): + serialized_property = "{}: ".format(property_name) + else: + serialized_property = "\"{}\": ".format(property_name) + if len(property_value) == 1: + serialized_property += "\"{}\"".format(str(property_value[0])) + else: + serialized_property += "[" + for value in property_value: + serialized_property += "\"{}\", ".format(str(value)) + serialized_property = serialized_property[:-2] + "]" + serialized_properties += serialized_property + ", " + serialized_properties = serialized_properties[:-2] + "}" + return serialized_properties + + def serialize_qualifier_properties(self): + serialized_properties = ["{"] + for property_name, property_value in self.property_map.items(): + property_name = clean_string(property_name) + property_value = 
[clean_string(v) for v in property_value] + if is_property(property_name): + serialized_properties.append("{}: ".format(property_name)) + else: + serialized_properties.append("\"{}\": ".format(property_name)) + serialized_properties.append("[") + for value in property_value: + if is_item(value) or is_property(value): + serialized_properties.append("{}, ".format(str(value))) + else: + serialized_properties.append("\"{}\", ".format(str(value))) + serialized_properties[-1] = serialized_properties[-1][:-2] + serialized_properties.append("], ") + serialized_properties[-1] = serialized_properties[-1][:-2] + serialized_properties_as_string = ''.join(serialized_properties) + return serialized_properties_as_string + + +class Graph: + def __init__(self): + self.node_store = NodeStore() + self.relationship_store = RelationshipStore() + + def serialize(self, output_directory): + file_name = str(Path(output_directory) / "results.cql") + with open(file_name, 'w', encoding='utf8') as output_file: + for node in self.node_store.serialize(): + output_file.write(node) + output_file.write("\n") + is_first_relationship = True + for relationship in self.relationship_store.serialize(): + if is_first_relationship: + is_first_relationship = False + else: + output_file.write("\t") + output_file.write(relationship) + output_file.write("\n") + + +def is_item(string: str) -> bool: + item_pattern = "^Q[0-9]+$" + match = re.match(item_pattern, string) + if match: + return True + else: + return False + + +def is_property(string: str) -> bool: + property_pattern = "^P[0-9]+$" + match = re.match(property_pattern, string) + if match: + return True + else: + return False + + +def clean_string(string): + return string.strip("\"") + + +def create_graph(statement_file_name: str, qualifier_file_name: str, statement_file_encoding: str, qualifier_file_encoding: str): + # required tsv format headers for statements(unordered): + # id, node1, property, node2 + # or + # node1, property, node2, id, node1_label, node2_label, property_label + # required tsv format headers for qualifiers(unordered): + # node1, property, node2, id + if not statement_file_encoding: + statement_file_encoding = "utf8" + if not qualifier_file_encoding: + qualifier_file_encoding = "utf8" + + graph = Graph() + node_store = graph.node_store + relationship_store = graph.relationship_store + if statement_file_name: + with open(statement_file_name, 'r', encoding=statement_file_encoding) as input_file: + statements = csv.DictReader(input_file, dialect='excel-tab', restval="") + for statement in statements: + src_node = node_store.get_or_create(statement['node1']) + if is_item(statement['node2']): + if statement['property'] == 'P31': + src_node.add_instance_type(statement['node2']) + else: + dst_node = node_store.get_or_create(statement['node2']) + if 'node2_label' in statement and statement['node2_label']: + dst_node.add_property("label", statement["node2_label"]) + relationship = relationship_store.get_or_create(statement['id']) + relationship.add_names(statement['property'], statement['node1'], statement['node2']) + if 'property_label' in statement and statement['property_label']: + relationship.add_property("label", statement["property_label"]) + else: + src_node.add_property(statement['property'], statement['node2']) + if 'node1_label' in statement and statement['node1_label']: + src_node.add_property("label", statement["node1_label"]) + + if qualifier_file_name: + with open(qualifier_file_name, 'r', encoding=qualifier_file_encoding) as input_file: + qualifiers = 
csv.DictReader(input_file, dialect='excel-tab', restval="") + for qualifier in qualifiers: + relationship = relationship_store.get(qualifier['node1']) + if relationship: + relationship.add_property(qualifier['property'], qualifier['node2']) + return graph + + +def parser(): + return {'help': 'Exports data to Neo4J Cypher Query Language statements.'} + + +def add_arguments(parser): + """ + Parse arguments + Args: + parser (argparse.ArgumentParser) + """ + parser.add_argument('-sf', "--statement_file_path", action="store", type=str, dest="statement_file_path", help="Filepath of the statement file", default="") + parser.add_argument('-qf', '--qualifier_file_path', type=str, dest="qualifier_file_path", help="Filepath of the qualifier file", default="") + parser.add_argument('-o', '--output_directory', action="store", type=str, dest='output_directory', help="Directory where the result file will be saved", default="") + parser.add_argument('-se', '--statement_file_encoding', type=str, dest='statement_file_encoding', help="Encoding of the statement file, eg.: utf8", default="") + parser.add_argument('-qe', '--qualifier_file_encoding', type=str, dest='qualifier_file_encoding', help="Encoding of the qualifier file, eg.: utf8", default="") + + +def run(statement_file_path: str, qualifier_file_path: str, output_directory: str, statement_file_encoding: str, qualifier_file_encoding: str): + try: + graph = create_graph(statement_file_path, qualifier_file_path, statement_file_encoding, qualifier_file_encoding) + graph.serialize(output_directory) + except FileNotFoundError as exception: + raise exception + except Exception as ex: + raise ex From d6cf315dd6c90192b00006fb618513610c742b87 Mon Sep 17 00:00:00 2001 From: Divij Bhatia Date: Thu, 23 Apr 2020 23:03:30 -0700 Subject: [PATCH 006/278] fixed bugs --- kgtk/cli/export_neo4j.py | 48 ++++++++++++++++++++++++++-------------- 1 file changed, 32 insertions(+), 16 deletions(-) diff --git a/kgtk/cli/export_neo4j.py b/kgtk/cli/export_neo4j.py index 2b2d6826f..4cc951ad1 100644 --- a/kgtk/cli/export_neo4j.py +++ b/kgtk/cli/export_neo4j.py @@ -22,7 +22,10 @@ def serialize(self, node_id): else: serialized_properties = "" serialized_instance_type = ':'.join(self.instance_type) - serialized_node = "CREATE ({}:{} {})".format(node_id, serialized_instance_type, serialized_properties) + if self.instance_type: + serialized_node = "CREATE ({}:{} {})".format(node_id, serialized_instance_type, serialized_properties) + else: + serialized_node = "CREATE ({} {})".format(node_id, serialized_properties) return serialized_node @@ -107,7 +110,7 @@ def serialize_node_properties(self): if is_property(property_name): serialized_property = "{}: ".format(property_name) else: - serialized_property = "\"{}\": ".format(property_name) + serialized_property = "{}: ".format(property_name) if len(property_value) == 1: serialized_property += "\"{}\"".format(str(property_value[0])) else: @@ -127,7 +130,7 @@ def serialize_qualifier_properties(self): if is_property(property_name): serialized_properties.append("{}: ".format(property_name)) else: - serialized_properties.append("\"{}\": ".format(property_name)) + serialized_properties.append("{}: ".format(property_name)) serialized_properties.append("[") for value in property_value: if is_item(value) or is_property(value): @@ -181,7 +184,20 @@ def is_property(string: str) -> bool: def clean_string(string): - return string.strip("\"") + string = string.strip("\"") + string = string.replace("\"", "'") + return string + + +def clean_label(label): + 
cleaned_label_list = [""] * len(label) + for index, char in enumerate(label): + if char.isalnum(): + cleaned_label_list[index] = label[index] + else: + cleaned_label_list[index] = "_" + cleaned_label = ''.join(cleaned_label_list) + return cleaned_label def create_graph(statement_file_name: str, qualifier_file_name: str, statement_file_encoding: str, qualifier_file_encoding: str): @@ -192,9 +208,9 @@ def create_graph(statement_file_name: str, qualifier_file_name: str, statement_f # required tsv format headers for qualifiers(unordered): # node1, property, node2, id if not statement_file_encoding: - statement_file_encoding = "utf8" + statement_file_encoding = "UTF-8" if not qualifier_file_encoding: - qualifier_file_encoding = "utf8" + qualifier_file_encoding = "UTF-8" graph = Graph() node_store = graph.node_store @@ -203,20 +219,20 @@ def create_graph(statement_file_name: str, qualifier_file_name: str, statement_f with open(statement_file_name, 'r', encoding=statement_file_encoding) as input_file: statements = csv.DictReader(input_file, dialect='excel-tab', restval="") for statement in statements: - src_node = node_store.get_or_create(statement['node1']) + src_node = node_store.get_or_create(clean_label(statement['node1'])) if is_item(statement['node2']): if statement['property'] == 'P31': - src_node.add_instance_type(statement['node2']) + src_node.add_instance_type(clean_label(statement['node2'])) else: - dst_node = node_store.get_or_create(statement['node2']) + dst_node = node_store.get_or_create(clean_label(statement['node2'])) if 'node2_label' in statement and statement['node2_label']: dst_node.add_property("label", statement["node2_label"]) - relationship = relationship_store.get_or_create(statement['id']) - relationship.add_names(statement['property'], statement['node1'], statement['node2']) + relationship = relationship_store.get_or_create(clean_label(statement['id'])) + relationship.add_names(clean_label(statement['property']), clean_label(statement['node1']), clean_label(statement['node2'])) if 'property_label' in statement and statement['property_label']: relationship.add_property("label", statement["property_label"]) else: - src_node.add_property(statement['property'], statement['node2']) + src_node.add_property(clean_label(statement['property']), statement['node2']) if 'node1_label' in statement and statement['node1_label']: src_node.add_property("label", statement["node1_label"]) @@ -224,9 +240,9 @@ def create_graph(statement_file_name: str, qualifier_file_name: str, statement_f with open(qualifier_file_name, 'r', encoding=qualifier_file_encoding) as input_file: qualifiers = csv.DictReader(input_file, dialect='excel-tab', restval="") for qualifier in qualifiers: - relationship = relationship_store.get(qualifier['node1']) + relationship = relationship_store.get(clean_label(qualifier['node1'])) if relationship: - relationship.add_property(qualifier['property'], qualifier['node2']) + relationship.add_property(clean_label(qualifier['property']), qualifier['node2']) return graph @@ -243,8 +259,8 @@ def add_arguments(parser): parser.add_argument('-sf', "--statement_file_path", action="store", type=str, dest="statement_file_path", help="Filepath of the statement file", default="") parser.add_argument('-qf', '--qualifier_file_path', type=str, dest="qualifier_file_path", help="Filepath of the qualifier file", default="") parser.add_argument('-o', '--output_directory', action="store", type=str, dest='output_directory', help="Directory where the result file will be saved", default="") - 
parser.add_argument('-se', '--statement_file_encoding', type=str, dest='statement_file_encoding', help="Encoding of the statement file, eg.: utf8", default="") - parser.add_argument('-qe', '--qualifier_file_encoding', type=str, dest='qualifier_file_encoding', help="Encoding of the qualifier file, eg.: utf8", default="") + parser.add_argument('-se', '--statement_file_encoding', type=str, dest='statement_file_encoding', help="Encoding of the statement file, eg.: UTF-8", default="") + parser.add_argument('-qe', '--qualifier_file_encoding', type=str, dest='qualifier_file_encoding', help="Encoding of the qualifier file, eg.: UTF-8", default="") def run(statement_file_path: str, qualifier_file_path: str, output_directory: str, statement_file_encoding: str, qualifier_file_encoding: str): From 8886320a3f2c0f7272b194ca4439d5c02e0ef19b Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Mon, 27 Apr 2020 17:30:38 -0700 Subject: [PATCH 007/278] Better handling of feedback messages. --- kgtk/cli/clean_data.py | 34 +++++++++++++++++++--------------- kgtk/cli/validate.py | 14 ++++++++------ 2 files changed, 27 insertions(+), 21 deletions(-) diff --git a/kgtk/cli/clean_data.py b/kgtk/cli/clean_data.py index 9a89ff97d..5fdb9dcd4 100644 --- a/kgtk/cli/clean_data.py +++ b/kgtk/cli/clean_data.py @@ -57,8 +57,11 @@ def add_arguments(parser): help="The action to take when an empty line is detected.", type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXCLUDE) - parser.add_argument( "--errors-to-stdout", dest="errors_to_stdout", - help="Send errors to stdout instead of stderr", action="store_true") + errors_to = parser.add_mutually_exclusive_group() + errors_to.add_argument( "--errors-to-stdout", dest="errors_to_stdout", + help="Send errors to stdout instead of stderr (default)", action="store_true") + errors_to.add_argument( "--errors-to-stderr", dest="errors_to_stderr", + help="Send errors to stderr instead of stdout", action="store_true") parser.add_argument( "--error-limit", dest="error_limit", help="The maximum number of errors to report before failing", type=int, default=KgtkReader.ERROR_LIMIT_DEFAULT) @@ -141,19 +144,20 @@ def run(input_file: typing.Optional[Path], # import modules locally from kgtk.exceptions import KGTKException - try: - if verbose: - if input_file is not None: - print("Cleaning data from '%s'" % str(input_file), file=sys.stderr) - else: - print ("Cleaning data from stdin", file=sys.stderr) - if output_file is not None: - print("Writing data to '%s'" % str(output_file), file=sys.stderr) - else: - print ("Writing data to stdin", file=sys.stderr) + # Select where to send error messages, defaulting to stderr. 
+ error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr + + if verbose: + if input_file is not None: + print("Cleaning data from '%s'" % str(input_file), file=error_file) + else: + print ("Cleaning data from stdin", file=error_file) + if output_file is not None: + print("Writing data to '%s'" % str(output_file), file=error_file) + else: + print ("Writing data to stdin", file=error_file) - error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr - + try: kr: KgtkReader = KgtkReader.open(input_file, force_column_names=force_column_names, skip_first_record=skip_first_record, @@ -195,7 +199,7 @@ def run(input_file: typing.Optional[Path], kw.close() if verbose: - print("Copied %d clean data lines" % line_count, file=sys.stderr) + print("Copied %d clean data lines" % line_count, file=error_file) return 0 except Exception as e: diff --git a/kgtk/cli/validate.py b/kgtk/cli/validate.py index e027ef03e..ae480b223 100644 --- a/kgtk/cli/validate.py +++ b/kgtk/cli/validate.py @@ -148,16 +148,18 @@ def run(kgtk_files: typing.Optional[typing.List[typing.Optional[Path]]], if kgtk_files is None or len(kgtk_files) == 0: kgtk_files = [ None ] + # Select where to send error messages, defaulting to stderr. + error_file: typing.TextIO = sys.stderr if errors_to_stderr else sys.stdout + try: kgtk_file: typing.Optional[Path] for kgtk_file in kgtk_files: if verbose: + print("\n====================================================") if kgtk_file is not None: - print("Validating '%s'" % str(kgtk_file)) + print("Validating '%s'" % str(kgtk_file), file=error_file) else: - print ("Validating from stdin") - - error_file: typing.TextIO = sys.stderr if errors_to_stderr else sys.stdout + print ("Validating from stdin", file=error_file) kr: KgtkReader = KgtkReader.open(kgtk_file, force_column_names=force_column_names, @@ -186,14 +188,14 @@ def run(kgtk_files: typing.Optional[typing.List[typing.Optional[Path]]], if header_only: kr.close() if verbose: - print("Validated the header only.") + print("Validated the header only.", file=error_file) else: line_count: int = 0 row: typing.List[str] for row in kr: line_count += 1 if verbose: - print("Validated %d data lines" % line_count) + print("Validated %d data lines" % line_count, file=error_file) return 0 except SystemExit as e: From 753365dd85585275c603e71e781ddf1f105c4ca5 Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Mon, 27 Apr 2020 17:46:58 -0700 Subject: [PATCH 008/278] Add attrs to the list of required modules. 
--- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index d25b0a5ee..c467a6c44 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,3 +11,4 @@ tqdm etk simplejson pyrallel.lib +attrs From 04c2300b546e1af87a2a63c5fa5386dd19f88094 Mon Sep 17 00:00:00 2001 From: Rongpeng Date: Mon, 27 Apr 2020 19:14:24 -0700 Subject: [PATCH 009/278] read line by line, reproduced the missing prefix bugs --- kgtk/cli/generate_wikidata_triples.py | 12 +++++++++--- requirements.txt | 3 ++- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/kgtk/cli/generate_wikidata_triples.py b/kgtk/cli/generate_wikidata_triples.py index 573bcf6ed..b2eab47c0 100644 --- a/kgtk/cli/generate_wikidata_triples.py +++ b/kgtk/cli/generate_wikidata_triples.py @@ -535,9 +535,15 @@ def replaceIllegalString(s:str)->str: truthy=truthy ) # process stdin - for num, edge in enumerate(sys.stdin.readlines()): - if edge.startswith("#") or num == 0: # TODO First line omit + num_line = 0 + while True: + edge = sys.stdin.readline() + if not edge: + break + if edge.startswith("#") or num_line == 0: # TODO First line omit + num_line += 1 continue else: - generator.entryPoint(num, edge) + generator.entryPoint(num_line, edge) + num_line += 1 generator.finalize() diff --git a/requirements.txt b/requirements.txt index a6817e3d1..7cb0d1c7f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,6 +7,7 @@ sh sklearn SPARQLWrapper tqdm -etk +rdflib==5.0.0 +etk==2.2.1 simplejson pyrallel.lib From 0027ae478c31b19c2f25739ac43385d843ced422 Mon Sep 17 00:00:00 2001 From: Rongpeng Date: Mon, 27 Apr 2020 19:28:20 -0700 Subject: [PATCH 010/278] separate the cli file and the class file --- kgtk/cli/generate_wikidata_triples.py | 419 +------------------------- kgtk/triple_generator.py | 419 ++++++++++++++++++++++++++ 2 files changed, 420 insertions(+), 418 deletions(-) create mode 100644 kgtk/triple_generator.py diff --git a/kgtk/cli/generate_wikidata_triples.py b/kgtk/cli/generate_wikidata_triples.py index b2eab47c0..880f92e3b 100644 --- a/kgtk/cli/generate_wikidata_triples.py +++ b/kgtk/cli/generate_wikidata_triples.py @@ -106,425 +106,8 @@ def run( # logging_level:str ): # import modules locally + from kgtk.triple_generator import TripleGenerator import sys - import warnings - import re - import requests - from typing import TextIO - import logging - from etk.wikidata.entity import WDItem, WDProperty - from kgtk.exceptions import KGTKException - - class TripleGenerator: - """ - A class to maintain the status of the generator - """ - def __init__( - self, - propFile: str, - labelSet: str, - aliasSet: str, - descriptionSet: str, - ignore: bool, - n: int, - destFp: TextIO = sys.stdout, - truthy:bool =False - ): - from etk.wikidata.statement import Rank - from etk.etk import ETK - from etk.knowledge_graph import KGSchema - from etk.etk_module import ETKModule - self.ignore = ignore - self.propTypes = self.__setPropTypes(propFile) - self.labelSet, self.aliasSet, self.descriptionSet = self.__setSets( - labelSet, aliasSet, descriptionSet - ) - self.fp = destFp - self.n = int(n) - self.read = 0 - # ignore-logging, if not ignore, log them and move on. 
- if not self.ignore: - self.ignoreFile = open("ignored.log","w") - # corrupted statement id - self.corrupted_statement_id = None - # serialize prfix - kg_schema = KGSchema() - kg_schema.add_schema("@prefix : .", "ttl") - self.etk = ETK(kg_schema=kg_schema, modules=ETKModule) - self.doc = self.__setDoc() - self.__serialize_prefix() - - def _node_2_entity(self, node:str): - ''' - A node can be Qxxx or Pxxx, return the proper entity. - ''' - if node in self.propTypes: - entity = WDProperty(node, self.propTypes[node]) - else: - entity = WDItem(TripleGenerator.replaceIllegalString(node.upper())) - return entity - - - def __setPropTypes(self, propFile: str): - from etk.wikidata.value import ( - Item, - StringValue, - TimeValue, - QuantityValue, - MonolingualText, - GlobeCoordinate, - ExternalIdentifier, - URLValue - ) - dataTypeMappings = { - "item": Item, - "time": TimeValue, - "globe-coordinate": GlobeCoordinate, - "quantity": QuantityValue, - "monolingualtext": MonolingualText, - "string": StringValue, - "external-identifier":ExternalIdentifier, - "url":URLValue - } - with open(propFile, "r") as fp: - props = fp.readlines() - __propTypes = {} - for line in props[1:]: - node1, _, node2 = line.split("\t") - try: - __propTypes[node1] = dataTypeMappings[node2.strip()] - except: - if not self.ignore: - raise KGTKException( - "DataType {} of node {} is not supported.\n".format( - node2, node1 - ) - ) - return __propTypes - - def __setSets(self, labelSet: str, aliasSet: str, descriptionSet: str): - return ( - set(labelSet.split(",")), - set(aliasSet.split(",")), - set(descriptionSet.split(",")), - ) - - def __setDoc(self, doc_id: str = "http://isi.edu/default-ns/projects"): - """ - reset the doc object and return it. Called at initialization and after outputting triples. 
- """ - doc = self.etk.create_document({}, doc_id=doc_id) - # bind prefixes - doc.kg.bind("wikibase", "http://wikiba.se/ontology#") - doc.kg.bind("wd", "http://www.wikidata.org/entity/") - doc.kg.bind("wdt", "http://www.wikidata.org/prop/direct/") - doc.kg.bind("wdtn", "http://www.wikidata.org/prop/direct-normalized/") - doc.kg.bind("wdno", "http://www.wikidata.org/prop/novalue/") - doc.kg.bind("wds", "http://www.wikidata.org/entity/statement/") - doc.kg.bind("wdv", "http://www.wikidata.org/value/") - doc.kg.bind("wdref", "http://www.wikidata.org/reference/") - doc.kg.bind("p", "http://www.wikidata.org/prop/") - doc.kg.bind("pr", "http://www.wikidata.org/prop/reference/") - doc.kg.bind("prv", "http://www.wikidata.org/prop/reference/value/") - doc.kg.bind( - "prn", "http://www.wikidata.org/prop/reference/value-normalized/" - ) - doc.kg.bind("ps", "http://www.wikidata.org/prop/statement/") - doc.kg.bind("psv", "http://www.wikidata.org/prop/statement/value/") - doc.kg.bind( - "psn", "http://www.wikidata.org/prop/statement/value-normalized/" - ) - doc.kg.bind("pq", "http://www.wikidata.org/prop/qualifier/") - doc.kg.bind("pqv", "http://www.wikidata.org/prop/qualifier/value/") - doc.kg.bind( - "pqn", "http://www.wikidata.org/prop/qualifier/value-normalized/" - ) - doc.kg.bind("skos", "http://www.w3.org/2004/02/skos/core#") - doc.kg.bind("prov", "http://www.w3.org/ns/prov#") - doc.kg.bind("schema", "http://schema.org/") - return doc - - @staticmethod - def _process_text_string(string:str)->[str,str]: - ''' - ''' - if "@" in string: - res = string.split("@") - textString = "@".join(res[:-1]).replace('"', "").replace("'", "") - lang = res[-1].replace('"','').replace("'","") - if len(lang) != 2: - lang = "en" - else: - textString = string.replace('"', "").replace("'", "") - lang = "en" - return [textString, lang] - - def genLabelTriple(self, node1: str, label: str, node2: str) -> bool: - entity = self._node_2_entity(node1) - textString, lang = TripleGenerator._process_text_string(node2) - entity.add_label(textString, lang=lang) - self.doc.kg.add_subject(entity) - return True - - def genDescriptionTriple(self, node1: str, label: str, node2: str) -> bool: - entity = self._node_2_entity(node1) - textString, lang = TripleGenerator._process_text_string(node2) - entity.add_description(textString, lang=lang) - self.doc.kg.add_subject(entity) - return True - - def genDescriptionTriple(self, node1: str, label: str, node2: str) -> bool: - entity = self._node_2_entity(node1) - textString, lang = TripleGenerator._process_text_string(node2) - entity.add_description(textString, lang=lang) - self.doc.kg.add_subject(entity) - return True - - def genAliasTriple(self, node1: str, label: str, node2: str) -> bool: - entity = self._node_2_entity(node1) - textString, lang = TripleGenerator._process_text_string(node2) - entity.add_alias(textString, lang=lang) - self.doc.kg.add_subject(entity) - return True - - def genPropDeclarationTriple(self, node1: str, label: str, node2: str) -> bool: - prop = WDProperty(node1, self.propTypes[node1]) - self.doc.kg.add_subject(prop) - return True - - def genNormalTriple( - self, node1: str, label: str, node2: str, isQualifierEdge: bool) -> bool: - from etk.wikidata.value import ( - Item, - StringValue, - TimeValue, - QuantityValue, - MonolingualText, - GlobeCoordinate, - ExternalIdentifier, - URLValue, - Precision - ) - - entity = self._node_2_entity(node1) - # determine the edge type - edgeType = self.propTypes[label] - if edgeType == Item: - OBJECT = 
WDItem(TripleGenerator.replaceIllegalString(node2.upper())) - elif edgeType == TimeValue: - # https://www.wikidata.org/wiki/Help:Dates - # ^2013-01-01T00:00:00Z/11 - # ^8000000-00-00T00:00:00Z/3 - if re.compile("[0-9]{4}").match(node2): - try: - dateTimeString = node2 + "-01-01" - OBJECT = TimeValue( - value=dateTimeString, #TODO - calendar=Item("Q1985727"), - precision=Precision.year, - time_zone=0, - ) - except: - return False - else: - try: - dateTimeString, precision = node2[1:].split("/") - dateTimeString = dateTimeString[:-1] # remove "Z" - # 2016-00-00T00:00:00 case - if "-00-00" in dateTimeString: - dateTimeString = "-01-01".join(dateTimeString.split("-00-00")) - elif dateTimeString[8:10] == "00": - dateTimeString = dateTimeString[:8]+"01" + dateTimeString[10:] - OBJECT = TimeValue( - value=dateTimeString, - calendar=Item("Q1985727"), - precision=precision, - time_zone=0, - ) - except: - return False - - #TODO other than that, not supported. Creation of normal triple fails - - - elif edgeType == GlobeCoordinate: - latitude, longitude = node2[1:].split("/") - OBJECT = GlobeCoordinate( - latitude, longitude, 0.0001, globe=StringValue("Earth") - ) - - elif edgeType == QuantityValue: - # +70[+60,+80]Q743895 - res = re.compile("([\+|\-]?[0-9]+\.?[0-9]*)(?:\[([\+|\-]?[0-9]+\.?[0-9]*),([\+|\-]?[0-9]+\.?[0-9]*)\])?([U|Q](?:[0-9]+))?").match(node2).groups() - amount, lower_bound, upper_bound, unit = res - - # Handle extra small numbers for now. TODO - if TripleGenerator._is_invalid_decimal_string(amount) or TripleGenerator._is_invalid_decimal_string(lower_bound) or TripleGenerator._is_invalid_decimal_string(upper_bound): - return False - amount = TripleGenerator._clean_number_string(amount) - lower_bound = TripleGenerator._clean_number_string(lower_bound) - upper_bound = TripleGenerator._clean_number_string(upper_bound) - if unit != None: - if upper_bound != None and lower_bound != None: - OBJECT = QuantityValue(amount, unit=Item(unit),upper_bound=upper_bound,lower_bound=lower_bound) - else: - OBJECT = QuantityValue(amount, unit=Item(unit)) - else: - if upper_bound != None and lower_bound != None: - OBJECT = QuantityValue(amount, upper_bound=upper_bound,lower_bound=lower_bound) - else: - OBJECT = QuantityValue(amount) - elif edgeType == MonolingualText: - textString, lang = TripleGenerator._process_text_string(node2) - OBJECT = MonolingualText(textString, lang) - elif edgeType == ExternalIdentifier: - OBJECT = ExternalIdentifier(node2) - elif edge == URLValue: - OBJECT = URLValue(node2) - else: - # treat everything else as stringValue - OBJECT = StringValue(node2) - if isQualifierEdge: - # edge: e8 p9 ^2013-01-01T00:00:00Z/11 - # create qualifier edge on previous STATEMENT and return the updated STATEMENT - if type(OBJECT) == WDItem: - self.doc.kg.add_subject(OBJECT) - self.STATEMENT.add_qualifier(label.upper(), OBJECT) - self.doc.kg.add_subject(self.STATEMENT) #TODO maybe can be positioned better for the edge cases. 
- - else: - # edge: q1 p8 q2 e8 - # create brand new property edge and replace STATEMENT - if type(OBJECT) == WDItem: - self.doc.kg.add_subject(OBJECT) - if truthy: - self.STATEMENT = entity.add_truthy_statement(label.upper(), OBJECT) - else: - self.STATEMENT = entity.add_statement(label.upper(), OBJECT) - self.doc.kg.add_subject(entity) - return True - - @staticmethod - def _is_invalid_decimal_string(num_string): - ''' - if a decimal string too small, return True TODO - ''' - if num_string == None: - return False - else: - if abs(float(num_string)) < 0.0001 and float(num_string) != 0: - return True - return False - - @staticmethod - def _clean_number_string(num): - from numpy import format_float_positional - if num == None: - return None - else: - return format_float_positional(float(num),trim="-") - - def entryPoint(self, line_number:int , edge: str): - """ - generates a list of two, the first element is the determination of the edge type using corresponding edge type - the second element is a bool indicating whether this is a valid property edge or qualifier edge. - Call corresponding downstream functions - """ - edgeList = edge.strip().split("\t") - l = len(edgeList) - if l!=4: - return - - [node1, label, node2, eID] = edgeList - node1, label, node2, eID = node1.strip(),label.strip(),node2.strip(),eID.strip() - if line_number == 0: #TODO ignore header mode - # by default a statement edge - isQualifierEdge = False - # print("#Debug Info: ",line_number, self.ID, eID, isQualifierEdge,self.STATEMENT) - self.ID = eID - self.corrupted_statement_id = None - else: - if node1 != self.ID: - # also a new statement edge - if self.read >= self.n: - self.serialize() - isQualifierEdge = False - # print("#Debug Info: ",line_number, self.ID, node1, isQualifierEdge,self.STATEMENT) - self.ID= eID - self.corrupted_statement_id = None - else: - # qualifier edge or property declaration edge - isQualifierEdge = True - if self.corrupted_statement_id == eID: - # Met a qualifier which associates with a corrupted statement - return - if label != "type" and node1 != self.ID: - # 1. not a property declaration edge and - # 2. the current qualifier's node1 is not the latest property edge id, throw errors. - if not self.ignore: - raise KGTKException( - "Node1 {} at line {} doesn't agree with latest property edge id {}.\n".format( - node1, line_number, self.ID - ) - ) - if label in self.labelSet: - success = self.genLabelTriple(node1, label, node2) - elif label in self.descriptionSet: - success= self.genDescriptionTriple(node1, label, node2) - elif label in self.aliasSet: - success = self.genAliasTriple(node1, label, node2) - elif label == "type": - # special edge of prop declaration - success = self.genPropDeclarationTriple(node1, label, node2) - else: - if label in self.propTypes: - success= self.genNormalTriple(node1, label, node2, isQualifierEdge) - else: - if not self.ignore: - raise KGTKException( - "property {}'s type is unknown at line {}.\n".format(label, line_number) - ) - success = False - if (not success) and (not isQualifierEdge) and (not self.ignore): - # We have a corrupted edge here. - self.ignoreFile.write("Corrupted statement at line number: {} with id {} with current corrupted id {}\n".format(line_number, eID, self.corrupted_statement_id)) - self.ignoreFile.flush() - self.corrupted_statement_id = eID - else: - self.read += 1 - self.corrupted_statement_id = None - - def serialize(self): - """ - Seriealize the triples. Used a hack to avoid serializing the prefix again. 
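The truthy flag above decides which of two RDF shapes a statement takes. As an illustration (hand-written here, not output of this exact code), the Wikidata RDF model distinguishes them roughly like this:

    # Truthy: a single direct wdt: triple; no statement node, so no qualifiers.
    truthy_form = "wd:Q42 wdt:P69 wd:Q691283 ."

    # Full: an intermediate wds: statement node that qualifier edges attach to.
    full_form = [
        "wd:Q42 p:P69 wds:Q42-P69-1 .",
        "wds:Q42-P69-1 ps:P69 wd:Q691283 .",
        "wds:Q42-P69-1 pq:P580 '1979-01-01T00:00:00Z' .",  # qualifier, simplified
    ]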
- """ - docs = self.etk.process_ems(self.doc) - self.fp.write("\n\n".join(docs[0].kg.serialize("ttl").split("\n\n")[1:])) - self.fp.flush() - self.__reset() - - def __serialize_prefix(self): - """ - This function should be called only once after the doc object is initialized. - """ - docs = self.etk.process_ems(self.doc) - self.fp.write(docs[0].kg.serialize("ttl").split("\n\n")[0] + "\n\n") - self.fp.flush() - self.__reset() - - def __reset(self): - self.ID = None - self.STATEMENT = None - self.read = 0 - self.doc = self.__setDoc() - - def finalize(self): - self.serialize() - - @staticmethod - def replaceIllegalString(s:str)->str: - return s.replace(":","-") - generator = TripleGenerator( propFile=propFile, labelSet=labels, diff --git a/kgtk/triple_generator.py b/kgtk/triple_generator.py new file mode 100644 index 000000000..9d86ae35d --- /dev/null +++ b/kgtk/triple_generator.py @@ -0,0 +1,419 @@ +import sys +import re +from typing import TextIO +from kgtk.exceptions import KGTKException +from etk.wikidata.entity import WDItem, WDProperty + +class TripleGenerator: + """ + A class to maintain the status of the generator + """ + def __init__( + self, + propFile: str, + labelSet: str, + aliasSet: str, + descriptionSet: str, + ignore: bool, + n: int, + destFp: TextIO = sys.stdout, + truthy:bool =False + ): + + import logging + from etk.wikidata.statement import Rank + from etk.etk import ETK + from etk.knowledge_graph import KGSchema + from etk.etk_module import ETKModule + self.ignore = ignore + self.propTypes = self.__setPropTypes(propFile) + self.labelSet, self.aliasSet, self.descriptionSet = self.__setSets( + labelSet, aliasSet, descriptionSet + ) + self.fp = destFp + self.n = int(n) + self.read = 0 + # ignore-logging, if not ignore, log them and move on. + if not self.ignore: + self.ignoreFile = open("ignored.log","w") + # corrupted statement id + self.corrupted_statement_id = None + # truthy + self.truthy = truthy + # serialize prfix + kg_schema = KGSchema() + kg_schema.add_schema("@prefix : .", "ttl") + self.etk = ETK(kg_schema=kg_schema, modules=ETKModule) + self.doc = self.__setDoc() + self.__serialize_prefix() + + def _node_2_entity(self, node:str): + ''' + A node can be Qxxx or Pxxx, return the proper entity. 
+ ''' + if node in self.propTypes: + entity = WDProperty(node, self.propTypes[node]) + else: + entity = WDItem(TripleGenerator.replaceIllegalString(node.upper())) + return entity + + + def __setPropTypes(self, propFile: str): + from etk.wikidata.value import ( + Item, + StringValue, + TimeValue, + QuantityValue, + MonolingualText, + GlobeCoordinate, + ExternalIdentifier, + URLValue + ) + dataTypeMappings = { + "item": Item, + "time": TimeValue, + "globe-coordinate": GlobeCoordinate, + "quantity": QuantityValue, + "monolingualtext": MonolingualText, + "string": StringValue, + "external-identifier":ExternalIdentifier, + "url":URLValue + } + with open(propFile, "r") as fp: + props = fp.readlines() + __propTypes = {} + for line in props[1:]: + node1, _, node2 = line.split("\t") + try: + __propTypes[node1] = dataTypeMappings[node2.strip()] + except: + if not self.ignore: + raise KGTKException( + "DataType {} of node {} is not supported.\n".format( + node2, node1 + ) + ) + return __propTypes + + def __setSets(self, labelSet: str, aliasSet: str, descriptionSet: str): + return ( + set(labelSet.split(",")), + set(aliasSet.split(",")), + set(descriptionSet.split(",")), + ) + + def __setDoc(self, doc_id: str = "http://isi.edu/default-ns/projects"): + """ + reset the doc object and return it. Called at initialization and after outputting triples. + """ + doc = self.etk.create_document({}, doc_id=doc_id) + # bind prefixes + doc.kg.bind("wikibase", "http://wikiba.se/ontology#") + doc.kg.bind("wd", "http://www.wikidata.org/entity/") + doc.kg.bind("wdt", "http://www.wikidata.org/prop/direct/") + doc.kg.bind("wdtn", "http://www.wikidata.org/prop/direct-normalized/") + doc.kg.bind("wdno", "http://www.wikidata.org/prop/novalue/") + doc.kg.bind("wds", "http://www.wikidata.org/entity/statement/") + doc.kg.bind("wdv", "http://www.wikidata.org/value/") + doc.kg.bind("wdref", "http://www.wikidata.org/reference/") + doc.kg.bind("p", "http://www.wikidata.org/prop/") + doc.kg.bind("pr", "http://www.wikidata.org/prop/reference/") + doc.kg.bind("prv", "http://www.wikidata.org/prop/reference/value/") + doc.kg.bind( + "prn", "http://www.wikidata.org/prop/reference/value-normalized/" + ) + doc.kg.bind("ps", "http://www.wikidata.org/prop/statement/") + doc.kg.bind("psv", "http://www.wikidata.org/prop/statement/value/") + doc.kg.bind( + "psn", "http://www.wikidata.org/prop/statement/value-normalized/" + ) + doc.kg.bind("pq", "http://www.wikidata.org/prop/qualifier/") + doc.kg.bind("pqv", "http://www.wikidata.org/prop/qualifier/value/") + doc.kg.bind( + "pqn", "http://www.wikidata.org/prop/qualifier/value-normalized/" + ) + doc.kg.bind("skos", "http://www.w3.org/2004/02/skos/core#") + doc.kg.bind("prov", "http://www.w3.org/ns/prov#") + doc.kg.bind("schema", "http://schema.org/") + return doc + + @staticmethod + def _process_text_string(string:str)->[str,str]: + ''' + ''' + if "@" in string: + res = string.split("@") + textString = "@".join(res[:-1]).replace('"', "").replace("'", "") + lang = res[-1].replace('"','').replace("'","") + if len(lang) != 2: + lang = "en" + else: + textString = string.replace('"', "").replace("'", "") + lang = "en" + return [textString, lang] + + def genLabelTriple(self, node1: str, label: str, node2: str) -> bool: + entity = self._node_2_entity(node1) + textString, lang = TripleGenerator._process_text_string(node2) + entity.add_label(textString, lang=lang) + self.doc.kg.add_subject(entity) + return True + + def genDescriptionTriple(self, node1: str, label: str, node2: str) -> bool: + entity = 
self._node_2_entity(node1) + textString, lang = TripleGenerator._process_text_string(node2) + entity.add_description(textString, lang=lang) + self.doc.kg.add_subject(entity) + return True + + def genDescriptionTriple(self, node1: str, label: str, node2: str) -> bool: + entity = self._node_2_entity(node1) + textString, lang = TripleGenerator._process_text_string(node2) + entity.add_description(textString, lang=lang) + self.doc.kg.add_subject(entity) + return True + + def genAliasTriple(self, node1: str, label: str, node2: str) -> bool: + entity = self._node_2_entity(node1) + textString, lang = TripleGenerator._process_text_string(node2) + entity.add_alias(textString, lang=lang) + self.doc.kg.add_subject(entity) + return True + + def genPropDeclarationTriple(self, node1: str, label: str, node2: str) -> bool: + prop = WDProperty(node1, self.propTypes[node1]) + self.doc.kg.add_subject(prop) + return True + + def genNormalTriple( + self, node1: str, label: str, node2: str, isQualifierEdge: bool) -> bool: + from etk.wikidata.value import ( + Item, + StringValue, + TimeValue, + QuantityValue, + MonolingualText, + GlobeCoordinate, + ExternalIdentifier, + URLValue, + Precision + ) + + entity = self._node_2_entity(node1) + # determine the edge type + edgeType = self.propTypes[label] + if edgeType == Item: + OBJECT = WDItem(TripleGenerator.replaceIllegalString(node2.upper())) + elif edgeType == TimeValue: + # https://www.wikidata.org/wiki/Help:Dates + # ^2013-01-01T00:00:00Z/11 + # ^8000000-00-00T00:00:00Z/3 + if re.compile("[0-9]{4}").match(node2): + try: + dateTimeString = node2 + "-01-01" + OBJECT = TimeValue( + value=dateTimeString, #TODO + calendar=Item("Q1985727"), + precision=Precision.year, + time_zone=0, + ) + except: + return False + else: + try: + dateTimeString, precision = node2[1:].split("/") + dateTimeString = dateTimeString[:-1] # remove "Z" + # 2016-00-00T00:00:00 case + if "-00-00" in dateTimeString: + dateTimeString = "-01-01".join(dateTimeString.split("-00-00")) + elif dateTimeString[8:10] == "00": + dateTimeString = dateTimeString[:8]+"01" + dateTimeString[10:] + OBJECT = TimeValue( + value=dateTimeString, + calendar=Item("Q1985727"), + precision=precision, + time_zone=0, + ) + except: + return False + + #TODO other than that, not supported. Creation of normal triple fails + + + elif edgeType == GlobeCoordinate: + latitude, longitude = node2[1:].split("/") + OBJECT = GlobeCoordinate( + latitude, longitude, 0.0001, globe=StringValue("Earth") + ) + + elif edgeType == QuantityValue: + # +70[+60,+80]Q743895 + res = re.compile("([\+|\-]?[0-9]+\.?[0-9]*)(?:\[([\+|\-]?[0-9]+\.?[0-9]*),([\+|\-]?[0-9]+\.?[0-9]*)\])?([U|Q](?:[0-9]+))?").match(node2).groups() + amount, lower_bound, upper_bound, unit = res + + # Handle extra small numbers for now. 
TODO + if TripleGenerator._is_invalid_decimal_string(amount) or TripleGenerator._is_invalid_decimal_string(lower_bound) or TripleGenerator._is_invalid_decimal_string(upper_bound): + return False + amount = TripleGenerator._clean_number_string(amount) + lower_bound = TripleGenerator._clean_number_string(lower_bound) + upper_bound = TripleGenerator._clean_number_string(upper_bound) + if unit != None: + if upper_bound != None and lower_bound != None: + OBJECT = QuantityValue(amount, unit=Item(unit),upper_bound=upper_bound,lower_bound=lower_bound) + else: + OBJECT = QuantityValue(amount, unit=Item(unit)) + else: + if upper_bound != None and lower_bound != None: + OBJECT = QuantityValue(amount, upper_bound=upper_bound,lower_bound=lower_bound) + else: + OBJECT = QuantityValue(amount) + elif edgeType == MonolingualText: + textString, lang = TripleGenerator._process_text_string(node2) + OBJECT = MonolingualText(textString, lang) + elif edgeType == ExternalIdentifier: + OBJECT = ExternalIdentifier(node2) + elif edgeType == URLValue: + OBJECT = URLValue(node2) + else: + # treat everything else as stringValue + OBJECT = StringValue(node2) + if isQualifierEdge: + # edge: e8 p9 ^2013-01-01T00:00:00Z/11 + # create qualifier edge on previous STATEMENT and return the updated STATEMENT + if type(OBJECT) == WDItem: + self.doc.kg.add_subject(OBJECT) + self.STATEMENT.add_qualifier(label.upper(), OBJECT) + self.doc.kg.add_subject(self.STATEMENT) #TODO maybe can be positioned better for the edge cases. + + else: + # edge: q1 p8 q2 e8 + # create brand new property edge and replace STATEMENT + if type(OBJECT) == WDItem: + self.doc.kg.add_subject(OBJECT) + if self.truthy: + self.STATEMENT = entity.add_truthy_statement(label.upper(), OBJECT) + else: + self.STATEMENT = entity.add_statement(label.upper(), OBJECT) + self.doc.kg.add_subject(entity) + return True + + @staticmethod + def _is_invalid_decimal_string(num_string): + ''' + if a decimal string too small, return True TODO + ''' + if num_string == None: + return False + else: + if abs(float(num_string)) < 0.0001 and float(num_string) != 0: + return True + return False + + @staticmethod + def _clean_number_string(num): + from numpy import format_float_positional + if num == None: + return None + else: + return format_float_positional(float(num),trim="-") + + def entryPoint(self, line_number:int , edge: str): + """ + generates a list of two, the first element is the determination of the edge type using corresponding edge type + the second element is a bool indicating whether this is a valid property edge or qualifier edge. 
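The quantity pattern used a few lines above is dense; this standalone demo shows what it extracts from the documented example +70[+60,+80]Q743895, namely (amount, lower bound, upper bound, unit), with the bracketed bounds and the unit both optional:

    import re

    QUANTITY_PATTERN = re.compile(
        r"([\+|\-]?[0-9]+\.?[0-9]*)"
        r"(?:\[([\+|\-]?[0-9]+\.?[0-9]*),([\+|\-]?[0-9]+\.?[0-9]*)\])?"
        r"([U|Q](?:[0-9]+))?")

    print(QUANTITY_PATTERN.match("+70[+60,+80]Q743895").groups())
    # ('+70', '+60', '+80', 'Q743895')
    print(QUANTITY_PATTERN.match("12.5").groups())
    # ('12.5', None, None, None)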
+ Call corresponding downstream functions + """ + edgeList = edge.strip().split("\t") + l = len(edgeList) + if l!=4: + return + + [node1, label, node2, eID] = edgeList + node1, label, node2, eID = node1.strip(),label.strip(),node2.strip(),eID.strip() + if line_number == 0: #TODO ignore header mode + # by default a statement edge + isQualifierEdge = False + # print("#Debug Info: ",line_number, self.ID, eID, isQualifierEdge,self.STATEMENT) + self.ID = eID + self.corrupted_statement_id = None + else: + if node1 != self.ID: + # also a new statement edge + if self.read >= self.n: + self.serialize() + isQualifierEdge = False + # print("#Debug Info: ",line_number, self.ID, node1, isQualifierEdge,self.STATEMENT) + self.ID= eID + self.corrupted_statement_id = None + else: + # qualifier edge or property declaration edge + isQualifierEdge = True + if self.corrupted_statement_id == eID: + # Met a qualifier which associates with a corrupted statement + return + if label != "type" and node1 != self.ID: + # 1. not a property declaration edge and + # 2. the current qualifier's node1 is not the latest property edge id, throw errors. + if not self.ignore: + raise KGTKException( + "Node1 {} at line {} doesn't agree with latest property edge id {}.\n".format( + node1, line_number, self.ID + ) + ) + if label in self.labelSet: + success = self.genLabelTriple(node1, label, node2) + elif label in self.descriptionSet: + success= self.genDescriptionTriple(node1, label, node2) + elif label in self.aliasSet: + success = self.genAliasTriple(node1, label, node2) + elif label == "type": + # special edge of prop declaration + success = self.genPropDeclarationTriple(node1, label, node2) + else: + if label in self.propTypes: + success= self.genNormalTriple(node1, label, node2, isQualifierEdge) + else: + if not self.ignore: + raise KGTKException( + "property {}'s type is unknown at line {}.\n".format(label, line_number) + ) + success = False + if (not success) and (not isQualifierEdge) and (not self.ignore): + # We have a corrupted edge here. + self.ignoreFile.write("Corrupted statement at line number: {} with id {} with current corrupted id {}\n".format(line_number, eID, self.corrupted_statement_id)) + self.ignoreFile.flush() + self.corrupted_statement_id = eID + else: + self.read += 1 + self.corrupted_statement_id = None + + def serialize(self): + """ + Seriealize the triples. Used a hack to avoid serializing the prefix again. + """ + docs = self.etk.process_ems(self.doc) + self.fp.write("\n\n".join(docs[0].kg.serialize("ttl").split("\n\n")[1:])) + self.fp.flush() + self.__reset() + + def __serialize_prefix(self): + """ + This function should be called only once after the doc object is initialized. 
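The "hack" named in serialize's docstring relies on the Turtle layout rdflib produces: the @prefix block comes first, separated from the triples by a blank line. A minimal sketch, assuming that layout holds:

    ttl = "@prefix wd: <http://www.wikidata.org/entity/> .\n\nwd:Q42 a wd:Q5 .\n"

    prefix_block = ttl.split("\n\n")[0]             # what __serialize_prefix keeps
    body_only = "\n\n".join(ttl.split("\n\n")[1:])  # what serialize keeps

    print(prefix_block)  # @prefix wd: <http://www.wikidata.org/entity/> .
    print(body_only)     # wd:Q42 a wd:Q5 .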
+ """ + docs = self.etk.process_ems(self.doc) + self.fp.write(docs[0].kg.serialize("ttl").split("\n\n")[0] + "\n\n") + self.fp.flush() + self.__reset() + + def __reset(self): + self.ID = None + self.STATEMENT = None + self.read = 0 + self.doc = self.__setDoc() + + def finalize(self): + self.serialize() + + @staticmethod + def replaceIllegalString(s:str)->str: + return s.replace(":","-") \ No newline at end of file From f8d61c0e7255c2419da0c8f9323ae0eff249177a Mon Sep 17 00:00:00 2001 From: Rongpeng Date: Mon, 27 Apr 2020 19:43:41 -0700 Subject: [PATCH 011/278] comply with PEP8 naming conventions, rename some varialbes --- kgtk/cli/generate_wikidata_triples.py | 16 +- kgtk/triple_generator.py | 260 ++++++++++++-------------- 2 files changed, 129 insertions(+), 147 deletions(-) diff --git a/kgtk/cli/generate_wikidata_triples.py b/kgtk/cli/generate_wikidata_triples.py index 880f92e3b..ee17e77f2 100644 --- a/kgtk/cli/generate_wikidata_triples.py +++ b/kgtk/cli/generate_wikidata_triples.py @@ -31,7 +31,7 @@ def add_arguments(parser): Parse arguments Args: parser (argparse.ArgumentParser) - propFile: str, labelSet: str, aliasSet: str, descriptionSet: str, n: str, dest: Any --output-n-lines --generate-truthy + prop_file: str, labelSet: str, aliasSet: str, descriptionSet: str, n: str, dest: Any --output-n-lines --generate-truthy """ parser.add_argument( "-lp", @@ -63,7 +63,7 @@ def add_arguments(parser): action="store", type=str, help="path to the file which contains the property datatype mapping in kgtk format.", - dest="propFile", + dest="prop_file", ) parser.add_argument( "-n", @@ -99,7 +99,7 @@ def run( labels: str, aliases: str, descriptions: str, - propFile: str, + prop_file: str, n: int, truthy: bool, ignore: bool, @@ -109,10 +109,10 @@ def run( from kgtk.triple_generator import TripleGenerator import sys generator = TripleGenerator( - propFile=propFile, - labelSet=labels, - aliasSet=aliases, - descriptionSet=descriptions, + prop_file=prop_file, + label_set=labels, + alias_set=aliases, + description_set=descriptions, n=n, ignore=ignore, truthy=truthy @@ -127,6 +127,6 @@ def run( num_line += 1 continue else: - generator.entryPoint(num_line, edge) + generator.entry_point(num_line, edge) num_line += 1 generator.finalize() diff --git a/kgtk/triple_generator.py b/kgtk/triple_generator.py index 9d86ae35d..7dafe800f 100644 --- a/kgtk/triple_generator.py +++ b/kgtk/triple_generator.py @@ -3,6 +3,17 @@ from typing import TextIO from kgtk.exceptions import KGTKException from etk.wikidata.entity import WDItem, WDProperty +from etk.wikidata.value import ( +Precision, +Item, +StringValue, +TimeValue, +QuantityValue, +MonolingualText, +GlobeCoordinate, +ExternalIdentifier, +URLValue +) class TripleGenerator: """ @@ -10,13 +21,13 @@ class TripleGenerator: """ def __init__( self, - propFile: str, - labelSet: str, - aliasSet: str, - descriptionSet: str, + prop_file: str, + label_set: str, + alias_set: str, + description_set: str, ignore: bool, n: int, - destFp: TextIO = sys.stdout, + dest_fp: TextIO = sys.stdout, truthy:bool =False ): @@ -26,16 +37,16 @@ def __init__( from etk.knowledge_graph import KGSchema from etk.etk_module import ETKModule self.ignore = ignore - self.propTypes = self.__setPropTypes(propFile) - self.labelSet, self.aliasSet, self.descriptionSet = self.__setSets( - labelSet, aliasSet, descriptionSet + self.prop_types = self.set_properties(prop_file) + self.label_set, self.alias_set, self.description_set = self.__setSets( + label_set, alias_set, description_set ) - self.fp = destFp + 
self.fp = dest_fp self.n = int(n) - self.read = 0 + self.read_num_of_lines = 0 # ignore-logging, if not ignore, log them and move on. if not self.ignore: - self.ignoreFile = open("ignored.log","w") + self.ignore_file = open("ignored.log","w") # corrupted statement id self.corrupted_statement_id = None # truthy @@ -51,25 +62,15 @@ def _node_2_entity(self, node:str): ''' A node can be Qxxx or Pxxx, return the proper entity. ''' - if node in self.propTypes: - entity = WDProperty(node, self.propTypes[node]) + if node in self.prop_types: + entity = WDProperty(node, self.prop_types[node]) else: entity = WDItem(TripleGenerator.replaceIllegalString(node.upper())) return entity - def __setPropTypes(self, propFile: str): - from etk.wikidata.value import ( - Item, - StringValue, - TimeValue, - QuantityValue, - MonolingualText, - GlobeCoordinate, - ExternalIdentifier, - URLValue - ) - dataTypeMappings = { + def set_properties(self, prop_file: str): + datatype_mapping = { "item": Item, "time": TimeValue, "globe-coordinate": GlobeCoordinate, @@ -79,13 +80,13 @@ def __setPropTypes(self, propFile: str): "external-identifier":ExternalIdentifier, "url":URLValue } - with open(propFile, "r") as fp: + with open(prop_file, "r") as fp: props = fp.readlines() - __propTypes = {} + prop_types = {} for line in props[1:]: node1, _, node2 = line.split("\t") try: - __propTypes[node1] = dataTypeMappings[node2.strip()] + prop_types[node1] = datatype_mapping[node2.strip()] except: if not self.ignore: raise KGTKException( @@ -93,13 +94,13 @@ def __setPropTypes(self, propFile: str): node2, node1 ) ) - return __propTypes + return prop_types - def __setSets(self, labelSet: str, aliasSet: str, descriptionSet: str): + def __setSets(self, label_set: str, alias_set: str, description_set: str): return ( - set(labelSet.split(",")), - set(aliasSet.split(",")), - set(descriptionSet.split(",")), + set(label_set.split(",")), + set(alias_set.split(",")), + set(description_set.split(",")), ) def __setDoc(self, doc_id: str = "http://isi.edu/default-ns/projects"): @@ -138,80 +139,61 @@ def __setDoc(self, doc_id: str = "http://isi.edu/default-ns/projects"): return doc @staticmethod - def _process_text_string(string:str)->[str,str]: + def process_text_string(string:str)->[str,str]: ''' ''' if "@" in string: res = string.split("@") - textString = "@".join(res[:-1]).replace('"', "").replace("'", "") + text_string = "@".join(res[:-1]).replace('"', "").replace("'", "") lang = res[-1].replace('"','').replace("'","") if len(lang) != 2: lang = "en" else: - textString = string.replace('"', "").replace("'", "") + text_string = string.replace('"', "").replace("'", "") lang = "en" - return [textString, lang] + return [text_string, lang] - def genLabelTriple(self, node1: str, label: str, node2: str) -> bool: + def generate_label_triple(self, node1: str, label: str, node2: str) -> bool: entity = self._node_2_entity(node1) - textString, lang = TripleGenerator._process_text_string(node2) - entity.add_label(textString, lang=lang) + text_string, lang = TripleGenerator.process_text_string(node2) + entity.add_label(text_string, lang=lang) self.doc.kg.add_subject(entity) return True - def genDescriptionTriple(self, node1: str, label: str, node2: str) -> bool: + def generate_description_triple(self, node1: str, label: str, node2: str) -> bool: entity = self._node_2_entity(node1) - textString, lang = TripleGenerator._process_text_string(node2) - entity.add_description(textString, lang=lang) + text_string, lang = TripleGenerator.process_text_string(node2) + 
entity.add_description(text_string, lang=lang) self.doc.kg.add_subject(entity) return True - def genDescriptionTriple(self, node1: str, label: str, node2: str) -> bool: + def generate_alias_triple(self, node1: str, label: str, node2: str) -> bool: entity = self._node_2_entity(node1) - textString, lang = TripleGenerator._process_text_string(node2) - entity.add_description(textString, lang=lang) + text_string, lang = TripleGenerator.process_text_string(node2) + entity.add_alias(text_string, lang=lang) self.doc.kg.add_subject(entity) return True - def genAliasTriple(self, node1: str, label: str, node2: str) -> bool: - entity = self._node_2_entity(node1) - textString, lang = TripleGenerator._process_text_string(node2) - entity.add_alias(textString, lang=lang) - self.doc.kg.add_subject(entity) - return True - - def genPropDeclarationTriple(self, node1: str, label: str, node2: str) -> bool: - prop = WDProperty(node1, self.propTypes[node1]) + def generate_prop_declaration_triple(self, node1: str, label: str, node2: str) -> bool: + prop = WDProperty(node1, self.prop_types[node1]) self.doc.kg.add_subject(prop) return True - def genNormalTriple( - self, node1: str, label: str, node2: str, isQualifierEdge: bool) -> bool: - from etk.wikidata.value import ( - Item, - StringValue, - TimeValue, - QuantityValue, - MonolingualText, - GlobeCoordinate, - ExternalIdentifier, - URLValue, - Precision - ) - + def generate_normal_triple( + self, node1: str, label: str, node2: str, is_qualifier_edge: bool) -> bool: entity = self._node_2_entity(node1) # determine the edge type - edgeType = self.propTypes[label] - if edgeType == Item: - OBJECT = WDItem(TripleGenerator.replaceIllegalString(node2.upper())) - elif edgeType == TimeValue: + edge_type = self.prop_types[label] + if edge_type == Item: + object = WDItem(TripleGenerator.replaceIllegalString(node2.upper())) + elif edge_type == TimeValue: # https://www.wikidata.org/wiki/Help:Dates # ^2013-01-01T00:00:00Z/11 # ^8000000-00-00T00:00:00Z/3 if re.compile("[0-9]{4}").match(node2): try: dateTimeString = node2 + "-01-01" - OBJECT = TimeValue( + object = TimeValue( value=dateTimeString, #TODO calendar=Item("Q1985727"), precision=Precision.year, @@ -228,7 +210,7 @@ def genNormalTriple( dateTimeString = "-01-01".join(dateTimeString.split("-00-00")) elif dateTimeString[8:10] == "00": dateTimeString = dateTimeString[:8]+"01" + dateTimeString[10:] - OBJECT = TimeValue( + object = TimeValue( value=dateTimeString, calendar=Item("Q1985727"), precision=precision, @@ -240,65 +222,65 @@ def genNormalTriple( #TODO other than that, not supported. Creation of normal triple fails - elif edgeType == GlobeCoordinate: + elif edge_type == GlobeCoordinate: latitude, longitude = node2[1:].split("/") - OBJECT = GlobeCoordinate( + object = GlobeCoordinate( latitude, longitude, 0.0001, globe=StringValue("Earth") ) - elif edgeType == QuantityValue: + elif edge_type == QuantityValue: # +70[+60,+80]Q743895 res = re.compile("([\+|\-]?[0-9]+\.?[0-9]*)(?:\[([\+|\-]?[0-9]+\.?[0-9]*),([\+|\-]?[0-9]+\.?[0-9]*)\])?([U|Q](?:[0-9]+))?").match(node2).groups() amount, lower_bound, upper_bound, unit = res # Handle extra small numbers for now. 
TODO - if TripleGenerator._is_invalid_decimal_string(amount) or TripleGenerator._is_invalid_decimal_string(lower_bound) or TripleGenerator._is_invalid_decimal_string(upper_bound): + if TripleGenerator.is_invalid_decimal_string(amount) or TripleGenerator.is_invalid_decimal_string(lower_bound) or TripleGenerator.is_invalid_decimal_string(upper_bound): return False - amount = TripleGenerator._clean_number_string(amount) - lower_bound = TripleGenerator._clean_number_string(lower_bound) - upper_bound = TripleGenerator._clean_number_string(upper_bound) + amount = TripleGenerator.clean_number_string(amount) + lower_bound = TripleGenerator.clean_number_string(lower_bound) + upper_bound = TripleGenerator.clean_number_string(upper_bound) if unit != None: if upper_bound != None and lower_bound != None: - OBJECT = QuantityValue(amount, unit=Item(unit),upper_bound=upper_bound,lower_bound=lower_bound) + object = QuantityValue(amount, unit=Item(unit),upper_bound=upper_bound,lower_bound=lower_bound) else: - OBJECT = QuantityValue(amount, unit=Item(unit)) + object = QuantityValue(amount, unit=Item(unit)) else: if upper_bound != None and lower_bound != None: - OBJECT = QuantityValue(amount, upper_bound=upper_bound,lower_bound=lower_bound) + object = QuantityValue(amount, upper_bound=upper_bound,lower_bound=lower_bound) else: - OBJECT = QuantityValue(amount) - elif edgeType == MonolingualText: - textString, lang = TripleGenerator._process_text_string(node2) - OBJECT = MonolingualText(textString, lang) - elif edgeType == ExternalIdentifier: - OBJECT = ExternalIdentifier(node2) - elif edgeType == URLValue: - OBJECT = URLValue(node2) + object = QuantityValue(amount) + elif edge_type == MonolingualText: + text_string, lang = TripleGenerator.process_text_string(node2) + object = MonolingualText(text_string, lang) + elif edge_type == ExternalIdentifier: + object = ExternalIdentifier(node2) + elif edge_type == URLValue: + object = URLValue(node2) else: # treat everything else as stringValue - OBJECT = StringValue(node2) - if isQualifierEdge: + object = StringValue(node2) + if is_qualifier_edge: # edge: e8 p9 ^2013-01-01T00:00:00Z/11 # create qualifier edge on previous STATEMENT and return the updated STATEMENT - if type(OBJECT) == WDItem: - self.doc.kg.add_subject(OBJECT) - self.STATEMENT.add_qualifier(label.upper(), OBJECT) - self.doc.kg.add_subject(self.STATEMENT) #TODO maybe can be positioned better for the edge cases. + if type(object) == WDItem: + self.doc.kg.add_subject(object) + self.to_append_statement.add_qualifier(label.upper(), object) + self.doc.kg.add_subject(self.to_append_statement) #TODO maybe can be positioned better for the edge cases. 
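A short demo of the two numeric helpers invoked above, assuming numpy is available: values whose magnitude is below the 0.0001 threshold are rejected, and format_float_positional(..., trim="-") rewrites the rest in plain positional notation with trailing zeros trimmed, which keeps the serialized quantities stable:

    from numpy import format_float_positional

    def is_too_small(num_string):
        return (num_string is not None
                and abs(float(num_string)) < 0.0001
                and float(num_string) != 0)

    print(is_too_small("0.00005"))                            # True -> edge is dropped
    print(format_float_positional(float("+70"), trim="-"))    # 70
    print(format_float_positional(float("1.2e2"), trim="-"))  # 120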
else: # edge: q1 p8 q2 e8 # create brand new property edge and replace STATEMENT - if type(OBJECT) == WDItem: - self.doc.kg.add_subject(OBJECT) + if type(object) == WDItem: + self.doc.kg.add_subject(object) if self.truthy: - self.STATEMENT = entity.add_truthy_statement(label.upper(), OBJECT) + self.to_append_statement = entity.add_truthy_statement(label.upper(), object) else: - self.STATEMENT = entity.add_statement(label.upper(), OBJECT) + self.to_append_statement = entity.add_statement(label.upper(), object) self.doc.kg.add_subject(entity) return True @staticmethod - def _is_invalid_decimal_string(num_string): + def is_invalid_decimal_string(num_string): ''' if a decimal string too small, return True TODO ''' @@ -310,81 +292,81 @@ def _is_invalid_decimal_string(num_string): return False @staticmethod - def _clean_number_string(num): + def clean_number_string(num): from numpy import format_float_positional if num == None: return None else: return format_float_positional(float(num),trim="-") - def entryPoint(self, line_number:int , edge: str): + def entry_point(self, line_number:int , edge: str): """ generates a list of two, the first element is the determination of the edge type using corresponding edge type the second element is a bool indicating whether this is a valid property edge or qualifier edge. Call corresponding downstream functions """ - edgeList = edge.strip().split("\t") - l = len(edgeList) + edge_list = edge.strip().split("\t") + l = len(edge_list) if l!=4: return - [node1, label, node2, eID] = edgeList - node1, label, node2, eID = node1.strip(),label.strip(),node2.strip(),eID.strip() + [node1, label, node2, e_id] = edge_list + node1, label, node2, e_id = node1.strip(),label.strip(),node2.strip(),e_id.strip() if line_number == 0: #TODO ignore header mode # by default a statement edge - isQualifierEdge = False - # print("#Debug Info: ",line_number, self.ID, eID, isQualifierEdge,self.STATEMENT) - self.ID = eID + is_qualifier_edge = False + # print("#Debug Info: ",line_number, self.to_append_statement_id, e_id, is_qualifier_edge,self.to_append_statement) + self.to_append_statement_id = e_id self.corrupted_statement_id = None else: - if node1 != self.ID: + if node1 != self.to_append_statement_id: # also a new statement edge - if self.read >= self.n: + if self.read_num_of_lines >= self.n: self.serialize() - isQualifierEdge = False - # print("#Debug Info: ",line_number, self.ID, node1, isQualifierEdge,self.STATEMENT) - self.ID= eID + is_qualifier_edge = False + # print("#Debug Info: ",line_number, self.to_append_statement_id, node1, is_qualifier_edge,self.to_append_statement) + self.to_append_statement_id= e_id self.corrupted_statement_id = None else: # qualifier edge or property declaration edge - isQualifierEdge = True - if self.corrupted_statement_id == eID: + is_qualifier_edge = True + if self.corrupted_statement_id == e_id: # Met a qualifier which associates with a corrupted statement return - if label != "type" and node1 != self.ID: + if label != "type" and node1 != self.to_append_statement_id: # 1. not a property declaration edge and # 2. the current qualifier's node1 is not the latest property edge id, throw errors. 
if not self.ignore: raise KGTKException( "Node1 {} at line {} doesn't agree with latest property edge id {}.\n".format( - node1, line_number, self.ID + node1, line_number, self.to_append_statement_id ) ) - if label in self.labelSet: - success = self.genLabelTriple(node1, label, node2) - elif label in self.descriptionSet: - success= self.genDescriptionTriple(node1, label, node2) - elif label in self.aliasSet: - success = self.genAliasTriple(node1, label, node2) + if label in self.label_set: + success = self.generate_label_triple(node1, label, node2) + elif label in self.description_set: + success= self.generate_description_triple(node1, label, node2) + elif label in self.alias_set: + success = self.generate_alias_triple(node1, label, node2) elif label == "type": # special edge of prop declaration - success = self.genPropDeclarationTriple(node1, label, node2) + success = self.generate_prop_declaration_triple(node1, label, node2) else: - if label in self.propTypes: - success= self.genNormalTriple(node1, label, node2, isQualifierEdge) + if label in self.prop_types: + success= self.generate_normal_triple(node1, label, node2, is_qualifier_edge) else: if not self.ignore: raise KGTKException( "property {}'s type is unknown at line {}.\n".format(label, line_number) ) success = False - if (not success) and (not isQualifierEdge) and (not self.ignore): + if (not success) and (not is_qualifier_edge) and (not self.ignore): # We have a corrupted edge here. - self.ignoreFile.write("Corrupted statement at line number: {} with id {} with current corrupted id {}\n".format(line_number, eID, self.corrupted_statement_id)) - self.ignoreFile.flush() - self.corrupted_statement_id = eID + self.ignore_file.write("Corrupted statement at line number: {} with id {} with current corrupted id {}\n".format(line_number, e_id, self.corrupted_statement_id)) + self.ignore_file.flush() + self.corrupted_statement_id = e_id else: - self.read += 1 + self.read_num_of_lines += 1 self.corrupted_statement_id = None def serialize(self): @@ -406,9 +388,9 @@ def __serialize_prefix(self): self.__reset() def __reset(self): - self.ID = None - self.STATEMENT = None - self.read = 0 + self.to_append_statement_id = None + self.to_append_statement = None + self.read_num_of_lines = 0 self.doc = self.__setDoc() def finalize(self): From f7fa7e9a0c81c7c0bd025186c3c8d2f52aa1bbac Mon Sep 17 00:00:00 2001 From: Rongpeng Date: Mon, 27 Apr 2020 22:38:17 -0700 Subject: [PATCH 012/278] fix the bug of missing prefix --- kgtk/triple_generator.py | 119 ++++++++++++++++----------------------- 1 file changed, 48 insertions(+), 71 deletions(-) diff --git a/kgtk/triple_generator.py b/kgtk/triple_generator.py index 7dafe800f..ec328c16d 100644 --- a/kgtk/triple_generator.py +++ b/kgtk/triple_generator.py @@ -3,6 +3,10 @@ from typing import TextIO from kgtk.exceptions import KGTKException from etk.wikidata.entity import WDItem, WDProperty +from etk.etk_module import ETKModule +from etk.etk import ETK +from etk.knowledge_graph import KGSchema +from etk.wikidata import wiki_namespaces from etk.wikidata.value import ( Precision, Item, @@ -15,6 +19,7 @@ URLValue ) + class TripleGenerator: """ A class to maintain the status of the generator @@ -30,15 +35,10 @@ def __init__( dest_fp: TextIO = sys.stdout, truthy:bool =False ): - - import logging from etk.wikidata.statement import Rank - from etk.etk import ETK - from etk.knowledge_graph import KGSchema - from etk.etk_module import ETKModule self.ignore = ignore self.prop_types = self.set_properties(prop_file) - 
self.label_set, self.alias_set, self.description_set = self.__setSets( + self.label_set, self.alias_set, self.description_set = self.set_sets( label_set, alias_set, description_set ) self.fp = dest_fp @@ -50,13 +50,9 @@ def __init__( # corrupted statement id self.corrupted_statement_id = None # truthy - self.truthy = truthy - # serialize prfix - kg_schema = KGSchema() - kg_schema.add_schema("@prefix : .", "ttl") - self.etk = ETK(kg_schema=kg_schema, modules=ETKModule) - self.doc = self.__setDoc() - self.__serialize_prefix() + self.truthy = truthy + self.reset_etk_doc() + self.serialize_prefix() def _node_2_entity(self, node:str): ''' @@ -96,47 +92,54 @@ def set_properties(self, prop_file: str): ) return prop_types - def __setSets(self, label_set: str, alias_set: str, description_set: str): + def set_sets(self, label_set: str, alias_set: str, description_set: str): return ( set(label_set.split(",")), set(alias_set.split(",")), set(description_set.split(",")), ) - def __setDoc(self, doc_id: str = "http://isi.edu/default-ns/projects"): + def reset_etk_doc(self, doc_id: str = "http://isi.edu/default-ns/projects"): """ reset the doc object and return it. Called at initialization and after outputting triples. """ - doc = self.etk.create_document({}, doc_id=doc_id) - # bind prefixes - doc.kg.bind("wikibase", "http://wikiba.se/ontology#") - doc.kg.bind("wd", "http://www.wikidata.org/entity/") - doc.kg.bind("wdt", "http://www.wikidata.org/prop/direct/") - doc.kg.bind("wdtn", "http://www.wikidata.org/prop/direct-normalized/") - doc.kg.bind("wdno", "http://www.wikidata.org/prop/novalue/") - doc.kg.bind("wds", "http://www.wikidata.org/entity/statement/") - doc.kg.bind("wdv", "http://www.wikidata.org/value/") - doc.kg.bind("wdref", "http://www.wikidata.org/reference/") - doc.kg.bind("p", "http://www.wikidata.org/prop/") - doc.kg.bind("pr", "http://www.wikidata.org/prop/reference/") - doc.kg.bind("prv", "http://www.wikidata.org/prop/reference/value/") - doc.kg.bind( - "prn", "http://www.wikidata.org/prop/reference/value-normalized/" - ) - doc.kg.bind("ps", "http://www.wikidata.org/prop/statement/") - doc.kg.bind("psv", "http://www.wikidata.org/prop/statement/value/") - doc.kg.bind( - "psn", "http://www.wikidata.org/prop/statement/value-normalized/" - ) - doc.kg.bind("pq", "http://www.wikidata.org/prop/qualifier/") - doc.kg.bind("pqv", "http://www.wikidata.org/prop/qualifier/value/") - doc.kg.bind( - "pqn", "http://www.wikidata.org/prop/qualifier/value-normalized/" - ) - doc.kg.bind("skos", "http://www.w3.org/2004/02/skos/core#") - doc.kg.bind("prov", "http://www.w3.org/ns/prov#") - doc.kg.bind("schema", "http://schema.org/") - return doc + kg_schema = KGSchema() + kg_schema.add_schema("@prefix : .", "ttl") + self.etk = ETK(kg_schema=kg_schema, modules=ETKModule) + self.doc = self.etk.create_document({}, doc_id=doc_id) + for k, v in wiki_namespaces.items(): + self.doc.kg.bind(k, v) + + def serialize(self): + """ + Seriealize the triples. Used a hack to avoid serializing the prefix again. + """ + docs = self.etk.process_ems(self.doc) + self.fp.write("\n\n".join(docs[0].kg.serialize("ttl").split("\n\n")[1:])) + self.fp.flush() + self.reset() + + def serialize_prefix(self): + """ + This function should be called only once after the doc object is initialized. 
In order to serialize the prefix at the very beginning, it has to be printed manually, per the change of rdflib 4.2.2->5.0.0.
+        Relevant issue: https://github.com/RDFLib/rdflib/issues/965
+        """
+        for k, v in wiki_namespaces.items():
+            line = "@prefix " + k + " " + v + " .\n"
+            self.fp.write(line)
+        self.fp.write("\n")
+        self.fp.flush()
+        self.reset()
+
+    def reset(self):
+        self.to_append_statement_id = None
+        self.to_append_statement = None
+        self.read_num_of_lines = 0
+        self.reset_etk_doc()
+
+    def finalize(self):
+        self.serialize()
 
     @staticmethod
     def process_text_string(string:str)->[str,str]:
         '''
@@ -369,32 +372,6 @@ def entry_point(self, line_number:int , edge: str):
             self.read_num_of_lines += 1
             self.corrupted_statement_id = None
 
-    def serialize(self):
-        """
-        Serialize the triples. Used a hack to avoid serializing the prefix again.
-        """
-        docs = self.etk.process_ems(self.doc)
-        self.fp.write("\n\n".join(docs[0].kg.serialize("ttl").split("\n\n")[1:]))
-        self.fp.flush()
-        self.__reset()
-
-    def __serialize_prefix(self):
-        """
-        This function should be called only once after the doc object is initialized.
-        """
-        docs = self.etk.process_ems(self.doc)
-        self.fp.write(docs[0].kg.serialize("ttl").split("\n\n")[0] + "\n\n")
-        self.fp.flush()
-        self.__reset()
-
-    def __reset(self):
-        self.to_append_statement_id = None
-        self.to_append_statement = None
-        self.read_num_of_lines = 0
-        self.doc = self.__setDoc()
-
-    def finalize(self):
-        self.serialize()
 
     @staticmethod
     def replaceIllegalString(s:str)->str:
         return s.replace(":","-")

From b6314a53a999490fa8111da6800651913fe4d2ae Mon Sep 17 00:00:00 2001
From: Rongpeng
Date: Mon, 27 Apr 2020 22:44:54 -0700
Subject: [PATCH 013/278] added the gzip file support with '-gz yes' option

---
 kgtk/cli/generate_wikidata_triples.py | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/kgtk/cli/generate_wikidata_triples.py b/kgtk/cli/generate_wikidata_triples.py
index ee17e77f2..bafa0cb99 100644
--- a/kgtk/cli/generate_wikidata_triples.py
+++ b/kgtk/cli/generate_wikidata_triples.py
@@ -83,12 +83,20 @@ def add_arguments(parser):
     )
     parser.add_argument(
         "-ig",
-        "--ig",
+        "--ignore",
         action="store",
         type=str2bool,
         help="if set to yes, ignore various kinds of exceptions and mistakes and log them to a log file with line number in input file, rather than stopping. 
logging", dest="ignore", ) + parser.add_argument( + "-gz", + "--use-gz", + action="store", + type=str2bool, + help="if set to yes, read from compressed gz file", + dest="use_gz", + ) # logging level # parser.add_argument('-l', '--logging-level', action='store', dest='logging_level', # default="info", choices=("error", "warning", "info", "debug"), @@ -103,9 +111,10 @@ def run( n: int, truthy: bool, ignore: bool, - # logging_level:str + use_gz: bool ): # import modules locally + import gzip from kgtk.triple_generator import TripleGenerator import sys generator = TripleGenerator( @@ -119,8 +128,12 @@ def run( ) # process stdin num_line = 0 + if use_gz: + fp = gzip.open(sys.stdin.buffer, 'rt') + else: + fp = sys.stdin while True: - edge = sys.stdin.readline() + edge = fp.readline() if not edge: break if edge.startswith("#") or num_line == 0: # TODO First line omit From 69ffc1c703f57ab91a21bd13a0eb34ad09eaeed3 Mon Sep 17 00:00:00 2001 From: ckxz105 Date: Tue, 28 Apr 2020 09:48:45 -0700 Subject: [PATCH 014/278] code clean --- kgtk/cli/text_embedding.py | 84 +++++++++++++------------------------- 1 file changed, 28 insertions(+), 56 deletions(-) diff --git a/kgtk/cli/text_embedding.py b/kgtk/cli/text_embedding.py index 05404a54d..3a439c8ce 100644 --- a/kgtk/cli/text_embedding.py +++ b/kgtk/cli/text_embedding.py @@ -1,5 +1,6 @@ import sys import typing +from kgtk.exceptions import KGTKException ALL_EMBEDDING_MODELS_NAMES = [ "bert-base-nli-cls-token", @@ -41,7 +42,9 @@ def __init__(self, model_name=None, query_server=None, cache_config:dict={}): # self.model = SentenceTransformer(modules=[word_embedding_model, pooling_model]) else: self.model_name = model_name - self.model = SentenceTransformer(model_name) + self._logger.info("Using model {}".format(self.model_name)) + self.model = SentenceTransformer(self.model_name) + # setup redis cache server if query_server is None or query_server == "": self.wikidata_server = "https://query.wikidata.org/sparql" else: @@ -135,7 +138,7 @@ def send_sparql_query(self, query_body:str): results = qm.query().convert()['results']['bindings'] return results except: - raise ValueError("Sending Sparl query to {} failed!".format(self.wikidata_server)) + raise KGTKException("Sending Sparl query to {} failed!".format(self.wikidata_server)) def get_item_description(self, qnodes: typing.List[str]=None, target_properties:dict={}, gt_label:str=""): """ @@ -147,7 +150,6 @@ def get_item_description(self, qnodes: typing.List[str]=None, target_properties: find_all_properties = True else: find_all_properties = False - # self._logger.error(str(qnodes)) properties_list = [[] for _ in range(4)] used_p_node_ids = set() names = ["labels", "descriptions", "isa_properties", "has_properties"] @@ -166,8 +168,7 @@ def get_item_description(self, qnodes: typing.List[str]=None, target_properties: for each_node in qnodes: cache_res = self.redis_server.get(each_node+str(properties_list)) if cache_res is not None: - sentences_cache_dict[each_node] = cache_res - # self._logger.error("{} hit!".format(each_node+str(properties_list))) + sentences_cache_dict[each_node] = cache_res.decode("utf-8") if len(sentences_cache_dict) > 0: qnodes = set(qnodes) - set(sentences_cache_dict.keys()) @@ -267,7 +268,6 @@ def get_item_description(self, qnodes: typing.List[str]=None, target_properties: each_sentence = self.attribute_to_sentence(self.candidates[each_node_id], each_node_id) self.candidates[each_node_id]["sentence"] = each_sentence if self.redis_server is not None: - # self._logger.error("Pushed: 
{}".format(each_node+str(properties_list))) self.redis_server.set(each_node+str(properties_list), each_sentence) for each_node_id, sentence in sentences_cache_dict.items(): @@ -285,6 +285,7 @@ def read_input(self, file_path: str, skip_nodes_set: set=None, import pandas as pd # type: ignore import numpy as np import math + self.property_labels_dict = property_labels_dict if input_format == "test_format": @@ -298,7 +299,7 @@ def read_input(self, file_path: str, skip_nodes_set: set=None, elif "kg_id" in input_df.columns: gt_column_id = "kg_id" else: - raise ValueError("Can't find ground truth id column! It should either named as `GT_kg_id` or `kg_id`") + raise KGTKException("Can't find ground truth id column! It should either named as `GT_kg_id` or `kg_id`") for _, each in input_df.iterrows(): if isinstance(each["candidates"], str): @@ -318,8 +319,8 @@ def read_input(self, file_path: str, skip_nodes_set: set=None, if label == "": self._logger.error("Skip a row with no label given: as {}".format(str(each))) continue - # candidates[each['label']] = temp temp.extend(gt_nodes) + for each_q in temp: self.q_node_to_label[each_q] = label if skip_nodes_set is not None and each_q in skip_nodes_set: @@ -344,7 +345,7 @@ def read_input(self, file_path: str, skip_nodes_set: set=None, # get header headers = f.readline().replace("\n", "").split("\t") if len(headers) < 3: - raise ValueError("No enough columns found on given input file. Only {} columns given but at least 3 needed.".format(len(headers))) + raise KGTKException("No enough columns found on given input file. Only {} columns given but at least 3 needed.".format(len(headers))) elif "node" in headers and "property" in headers and "value" in headers: column_references = {"node": headers.index("node"), "property": headers.index("property"), @@ -355,7 +356,7 @@ def read_input(self, file_path: str, skip_nodes_set: set=None, "value": 2} else: missing_column = set(["node", "property", "value"]) - set(headers) - raise ValueError("Missing column {}".format(missing_column)) + raise KGTKException("Missing column {}".format(missing_column)) self._logger.debug("column index information: ") self._logger.debug(str(column_references)) # read contents @@ -370,7 +371,7 @@ def read_input(self, file_path: str, skip_nodes_set: set=None, if "@" in node_value and node_value[0] != "@": node_value_org = node_value node_value = node_value[:node_value.index("@")] - # print("{} --> {}".format(node_value_org, node_value)) + # remove extra double quote " and single quote ' if node_value[0]== '"' and node_value[-1] == '"': node_value = node_value[1:-1] @@ -397,7 +398,7 @@ def read_input(self, file_path: str, skip_nodes_set: set=None, each_node_attributes["has_properties"].append(node_value) else: - raise ValueError("Unkonwn input format {}".format(input_format)) + raise KGTKException("Unkonwn input format {}".format(input_format)) self._logger.info("Totally {} Q nodes loaded.".format(len(self.candidates))) self.vector_dump_file = "dump_vectors_{}_{}.pkl".format(file_path[:file_path.rfind(".")], self. 
model_name) @@ -543,12 +544,6 @@ def plot_result(self, use_cache=True, vector_dump_file=None, import time from sklearn.manifold import TSNE # type: ignore - # if vector_dump_file is None: - # vector_dump_file = self.vector_dump_file.replace(".pkl", "_2D.pkl") - # if use_cache and os.path.exists(vector_dump_file): - # self._logger.info("Using cached 2D vector file!") - # self.load_vectors(vector_dump_file, "2D") - # else: self.vectors_map = {k: v for k, v in sorted(self.vectors_map.items(), key=lambda item: item[0], reverse=True)} vectors = list(self.vectors_map.values()) # use tsne to reduce dimension @@ -560,7 +555,6 @@ def plot_result(self, use_cache=True, vector_dump_file=None, self._logger.info("Totally used {} seconds.".format(time.time() - start)) if input_format == "test_format": - # # start plot gt_indexes = set() vector_map_keys = list(self.vectors_map.keys()) for each_node in self.gt_nodes: @@ -615,8 +609,6 @@ def evaluate_result(self): else: points = self.gt_indexes for i, each in enumerate(self.vectors_map.keys()): - # label = self.q_node_to_label[each] - # description = self.qnodes_descriptions.get(each, "") if i in points: if centroid is None: centroid = np.array(self.vectors_map[each]) @@ -633,25 +625,13 @@ def evaluate_result(self): @staticmethod def calculate_distance(a, b): if len(a) != len(b): - raise ValueError("Vector dimension are different!") + raise KGTKException("Vector dimension are different!") dist = 0 for v1, v2 in zip(a,b): dist += (v1 - v2) **2 dist = dist ** 0.5 return dist -# removed -# def load_embedding_model_names(): -# names = [] -# import os -# model_file_path = os.path.join(repr(__file__).replace("'","").replace("/text_embedding.py", ""), "all_embedding_models_names.txt") -# if os.path.exists(model_file_path): -# with open(model_file_path, "r") as f: -# for each_line in f.readlines(): -# names.append(each_line.replace("\n", "")) -# else: -# raise ValueError("Embedding model names list file lost! Please check.") -# return names def load_property_labels_file(input_files: typing.List[str]): labels_dict = {} @@ -663,7 +643,7 @@ def load_property_labels_file(input_files: typing.List[str]): if headers is None: headers = each_line if len(headers) < 2: - raise ValueError("No enough columns found on given input file. Only {} columns given but at least 2 needed.".format(len(headers))) + raise KGTKException("No enough columns found on given input file. 
Only {} columns given but at least 2 needed.".format(len(headers))) elif "predicate" in headers and "label" in headers: column_references = {"predicate": headers.index("predicate"), "label": headers.index("label")} @@ -672,7 +652,7 @@ def load_property_labels_file(input_files: typing.List[str]): "label": headers.index("label"), } else: - raise ValueError("Can't determine which column is label column for label file!") + raise KGTKException("Can't determine which column is label column for label file!") else: node_id = each_line[column_references["predicate"]] @@ -730,18 +710,21 @@ def load_black_list_files(file_path): def main(**kwargs): - # setup logger format - # console = logging.StreamHandler() - # console.setLevel(logging.DEBUG) - # formatter = logging.Formatter("%(asctime)s [%(levelname)s] %(name)s %(lineno)d -- %(message)s", '%m-%d %H:%M:%S') - # console.setFormatter(formatter) - # logging.getLogger('').addHandler(console) from kgtk.exceptions import KGTKException try: import logging import os import time from time import strftime + import torch + import typing + import pandas as pd + import string + import math + import re + import argparse + import pickle + logging_level = kwargs.get("logging_level", "warning") if logging_level == "info": logging_level_class = logging.INFO @@ -753,7 +736,6 @@ def main(**kwargs): logging_level_class = logging.ERROR else: logging_level_class = logging.WARNING - if logging_level != "none": logger_path = os.path.join(os.environ.get("HOME"), "kgtk_text_embedding_log_{}.log".format(strftime("%Y-%m-%d-%H-%M"))) logging.basicConfig(level=logging_level_class, @@ -763,15 +745,6 @@ def main(**kwargs): filemode='w') _logger = logging.getLogger(__name__) _logger.warning("Running with logging level {}".format(_logger.getEffectiveLevel())) - import torch - import typing - - import pandas as pd - import string - import math - import re - import argparse - import pickle # get input parameters from kwargs output_uri = kwargs.get("output_uri", "") @@ -797,7 +770,6 @@ def main(**kwargs): for each_property, each_input in zip(all_required_properties, all_property_relate_inputs): for each in each_input: properties[each] = each_property - output_properties = { "metatada_properties": kwargs.get("metatada_properties", []), @@ -809,9 +781,9 @@ def main(**kwargs): if isinstance(input_uris, str): input_uris = [input_uris] if len(all_models_names) == 0: - raise ValueError("No embedding vector model name given!") + raise KGTKException("No embedding vector model name given!") if len(input_uris) == 0: - raise ValueError("No input file path given!") + raise KGTKException("No input file path given!") if output_uri == "": output_uri = os.getenv("HOME") # os.getcwd() @@ -925,7 +897,7 @@ def str2bool(v): # query server parser.add_argument("--query-server", nargs='?', action='store', default="", dest="query_server", - help="cache host address, default is https://query.wikidata.org/sparql" + help="sparql query endpoint used for test_format input files, default is https://query.wikidata.org/sparql" ) From a6572bd8b791365d24c20c8ff6b2edd1b6721f8c Mon Sep 17 00:00:00 2001 From: Rongpeng Date: Tue, 28 Apr 2020 13:43:26 -0700 Subject: [PATCH 015/278] a one-time solution for uri validation error rooted in rdflib --- kgtk/triple_generator.py | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/kgtk/triple_generator.py b/kgtk/triple_generator.py index ec328c16d..12158ee81 100644 --- a/kgtk/triple_generator.py +++ b/kgtk/triple_generator.py @@ -7,6 +7,7 @@ from 
etk.etk import ETK from etk.knowledge_graph import KGSchema from etk.wikidata import wiki_namespaces +import rfc3986 from etk.wikidata.value import ( Precision, Item, @@ -74,7 +75,7 @@ def set_properties(self, prop_file: str): "monolingualtext": MonolingualText, "string": StringValue, "external-identifier":ExternalIdentifier, - "url":URLValue + "url":StringValue } with open(prop_file, "r") as fp: props = fp.readlines() @@ -258,7 +259,10 @@ def generate_normal_triple( elif edge_type == ExternalIdentifier: object = ExternalIdentifier(node2) elif edge_type == URLValue: - object = URLValue(node2) + if TripleGenerator.is_valid_uri_with_scheme_and_host(node2): + object = URLValue(node2) + else: + return False else: # treat everything else as stringValue object = StringValue(node2) @@ -294,6 +298,18 @@ def is_invalid_decimal_string(num_string): return True return False + @staticmethod + def is_valid_uri_with_scheme_and_host(uri:str): + ''' + https://github.com/python-hyper/rfc3986/issues/30#issuecomment-461661883 + ''' + try: + uri = rfc3986.URIReference.from_string(uri) + rfc3986.validators.Validator().require_presence_of("scheme", "host").check_validity_of("scheme", "host").validate(uri) + return True + except : + return False + @staticmethod def clean_number_string(num): from numpy import format_float_positional From f0250c9bca554175382a52488d057f94c2f8164b Mon Sep 17 00:00:00 2001 From: Rongpeng Date: Tue, 28 Apr 2020 20:50:09 -0700 Subject: [PATCH 016/278] add language detection for label/description/string, etc --- kgtk/triple_generator.py | 32 ++++++++++++++++++++++++++++---- requirements.txt | 1 + 2 files changed, 29 insertions(+), 4 deletions(-) diff --git a/kgtk/triple_generator.py b/kgtk/triple_generator.py index 12158ee81..d958bf2c8 100644 --- a/kgtk/triple_generator.py +++ b/kgtk/triple_generator.py @@ -8,6 +8,7 @@ from etk.knowledge_graph import KGSchema from etk.wikidata import wiki_namespaces import rfc3986 +from langdetect import detect from etk.wikidata.value import ( Precision, Item, @@ -127,7 +128,7 @@ def serialize_prefix(self): Relevent issue: https://github.com/RDFLib/rdflib/issues/965 """ for k, v in wiki_namespaces.items(): - line = "@prefix " + k + " " + v + " .\n" + line = "@prefix " + k + ": <" + v + "> .\n" self.fp.write(line) self.fp.write("\n") self.fp.flush() @@ -145,16 +146,17 @@ def finalize(self): @staticmethod def process_text_string(string:str)->[str,str]: ''' + detect language ''' if "@" in string: res = string.split("@") text_string = "@".join(res[:-1]).replace('"', "").replace("'", "") lang = res[-1].replace('"','').replace("'","") if len(lang) != 2: - lang = "en" + lang = detect(text_string) else: text_string = string.replace('"', "").replace("'", "") - lang = "en" + lang = detect(text_string) return [text_string, lang] def generate_label_triple(self, node1: str, label: str, node2: str) -> bool: @@ -194,7 +196,18 @@ def generate_normal_triple( # https://www.wikidata.org/wiki/Help:Dates # ^2013-01-01T00:00:00Z/11 # ^8000000-00-00T00:00:00Z/3 - if re.compile("[0-9]{4}").match(node2): + if re.compile("[12]\d{3}-(0[1-9]|1[0-2])-(0[1-9]|[12]\d|3[01])").match(node2): + try: + dateTimeString = node2 + object = TimeValue( + value=dateTimeString, #TODO + calendar=Item("Q1985727"), + precision=Precision.year, + time_zone=0, + ) + except: + return False + elif re.compile("[12]\d{3}").match(node2): try: dateTimeString = node2 + "-01-01" object = TimeValue( @@ -205,6 +218,17 @@ def generate_normal_triple( ) except: return False + elif 
re.compile("[12]\d{3}-(0[1-9]|1[0-2])-(0[1-9]|[12]\d|3[01])").match(node2): + try: + dateTimeString = node2 + object = TimeValue( + value=dateTimeString, #TODO + calendar=Item("Q1985727"), + precision=Precision.year, + time_zone=0, + ) + except: + return False else: try: dateTimeString, precision = node2[1:].split("/") diff --git a/requirements.txt b/requirements.txt index 7cb0d1c7f..36e6ec5bf 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,6 +7,7 @@ sh sklearn SPARQLWrapper tqdm +langdetect rdflib==5.0.0 etk==2.2.1 simplejson From 137a9a35a7b14d7546143f8eb452919bc1d66a72 Mon Sep 17 00:00:00 2001 From: Rongpeng Date: Tue, 28 Apr 2020 21:20:58 -0700 Subject: [PATCH 017/278] fix empty string language tag issue --- kgtk/cli/generate_wikidata_triples.py | 5 +++-- kgtk/triple_generator.py | 12 +++++++++--- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/kgtk/cli/generate_wikidata_triples.py b/kgtk/cli/generate_wikidata_triples.py index bafa0cb99..4f0022000 100644 --- a/kgtk/cli/generate_wikidata_triples.py +++ b/kgtk/cli/generate_wikidata_triples.py @@ -127,7 +127,7 @@ def run( truthy=truthy ) # process stdin - num_line = 0 + num_line = 1 if use_gz: fp = gzip.open(sys.stdin.buffer, 'rt') else: @@ -136,10 +136,11 @@ def run( edge = fp.readline() if not edge: break - if edge.startswith("#") or num_line == 0: # TODO First line omit + if edge.startswith("#") or num_line == 1: # TODO First line omit num_line += 1 continue else: + print(num_line) generator.entry_point(num_line, edge) num_line += 1 generator.finalize() diff --git a/kgtk/triple_generator.py b/kgtk/triple_generator.py index d958bf2c8..087cca112 100644 --- a/kgtk/triple_generator.py +++ b/kgtk/triple_generator.py @@ -76,7 +76,7 @@ def set_properties(self, prop_file: str): "monolingualtext": MonolingualText, "string": StringValue, "external-identifier":ExternalIdentifier, - "url":StringValue + "url":URLValue } with open(prop_file, "r") as fp: props = fp.readlines() @@ -148,12 +148,18 @@ def process_text_string(string:str)->[str,str]: ''' detect language ''' + if len(string)==0: + return ["","en"] if "@" in string: res = string.split("@") text_string = "@".join(res[:-1]).replace('"', "").replace("'", "") lang = res[-1].replace('"','').replace("'","") - if len(lang) != 2: - lang = detect(text_string) + try: + detected_lang = detect(text_string) + if detected_lang != lang: + lang = detected_lang + except: + lang = "en" else: text_string = string.replace('"', "").replace("'", "") lang = detect(text_string) From 5ef1e199415eba144b1bbb4f8c255ef267ec5d62 Mon Sep 17 00:00:00 2001 From: Rongpeng Date: Wed, 29 Apr 2020 14:25:31 -0700 Subject: [PATCH 018/278] remove language detection in triple generator, pre-compile regex --- kgtk/cli/generate_wikidata_triples.py | 1 - kgtk/triple_generator.py | 39 +++++++++------------------ 2 files changed, 13 insertions(+), 27 deletions(-) diff --git a/kgtk/cli/generate_wikidata_triples.py b/kgtk/cli/generate_wikidata_triples.py index 4f0022000..4d57637b7 100644 --- a/kgtk/cli/generate_wikidata_triples.py +++ b/kgtk/cli/generate_wikidata_triples.py @@ -140,7 +140,6 @@ def run( num_line += 1 continue else: - print(num_line) generator.entry_point(num_line, edge) num_line += 1 generator.finalize() diff --git a/kgtk/triple_generator.py b/kgtk/triple_generator.py index 087cca112..e947d4e96 100644 --- a/kgtk/triple_generator.py +++ b/kgtk/triple_generator.py @@ -8,7 +8,6 @@ from etk.knowledge_graph import KGSchema from etk.wikidata import wiki_namespaces import rfc3986 -from langdetect import 
detect from etk.wikidata.value import ( Precision, Item, @@ -55,6 +54,9 @@ def __init__( self.truthy = truthy self.reset_etk_doc() self.serialize_prefix() + self.yyyy_mm_dd_pattern = re.compile("[12]\d{3}-(0[1-9]|1[0-2])-(0[1-9]|[12]\d|3[01])") + self.yyyy_pattern = re.compile("[12]\d{3}") + self.quantity_pattern = re.compile("([\+|\-]?[0-9]+\.?[0-9]*)(?:\[([\+|\-]?[0-9]+\.?[0-9]*),([\+|\-]?[0-9]+\.?[0-9]*)\])?([U|Q](?:[0-9]+))?") def _node_2_entity(self, node:str): ''' @@ -63,7 +65,7 @@ def _node_2_entity(self, node:str): if node in self.prop_types: entity = WDProperty(node, self.prop_types[node]) else: - entity = WDItem(TripleGenerator.replaceIllegalString(node.upper())) + entity = WDItem(TripleGenerator.replace_illegal_string(node.upper())) return entity @@ -146,7 +148,7 @@ def finalize(self): @staticmethod def process_text_string(string:str)->[str,str]: ''' - detect language + Language detection is removed from triple generation. The user is responsible for detect the language ''' if len(string)==0: return ["","en"] @@ -154,15 +156,11 @@ def process_text_string(string:str)->[str,str]: res = string.split("@") text_string = "@".join(res[:-1]).replace('"', "").replace("'", "") lang = res[-1].replace('"','').replace("'","") - try: - detected_lang = detect(text_string) - if detected_lang != lang: - lang = detected_lang - except: - lang = "en" + if len(lang) > 2: + lang ="en" else: text_string = string.replace('"', "").replace("'", "") - lang = detect(text_string) + lang = "en" return [text_string, lang] def generate_label_triple(self, node1: str, label: str, node2: str) -> bool: @@ -197,12 +195,12 @@ def generate_normal_triple( # determine the edge type edge_type = self.prop_types[label] if edge_type == Item: - object = WDItem(TripleGenerator.replaceIllegalString(node2.upper())) + object = WDItem(TripleGenerator.replace_illegal_string(node2.upper())) elif edge_type == TimeValue: # https://www.wikidata.org/wiki/Help:Dates # ^2013-01-01T00:00:00Z/11 # ^8000000-00-00T00:00:00Z/3 - if re.compile("[12]\d{3}-(0[1-9]|1[0-2])-(0[1-9]|[12]\d|3[01])").match(node2): + if self.yyyy_mm_dd_pattern.match(node2): try: dateTimeString = node2 object = TimeValue( @@ -213,7 +211,7 @@ def generate_normal_triple( ) except: return False - elif re.compile("[12]\d{3}").match(node2): + elif self.yyyy_pattern.match(node2): try: dateTimeString = node2 + "-01-01" object = TimeValue( @@ -224,17 +222,6 @@ def generate_normal_triple( ) except: return False - elif re.compile("[12]\d{3}-(0[1-9]|1[0-2])-(0[1-9]|[12]\d|3[01])").match(node2): - try: - dateTimeString = node2 - object = TimeValue( - value=dateTimeString, #TODO - calendar=Item("Q1985727"), - precision=Precision.year, - time_zone=0, - ) - except: - return False else: try: dateTimeString, precision = node2[1:].split("/") @@ -264,7 +251,7 @@ def generate_normal_triple( elif edge_type == QuantityValue: # +70[+60,+80]Q743895 - res = re.compile("([\+|\-]?[0-9]+\.?[0-9]*)(?:\[([\+|\-]?[0-9]+\.?[0-9]*),([\+|\-]?[0-9]+\.?[0-9]*)\])?([U|Q](?:[0-9]+))?").match(node2).groups() + res = self.quantity_pattern.match(node2).groups() amount, lower_bound, upper_bound, unit = res # Handle extra small numbers for now. 
TODO @@ -420,5 +407,5 @@ def entry_point(self, line_number:int , edge: str): @staticmethod - def replaceIllegalString(s:str)->str: + def replace_illegal_string(s:str)->str: return s.replace(":","-") \ No newline at end of file From f4bb3925b872f7e6c0694aad9656d6d414ba8550 Mon Sep 17 00:00:00 2001 From: greatyyx Date: Wed, 29 Apr 2020 14:48:35 -0700 Subject: [PATCH 019/278] add test case example for dummy --- kgtk/tests/__init__.py | 0 kgtk/tests/test_cli_dummy.py | 26 ++++++++++++++++++++++++++ 2 files changed, 26 insertions(+) create mode 100644 kgtk/tests/__init__.py create mode 100644 kgtk/tests/test_cli_dummy.py diff --git a/kgtk/tests/__init__.py b/kgtk/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/kgtk/tests/test_cli_dummy.py b/kgtk/tests/test_cli_dummy.py new file mode 100644 index 000000000..bd92fd6a6 --- /dev/null +++ b/kgtk/tests/test_cli_dummy.py @@ -0,0 +1,26 @@ +import unittest +from kgtk.cli_entry import cli_entry +from kgtk.cli.dummy import run +from kgtk.exceptions import KGTKException + + +class TestDummy(unittest.TestCase): + + def test_module(self): + # test separate module files + pass + + def test_run(self): + # test run function + # exceptions here are not trapped by KGTKExceptionHandler + with self.assertRaises(KGTKException): + run(name='kgtk', info=None, error=True, _debug=False) + + def test_cli(self): + # test command from cli entry + assert cli_entry('kgtk', 'dummy', 'normal_test') == 0 + assert cli_entry('kgtk', 'dummy', 'test_exception', '-e') != 0 + + +if __name__ == '__main__': + unittest.main() From 5c56ba30d84d8171bdbeccd57c3c44430eb8d642 Mon Sep 17 00:00:00 2001 From: greatyyx Date: Wed, 29 Apr 2020 15:00:10 -0700 Subject: [PATCH 020/278] rename test dummy --- kgtk/tests/{test_cli_dummy.py => test_dummy.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename kgtk/tests/{test_cli_dummy.py => test_dummy.py} (100%) diff --git a/kgtk/tests/test_cli_dummy.py b/kgtk/tests/test_dummy.py similarity index 100% rename from kgtk/tests/test_cli_dummy.py rename to kgtk/tests/test_dummy.py From 5ec943c7c057425bbc6149978cea858e65d88051 Mon Sep 17 00:00:00 2001 From: Rongpeng Date: Wed, 29 Apr 2020 16:02:58 -0700 Subject: [PATCH 021/278] include more bad chars that Blazegraph doesn't accept in entity name; removed the upper() processing. It is the user's duty to verify that P000_author and P000_AUTHOR are two different properties. Processing here may cause trouble if the difference is intended. 
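
A minimal standalone sketch of the sanitization this patch applies, assuming
only that entity names arrive as plain strings; BAD_CHARS mirrors the list
added in the diff below, and the sample names are hypothetical:

    # Replace each character that Blazegraph rejects in entity names with "_".
    BAD_CHARS = [":", "-", "&", ",", " ",
                 "(", ")", "'", '"', "/", "\\", "[", "]", ";"]

    def replace_illegal_string(s: str) -> str:
        # Case is deliberately preserved, so P000_author and P000_AUTHOR
        # remain two different names.
        for char in BAD_CHARS:
            s = s.replace(char, "_")
        return s

    assert replace_illegal_string("Q42:draft (v2)") == "Q42_draft__v2_"
    assert replace_illegal_string("P000_author") == "P000_author"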
--- kgtk/triple_generator.py | 21 ++++++++++++++------- requirements.txt | 1 - 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/kgtk/triple_generator.py b/kgtk/triple_generator.py index e947d4e96..ae5316abe 100644 --- a/kgtk/triple_generator.py +++ b/kgtk/triple_generator.py @@ -20,7 +20,8 @@ URLValue ) - +BAD_CHARS = [":", "-", "&", ",", " ", + "(", ")", "\'", '\"', "/", "\\", "[", "]", ";"] class TripleGenerator: """ A class to maintain the status of the generator @@ -57,6 +58,7 @@ def __init__( self.yyyy_mm_dd_pattern = re.compile("[12]\d{3}-(0[1-9]|1[0-2])-(0[1-9]|[12]\d|3[01])") self.yyyy_pattern = re.compile("[12]\d{3}") self.quantity_pattern = re.compile("([\+|\-]?[0-9]+\.?[0-9]*)(?:\[([\+|\-]?[0-9]+\.?[0-9]*),([\+|\-]?[0-9]+\.?[0-9]*)\])?([U|Q](?:[0-9]+))?") + def _node_2_entity(self, node:str): ''' @@ -65,7 +67,7 @@ def _node_2_entity(self, node:str): if node in self.prop_types: entity = WDProperty(node, self.prop_types[node]) else: - entity = WDItem(TripleGenerator.replace_illegal_string(node.upper())) + entity = WDItem(TripleGenerator.replace_illegal_string(node())) return entity @@ -195,7 +197,7 @@ def generate_normal_triple( # determine the edge type edge_type = self.prop_types[label] if edge_type == Item: - object = WDItem(TripleGenerator.replace_illegal_string(node2.upper())) + object = WDItem(TripleGenerator.replace_illegal_string(node2())) elif edge_type == TimeValue: # https://www.wikidata.org/wiki/Help:Dates # ^2013-01-01T00:00:00Z/11 @@ -288,7 +290,7 @@ def generate_normal_triple( # create qualifier edge on previous STATEMENT and return the updated STATEMENT if type(object) == WDItem: self.doc.kg.add_subject(object) - self.to_append_statement.add_qualifier(label.upper(), object) + self.to_append_statement.add_qualifier(label(), object) self.doc.kg.add_subject(self.to_append_statement) #TODO maybe can be positioned better for the edge cases. else: @@ -297,9 +299,9 @@ def generate_normal_triple( if type(object) == WDItem: self.doc.kg.add_subject(object) if self.truthy: - self.to_append_statement = entity.add_truthy_statement(label.upper(), object) + self.to_append_statement = entity.add_truthy_statement(label(), object) else: - self.to_append_statement = entity.add_statement(label.upper(), object) + self.to_append_statement = entity.add_statement(label(), object) self.doc.kg.add_subject(entity) return True @@ -408,4 +410,9 @@ def entry_point(self, line_number:int , edge: str): @staticmethod def replace_illegal_string(s:str)->str: - return s.replace(":","-") \ No newline at end of file + ''' + this function serves as the last gate of keeping illegal characters outside of entity creation. 
+        '''
+        for char in BAD_CHARS:
+            s = s.replace(char,"_")
+        return s
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 36e6ec5bf..7cb0d1c7f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,7 +7,6 @@ sh
 sklearn
 SPARQLWrapper
 tqdm
-langdetect
 rdflib==5.0.0
 etk==2.2.1
 simplejson

From 3616bf4658a19181cab6dc204d244da6c73b2e3f Mon Sep 17 00:00:00 2001
From: Rongpeng
Date: Wed, 29 Apr 2020 16:08:59 -0700
Subject: [PATCH 022/278] fix bug caused by removing upper

---
 kgtk/triple_generator.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/kgtk/triple_generator.py b/kgtk/triple_generator.py
index ae5316abe..57a32907b 100644
--- a/kgtk/triple_generator.py
+++ b/kgtk/triple_generator.py
@@ -67,7 +67,7 @@ def _node_2_entity(self, node:str):
         if node in self.prop_types:
             entity = WDProperty(node, self.prop_types[node])
         else:
-            entity = WDItem(TripleGenerator.replace_illegal_string(node()))
+            entity = WDItem(TripleGenerator.replace_illegal_string(node))
 
 
@@ -197,7 +197,7 @@ def generate_normal_triple(
         # determine the edge type
         edge_type = self.prop_types[label]
         if edge_type == Item:
-            object = WDItem(TripleGenerator.replace_illegal_string(node2()))
+            object = WDItem(TripleGenerator.replace_illegal_string(node2))
         elif edge_type == TimeValue:
             # https://www.wikidata.org/wiki/Help:Dates
             # ^2013-01-01T00:00:00Z/11
@@ -290,7 +290,7 @@ def generate_normal_triple(
         # create qualifier edge on previous STATEMENT and return the updated STATEMENT
         if type(object) == WDItem:
             self.doc.kg.add_subject(object)
-            self.to_append_statement.add_qualifier(label(), object)
+            self.to_append_statement.add_qualifier(label, object)
             self.doc.kg.add_subject(self.to_append_statement) #TODO maybe can be positioned better for the edge cases.
 
         else:
@@ -299,9 +299,9 @@ def generate_normal_triple(
         if type(object) == WDItem:
             self.doc.kg.add_subject(object)
         if self.truthy:
-            self.to_append_statement = entity.add_truthy_statement(label(), object)
+            self.to_append_statement = entity.add_truthy_statement(label, object)
         else:
-            self.to_append_statement = entity.add_statement(label(), object)
+            self.to_append_statement = entity.add_statement(label, object)
         self.doc.kg.add_subject(entity)
 
         return True

From d414f54efc5e03a85516d6e8a315c59509f2d175 Mon Sep 17 00:00:00 2001
From: Craig Milo Rogers
Date: Wed, 29 Apr 2020 16:55:43 -0700
Subject: [PATCH 023/278] Add the iso 639 library.

---
 requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/requirements.txt b/requirements.txt
index 439daf31e..42c81a774 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -13,3 +13,4 @@ etk==2.2.1
 simplejson
 pyrallel.lib
 attrs
+iso-639

From 5a89adbd0c2a41355b4876b5c75d859883987356 Mon Sep 17 00:00:00 2001
From: Craig Milo Rogers
Date: Thu, 30 Apr 2020 12:51:26 -0700
Subject: [PATCH 024/278] Add KGTK File data types. Add boolean values. Mark
 certain header complaints as warnings. Add a KGTK data type validator.
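
As a quick orientation, a hedged usage sketch of the validator this patch
introduces, assuming the kgtk/join/kgtkvalue.py layout shown in the diff
below; the sample values are chosen to exercise several KGTK data types:

    from kgtk.join.kgtkvalue import KgtkValue

    # Each value is classified by describe() and checked by is_valid().
    for v in ['"hello"', "'maison'@fr", "@043.26193/010.92708", "10.4e10"]:
        kv = KgtkValue(v)
        print("%s: %s (valid: %s)" % (v, kv.describe(), kv.is_valid()))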
---
 kgtk/join/kgtkformat.py |  16 +-
 kgtk/join/kgtkvalue.py  | 372 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 385 insertions(+), 3 deletions(-)
 create mode 100644 kgtk/join/kgtkvalue.py

diff --git a/kgtk/join/kgtkformat.py b/kgtk/join/kgtkformat.py
index 760589d82..69b28dd6e 100644
--- a/kgtk/join/kgtkformat.py
+++ b/kgtk/join/kgtkformat.py
@@ -3,6 +3,7 @@
 
 """
 
+from enum import Enum
 import sys
 import typing
 
@@ -21,6 +22,15 @@ class KgtkFormat:
     # There is only one required column in a node file:
     ID_COLUMN_NAMES: typing.List[str] = ["id", "ID"]
 
+    class DataTypes(Enum):
+        NUMBER = 0
+        STRING = 1
+        STRUCTURED_LITERAL = 2
+        SYMBOL = 3
+
+    TRUE_SYMBOL: str = "True"
+    FALSE_SYMBOL: str = "False"
+
     @classmethod
     def _yelp(cls,
               msg: str,
@@ -96,11 +106,11 @@ def check_column_name(cls,
             if ''.join(column_name.split()) != column_name.strip():
                 results.append("Column name '%s' contains internal white space" % column_name)
         if "," in column_name:
-            results.append("Column name '%s' contains a comma (,)" % column_name)
+            results.append("Warning: Column name '%s' contains a comma (,)" % column_name)
         if "|" in column_name:
-            results.append("Column name '%s' contains a vertical bar (|)" % column_name)
+            results.append("Warning: Column name '%s' contains a vertical bar (|)" % column_name)
         if ";" in column_name:
-            results.append("Column name '%s' contains a semicolon (;)" % column_name)
+            results.append("Warning: Column name '%s' contains a semicolon (;)" % column_name)
 
         return results
 
diff --git a/kgtk/join/kgtkvalue.py b/kgtk/join/kgtkvalue.py
new file mode 100644
index 000000000..27435b6e7
--- /dev/null
+++ b/kgtk/join/kgtkvalue.py
@@ -0,0 +1,372 @@
+"""
+Constants and helpers for the KGTK file format.
+
+"""
+
+from argparse import ArgumentParser
+import attr
+from iso639 import languages # type: ignore
+import re
+import sys
+import typing
+
+from kgtk.join.kgtkformat import KgtkFormat
+
+@attr.s(slots=True, frozen=False)
+class KgtkValue(KgtkFormat):
+    value: str = attr.ib(validator=attr.validators.instance_of(str))
+
+
+    split_list_re: typing.Pattern = re.compile(r"(?<!\\)\|")
+
+    values: typing.Optional[typing.List[str]] = attr.ib(default=None)
+
+    def get_list(self)->typing.List[str]:
+        if self.values is None:
+            self.values = KgtkValue.split_list_re.split(self.value)
+        return self.values
+
+    def get_item(self, idx: typing.Optional[int])-> str:
+        if idx is None:
+            return self.value
+        else:
+            return self.get_list()[idx]
+
+    def is_list(self)->bool:
+        return len(self.get_list()) > 1
+
+    def get_values(self)->typing.List['KgtkValue']:
+        """
+        Convert the value into a list of KgtkValues.
+        """
+        if not self.is_list():
+            return [ self ]
+        else:
+            result: typing.List['KgtkValue'] = [ ]
+            v: str
+            for v in self.get_list():
+                result.append(KgtkValue(v))
+            return result
+
+    def is_empty(self, idx: typing.Optional[int] = None)->bool:
+        """
+        Return False if this value is a list and idx is None.
+        Otherwise, return True if the value is empty.
+        """
+        if self.is_list() and idx is None:
+            return False
+
+        v: str = self.get_item(idx)
+        return len(v) == 0
+
+    def is_number(self, idx: typing.Optional[int] = None)->bool:
+        """
+        Return False if this value is a list and idx is None.
+        Otherwise, return True if the first character is 0-9,_,-,. .
+        """
+        if self.is_list() and idx is None:
+            return False
+
+        v: str = self.get_item(idx)
+        return v.startswith(("0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "+", "-", "."))
+
+    def is_valid_number(self, idx: typing.Optional[int] = None)->bool:
+        """
+        Return False if this value is a list and idx is None.
+        Otherwise, return True if the first character is 0-9,_,-,.
+        and Python can parse it.
+        """
+        if self.is_list() and idx is None:
+            return False
+
+        v: str = self.get_item(idx)
+        if not v.startswith(("0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "+", "-", ".")):
+            return False
+        try:
+            i: int = int(v, 0) # The 0 allows prefixes: 0b, 0o, and 0x.
+            return True
+        except ValueError:
+            try:
+                f: float = float(v)
+                return True
+            except ValueError:
+                return False
+
+
+    def is_string(self, idx: typing.Optional[int] = None)->bool:
+        """
+        Return False if this value is a list and idx is None.
+        Otherwise, return True if the first character is '"'.
+        """
+        if self.is_list() and idx is None:
+            return False
+
+        v: str = self.get_item(idx)
+        return v.startswith('"')
+
+    string_re: typing.Pattern = re.compile(r'^"(?:[^"]|\\.)*"$')
+
+    def is_valid_string(self, idx: typing.Optional[int] = None)->bool:
+        """
+        Return False if this value is a list and idx is None.
+        Otherwise, return True if the first character is '"',
+        the last character is '"', and the only internal '"' is
+        escaped by backslash.
+        """
+        if self.is_list() and idx is None:
+            return False
+
+        v: str = self.get_item(idx)
+        if not v.startswith('"'):
+            return False
+        m: typing.Optional[typing.Match] = KgtkValue.string_re.match(v)
+        return m is not None
+
+    def is_structured_literal(self, idx: typing.Optional[int] = None)->bool:
+        """
+        Return False if this value is a list and idx is None.
+        Otherwise, return True if the first character is ^@'!.
+        """
+        if self.is_list() and idx is None:
+            return False
+
+        v: str = self.get_item(idx)
+        return v.startswith(("^", "@", "'", "!"))
+
+    def is_symbol(self, idx: typing.Optional[int] = None)->bool:
+        """
+        Return False if this value is a list and idx is None.
+        Otherwise, return True if not a number, string, nor structured literal.
+        """
+        if self.is_list() and idx is None:
+            return False
+
+        return not (self.is_number(idx) or self.is_string(idx) or self.is_structured_literal(idx))
+
+    def is_boolean(self, idx: typing.Optional[int] = None)->bool:
+        """
+        Return False if this value is a list and idx is None.
+        Otherwise, return True if the value matches one of the special boolean symbols..
+        """
+        if self.is_list() and idx is None:
+            return False
+
+        v: str = self.get_item(idx)
+        return v == KgtkFormat.TRUE_SYMBOL or v == KgtkFormat.FALSE_SYMBOL
+
+
+    def is_language_qualified_string(self, idx: typing.Optional[int] = None)->bool:
+        """
+        Return False if this value is a list and idx is None.
+        Otherwise, return True if the first character is '
+        """
+        if self.is_list() and idx is None:
+            return False
+
+        v: str = self.get_item(idx)
+        return v.startswith("'")
+
+    language_qualified_string_re: typing.Pattern = re.compile(r"^(?P<string>'(?:[^']|\\.)*')@(?P<lang>[a-zA-Z][a-zA-Z])$")
+
+    def is_valid_language_qualified_string(self, idx: typing.Optional[int] = None)->bool:
+        """
+        Return False if this value is a list and idx is None.
+        Otherwise, return True if the value looks like a language-qualified string.
+        """
+        if self.is_list() and idx is None:
+            return False
+
+        v: str = self.get_item(idx)
+        m: typing.Optional[typing.Match] = KgtkValue.language_qualified_string_re.match(v)
+        if m is None:
+            return False
+
+        # Validate the language code:
+        lang: str = m.group("lang")
+        # print("lang: %s" % lang)
+        try:
+            languages.get(alpha2=lang.lower())
+            return True
+        except KeyError:
+            return False
+
+    def is_location_coordinates(self, idx: typing.Optional[int] = None)->bool:
+        """
+        Return False if this value is a list and idx is None.
+        Otherwise, return True if the first character is @
+        """
+        if self.is_list() and idx is None:
+            return False
+
+        v: str = self.get_item(idx)
+        return v.startswith("@")
+
+    location_coordinates_re: typing.Pattern = re.compile(r"^@[-+]?\d{3}\.\d{5}/[-+]?\d{3}\.\d{5}$")
+
+    def is_valid_location_coordinates(self, idx: typing.Optional[int] = None)->bool:
+        """
+        Return False if this value is a list and idx is None.
+        Otherwise, return True if the value looks like valid location coordinates.
+        """
+        if self.is_list() and idx is None:
+            return False
+
+        v: str = self.get_item(idx)
+        m: typing.Optional[typing.Match] = KgtkValue.location_coordinates_re.match(v)
+        return m is not None
+
+    def is_date_and_times(self, idx: typing.Optional[int] = None)->bool:
+        """
+        Return False if this value is a list and idx is None.
+        Otherwise, return True if the first character is ^
+        """
+        if self.is_list() and idx is None:
+            return False
+
+        v: str = self.get_item(idx)
+        return v.startswith("^")
+
+    date_and_times_re: typing.Pattern = re.compile(r"^\^(?P<year>[0-9]{4})(?P<hyphen>-)?(?P<month>1[0-2]|0[1-9])(?(hyphen)-)(?P<day>3[01]|0[1-9]|[12][0-9])T(?P<hour>2[0-3]|[01][0-9])(?(hyphen):)(?P<minutes>[0-5][0-9])(?(hyphen):)(?P<seconds>[0-5][0-9])(?P<zone>Z|\+[0-9][0-9](?::[0-9][0-9])?)?(?P<precision>/[0-9])?$")
+
+    def is_valid_date_and_times(self, idx: typing.Optional[int] = None)->bool:
+        """
+        Return False if this value is a list and idx is None.
+        Otherwise, return True if the value looks like valid date and times
+        literal based on ISO-8601.
+        """
+        if self.is_list() and idx is None:
+            return False
+
+        v: str = self.get_item(idx)
+        m: typing.Optional[typing.Match] = KgtkValue.date_and_times_re.match(v)
+        return m is not None
+
+    def is_extension(self, idx: typing.Optional[int] = None)->bool:
+        """
+        Return False if this value is a list and idx is None.
+        Otherwise, return True if the first character is !
+        """
+        if self.is_list() and idx is None:
+            return False
+
+        v: str = self.get_item(idx)
+        return v.startswith("!")
+
+
+    def is_valid_literal(self, idx: typing.Optional[int] = None)->bool:
+        """
+        Return False if this value is a list and idx is None.
+        Otherwise, return True if the value looks like a valid literal.
+        """
+        if self.is_list() and idx is None:
+            return False
+
+        if self.is_string(idx):
+            return self.is_valid_string(idx)
+        elif self.is_number(idx):
+            return self.is_valid_number(idx)
+        elif self.is_structured_literal(idx):
+            if self.is_language_qualified_string(idx):
+                return self.is_valid_language_qualified_string(idx)
+            elif self.is_location_coordinates(idx):
+                return self.is_valid_location_coordinates(idx)
+            elif self.is_date_and_times(idx):
+                return self.is_valid_date_and_times(idx)
+            elif self.is_extension(idx):
+                return False # no validation presently available.
+            else:
+                return False # Quantities will reach here at present.
+        else:
+            return False
+
+    def is_valid_item(self, idx: typing.Optional[int] = None)->bool:
+        if self.is_list() and idx is None:
+            return False
+
+        if self.is_empty(idx):
+            return True
+        elif self.is_valid_literal(idx):
+            return True
+        else:
+            return self.is_symbol(idx) # Should always be True
+
+    def is_valid(self)->bool:
+        """
+        Is this a valid KGTK cell value? If the value is a list, are all the
+        components valid?
+        """
+        result: bool = True
+        kv: KgtkValue
+        for kv in self.get_values():
+            result = result and kv.is_valid_item()
+        return result
+
+    def describe(self, idx: typing.Optional[int] = None)->str:
+        """
+        Return False if this value is a list and idx is None.
+        Otherwise, return a string that descrubes the value.
+        """
+        if self.is_list() and idx is None:
+            result: str = ""
+            kv: KgtkValue
+            first: bool = True
+            for kv in self.get_values():
+                if first:
+                    first = not first
+                else:
+                    result += KgtkFormat.LIST_SEPARATOR
+                result += kv.describe()
+            return result
+
+        if self.is_empty(idx):
+            return "Empty"
+        elif self.is_string(idx):
+            if self.is_valid_string(idx):
+                return "String"
+            else:
+                return "Invalid String"
+        elif self.is_number(idx):
+            if self.is_valid_number(idx):
+                return "Number"
+            else:
+                return "Invalid Number"
+        elif self.is_structured_literal(idx):
+            if self.is_language_qualified_string(idx):
+                if self.is_valid_language_qualified_string(idx):
+                    return "Language Qualified String"
+                else:
+                    return "Invalid Language Qualified String"
+            elif self.is_location_coordinates(idx):
+                if self.is_valid_location_coordinates(idx):
+                    return "Location Coordinates"
+                else:
+                    return "Invalid Location Coordinates"
+            elif self.is_date_and_times(idx):
+                if self.is_valid_date_and_times(idx):
+                    return "Date and Times"
+                else:
+                    return "Invalid Date and Times"
+            elif self.is_extension(idx):
+                return "Extension (unvalidated)"
+            else:
+                return "Invalid Structured Literal"
+        else:
+            return "Symbol"
+
+def main():
+    """
+    Test the KGTK value parser.
+    """
+    parser = ArgumentParser()
+    parser.add_argument(dest="values", help="The value(s) to test", type=str, nargs="+")
+    parser.add_argument("-v", "--verbose", dest="verbose", help="Print additional progress messages.", action='store_true')
+    parser.add_argument( "--very-verbose", dest="very_verbose", help="Print additional progress messages.", action='store_true')
+    args = parser.parse_args()
+
+    value: str
+    for value in args.values:
+        print("%s: %s" % (value, KgtkValue(value).describe()))
+
+if __name__ == "__main__":
+    main()

From b2574dbd463ec5ed4b8a954be6993c37c5bf2b53 Mon Sep 17 00:00:00 2001
From: Craig Milo Rogers
Date: Thu, 30 Apr 2020 12:52:56 -0700
Subject: [PATCH 025/278] Fix a comment.

---
 kgtk/join/kgtkvalue.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/kgtk/join/kgtkvalue.py b/kgtk/join/kgtkvalue.py
index 27435b6e7..34023a4ba 100644
--- a/kgtk/join/kgtkvalue.py
+++ b/kgtk/join/kgtkvalue.py
@@ -304,8 +304,7 @@ def describe(self, idx: typing.Optional[int] = None)->str:
 
     def describe(self, idx: typing.Optional[int] = None)->str:
         """
-        Return False if this value is a list and idx is None.
-        Otherwise, return a string that descrubes the value.
+        Return a string that describes the value.
         """
         if self.is_list() and idx is None:
             result: str = ""

From 7e6e21b85e93ce9c6636a0efbcfc9d60134c1030 Mon Sep 17 00:00:00 2001
From: Craig Milo Rogers
Date: Thu, 30 Apr 2020 13:15:13 -0700
Subject: [PATCH 026/278] Document the number and string formats. Validate the
 coordinates better.

---
 kgtk/join/kgtkvalue.py | 59 ++++++++++++++++++++++++++++++++++++++----
 1 file changed, 54 insertions(+), 5 deletions(-)

diff --git a/kgtk/join/kgtkvalue.py b/kgtk/join/kgtkvalue.py
index 34023a4ba..c9369efd6 100644
--- a/kgtk/join/kgtkvalue.py
+++ b/kgtk/join/kgtkvalue.py
@@ -1,5 +1,7 @@
 """
-Constants and helpers for the KGTK file format.
+Validate KGTK File data types.
+
+Dimensioned quantities are not supported.
 
 """
 
@@ -76,6 +78,20 @@ def is_valid_number(self, idx: typing.Optional[int] = None)->bool:
         Return False if this value is a list and idx is None.
         Otherwise, return True if the first character is 0-9,_,-,.
         and Python can parse it.
+
+        Examples:
+        1
+        123
+        -123
+        +123
+        0b101
+        0o277
+        0x24F
+        .4
+        0.4
+        10.
+        10.4
+        10.4e10
         """
         if self.is_list() and idx is None:
             return False
 
@@ -98,6 +114,11 @@ def is_string(self, idx: typing.Optional[int] = None)->bool:
         """
         Return False if this value is a list and idx is None.
         Otherwise, return True if the first character is '"'.
+
+        Strings begin and end with double quote ("). Any internal double
+        quotes must be escaped with backslash (\"). Triple-double quoted
+        strings are not supported by KGTK File Format v2.
+
         """
         if self.is_list() and idx is None:
             return False
@@ -111,8 +132,8 @@ def is_valid_string(self, idx: typing.Optional[int] = None)->bool:
         """
         Return False if this value is a list and idx is None.
         Otherwise, return True if the first character is '"',
-        the last character is '"', and the only internal '"' is
-        escaped by backslash.
+        the last character is '"', and any internal '"' characters are
+        escaped by backslashes.
         """
         if self.is_list() and idx is None:
             return False
@@ -202,19 +223,45 @@ def is_location_coordinates(self, idx: typing.Optional[int] = None)->bool:
         v: str = self.get_item(idx)
         return v.startswith("@")
 
-    location_coordinates_re: typing.Pattern = re.compile(r"^@[-+]?\d{3}\.\d{5}/[-+]?\d{3}\.\d{5}$")
+    location_coordinates_re: typing.Pattern = re.compile(r"^@(?P<lat>[-+]?\d{3}\.\d{5})/(?P<lon>[-+]?\d{3}\.\d{5})$")
 
     def is_valid_location_coordinates(self, idx: typing.Optional[int] = None)->bool:
         """
         Return False if this value is a list and idx is None.
         Otherwise, return True if the value looks like valid location coordinates.
+
+        Note: The coordinates must look exactly like the examples in KGTK
+        File Format v2, except for optional +/- characters.
+
+        @043.26193/010.92708
         """
         if self.is_list() and idx is None:
             return False
 
         v: str = self.get_item(idx)
         m: typing.Optional[typing.Match] = KgtkValue.location_coordinates_re.match(v)
-        return m is not None
+        if m is None:
+            return False
+
+        # Latitude runs from -90 to +90
+        latstr: str = m.group("lat")
+        try:
+            lat: float = float(latstr)
+            if lat < -90. or lat > 90.:
+                return False
+        except ValueError:
+            return False
+
+        # Longitude runs from -180 to +180
+        lonstr: str = m.group("lon")
+        try:
+            lon: float = float(lonstr)
+            if lon < -180. or lon > 180.:
+                return False
+        except ValueError:
+            return False
+
+        return True
 
     def is_date_and_times(self, idx: typing.Optional[int] = None)->bool:
         """
@@ -234,6 +281,8 @@ def is_valid_date_and_times(self, idx: typing.Optional[int] = None)->bool:
         Return False if this value is a list and idx is None.
         Otherwise, return True if the value looks like valid date and times
         literal based on ISO-8601.
+
+        TODO: validate the calendar date, eg fail if 31-Apr-2020.
""" if self.is_list() and idx is None: return False From 30341662c195184700b0acefc606cad8616cd56c Mon Sep 17 00:00:00 2001 From: Rongpeng Date: Thu, 30 Apr 2020 13:34:09 -0700 Subject: [PATCH 027/278] added line-by-line option and set up a quick test by running script directly --- kgtk/cli/generate_wikidata_triples.py | 72 +++++++++++++++++++++------ 1 file changed, 56 insertions(+), 16 deletions(-) diff --git a/kgtk/cli/generate_wikidata_triples.py b/kgtk/cli/generate_wikidata_triples.py index 4d57637b7..f3eefd02a 100644 --- a/kgtk/cli/generate_wikidata_triples.py +++ b/kgtk/cli/generate_wikidata_triples.py @@ -97,10 +97,14 @@ def add_arguments(parser): help="if set to yes, read from compressed gz file", dest="use_gz", ) - # logging level - # parser.add_argument('-l', '--logging-level', action='store', dest='logging_level', - # default="info", choices=("error", "warning", "info", "debug"), - # help="set up the logging level, default is INFO level") + parser.add_argument( + "-lbl", + "--line-by-line", + action="store", + type=str2bool, + help="if set to yes, read from standard input line by line, otherwise loads whole file into memory", + dest="line_by_line", + ) def run( @@ -111,7 +115,8 @@ def run( n: int, truthy: bool, ignore: bool, - use_gz: bool + use_gz: bool, + line_by_line: bool, ): # import modules locally import gzip @@ -127,19 +132,54 @@ def run( truthy=truthy ) # process stdin - num_line = 1 if use_gz: fp = gzip.open(sys.stdin.buffer, 'rt') else: fp = sys.stdin - while True: - edge = fp.readline() - if not edge: - break - if edge.startswith("#") or num_line == 1: # TODO First line omit - num_line += 1 - continue - else: - generator.entry_point(num_line, edge) - num_line += 1 + if line_by_line: + print("#line-by-line") + num_line = 1 + while True: + edge = fp.readline() + if not edge: + break + if edge.startswith("#") or num_line == 1: # TODO First line omit + num_line += 1 + continue + else: + generator.entry_point(num_line, edge) + num_line += 1 + else: + # not line by line + print("#not line-by-line") + for num, edge in enumerate(fp.readlines()): + if edge.startswith("#") or num == 0: + continue + else: + generator.entry_point(num+1,edge) generator.finalize() + +# testing profiling locally with direct call + +if __name__ == "__main__": + import gzip + from kgtk.triple_generator import TripleGenerator + import sys + with open("/tmp/gwt.log","w") as dest_fp: + generator = TripleGenerator( + prop_file="/Users/rongpeng/Documents/ISI/Covid19/covid_data/v1.3/heng_props.tsv", + label_set="label", + alias_set="aliases", + description_set="descriptions", + n=10000, + ignore=True, + truthy=True, + dest_fp = dest_fp + ) + with open("/Users/rongpeng/Documents/ISI/Covid19/covid_data/v1.3/kgtk_sample_sorted.tsv","r") as fp: + for num, edge in enumerate(fp.readlines()): + if edge.startswith("#") or num == 0: + continue + else: + generator.entry_point(num+1,edge) + generator.finalize() \ No newline at end of file From 6f10ba6d3952a5bd3d8a65ca87d493b0d9e517fd Mon Sep 17 00:00:00 2001 From: Rongpeng Date: Thu, 30 Apr 2020 13:35:56 -0700 Subject: [PATCH 028/278] added comments about visualization --- kgtk/cli/generate_wikidata_triples.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kgtk/cli/generate_wikidata_triples.py b/kgtk/cli/generate_wikidata_triples.py index f3eefd02a..c87495434 100644 --- a/kgtk/cli/generate_wikidata_triples.py +++ b/kgtk/cli/generate_wikidata_triples.py @@ -160,7 +160,9 @@ def run( generator.finalize() # testing profiling locally with direct call 
- +# pip3 install snakeviz +# run `snakeviz /tmp/tmp.dat` to visualize the call stacks. +# python3 -m cProfile -o /tmp/tmp.dat generate_wikidata_triples.py if __name__ == "__main__": import gzip from kgtk.triple_generator import TripleGenerator From 0995ea077451843af924b4bbb9fb04c480b33401 Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Thu, 30 Apr 2020 13:55:48 -0700 Subject: [PATCH 029/278] Add a check for invalid KGTK values. --- kgtk/cli/validate.py | 12 ++++++++++++ kgtk/join/edgereader.py | 7 +++++-- kgtk/join/kgtkformat.py | 5 ++++- kgtk/join/kgtkreader.py | 35 +++++++++++++++++++++++++++++++++-- kgtk/join/nodereader.py | 7 +++++-- 5 files changed, 59 insertions(+), 7 deletions(-) diff --git a/kgtk/cli/validate.py b/kgtk/cli/validate.py index ae480b223..13c91484f 100644 --- a/kgtk/cli/validate.py +++ b/kgtk/cli/validate.py @@ -90,6 +90,10 @@ def add_arguments(parser): parser.add_argument( "--header-only", dest="header_only", help="Process the only the header of the input file.", action="store_true") + parser.add_argument( "--invalid-value-action", dest="invalid_value_action", + help="The action to take when a data cell value is invalid.", + type=ValidationAction, action=EnumNameAction, default=ValidationAction.REPORT) + parser.add_argument( "--long-line-action", dest="long_line_action", help="The action to take when a long line is detected.", type=ValidationAction, action=EnumNameAction, default=ValidationAction.COMPLAIN) @@ -106,6 +110,10 @@ def add_arguments(parser): parser.add_argument( "--truncate-long-lines", dest="truncate_long_lines", help="Remove excess trailing columns in long lines.", action='store_true') + parser.add_argument( "--unsafe-column-name-action", dest="unsafe_column_name_action", + help="The action to take when a column name is unsafe.", + type=ValidationAction, action=EnumNameAction, default=ValidationAction.REPORT) + parser.add_argument("-v", "--verbose", dest="verbose", help="Print additional progress messages.", action='store_true') parser.add_argument( "--very-verbose", dest="very_verbose", help="Print additional progress messages.", action='store_true') @@ -132,7 +140,9 @@ def run(kgtk_files: typing.Optional[typing.List[typing.Optional[Path]]], blank_id_line_action: typing.Optional[ValidationAction] = None, short_line_action: ValidationAction = ValidationAction.COMPLAIN, long_line_action: ValidationAction = ValidationAction.COMPLAIN, + invalid_value_action: ValidationAction = ValidationAction.REPORT, header_error_action: ValidationAction = ValidationAction.EXIT, + unsafe_column_name_action: ValidationAction = ValidationAction.REPORT, compression_type: typing.Optional[str] = None, gzip_in_parallel: bool = False, gzip_queue_size: int = KgtkReader.GZIP_QUEUE_SIZE_DEFAULT, @@ -177,7 +187,9 @@ def run(kgtk_files: typing.Optional[typing.List[typing.Optional[Path]]], blank_id_line_action=blank_id_line_action, short_line_action=short_line_action, long_line_action=long_line_action, + invalid_value_action=invalid_value_action, header_error_action=header_error_action, + unsafe_column_name_action=unsafe_column_name_action, compression_type=compression_type, gzip_in_parallel=gzip_in_parallel, gzip_queue_size=gzip_queue_size, diff --git a/kgtk/join/edgereader.py b/kgtk/join/edgereader.py index 77d654e62..b5ff87ff5 100644 --- a/kgtk/join/edgereader.py +++ b/kgtk/join/edgereader.py @@ -34,6 +34,7 @@ def open_edge_file(cls, blank_node2_line_action: ValidationAction = ValidationAction.EXCLUDE, short_line_action: ValidationAction = ValidationAction.EXCLUDE, 
long_line_action: ValidationAction = ValidationAction.EXCLUDE, + invalid_value_action: ValidationAction = ValidationAction.REPORT, header_error_action: ValidationAction = ValidationAction.EXIT, unsafe_column_name_action: ValidationAction = ValidationAction.REPORT, compression_type: typing.Optional[str] = None, @@ -104,6 +105,7 @@ def open_edge_file(cls, blank_node2_line_action=blank_node2_line_action, short_line_action=short_line_action, long_line_action=long_line_action, + invalid_value_action=invalid_value_action, header_error_action=header_error_action, unsafe_column_name_action=unsafe_column_name_action, compression_type=compression_type, @@ -115,7 +117,7 @@ def open_edge_file(cls, very_verbose=very_verbose, ) - def _ignore_if_blank_fields(self, values: typing.List[str], line: str): + def _ignore_if_blank_fields(self, values: typing.List[str], line: str)->bool: # Ignore line_action with blank node1 fields. This code comes after # filling missing trailing columns, although it could be reworked # to come first. @@ -131,7 +133,7 @@ def _ignore_if_blank_fields(self, values: typing.List[str], line: str): return self.exclude_line(self.blank_node2_line_action, "node2 is blank", line) return False # Do not ignore this line - def _skip_reserved_fields(self, column_name): + def _skip_reserved_fields(self, column_name)->bool: if self.node1_column_idx >= 0 and column_name in self.NODE1_COLUMN_NAMES: return True if self.node2_column_idx >= 0 and column_name in self.NODE2_COLUMN_NAMES: @@ -176,6 +178,7 @@ def main(): blank_node2_line_action=args.blank_node2_line_action, short_line_action=args.short_line_action, long_line_action=args.long_line_action, + invalid_value_action=args.invalid_value_action, header_error_action=args.header_error_action, unsafe_column_name_action=args.unsafe_column_name_action, compression_type=args.compression_type, diff --git a/kgtk/join/kgtkformat.py b/kgtk/join/kgtkformat.py index 69b28dd6e..9ab4612fd 100644 --- a/kgtk/join/kgtkformat.py +++ b/kgtk/join/kgtkformat.py @@ -8,6 +8,7 @@ import typing from kgtk.join.validationaction import ValidationAction +from kgtk.join.kgtkvalue import KgtkValue class KgtkFormat: COLUMN_SEPARATOR: str = "\t" @@ -111,8 +112,10 @@ def check_column_name(cls, results.append("Warning: Column name '%s' contains a vertical bar (|)" % column_name) if ";" in column_name: results.append("Warning: Column name '%s' contains a semicolon (;)" % column_name) + kv: KgtkValue = KgtkValue(column_name) + if not kv.is_valid(): + results.append(kv.describe()) return results - @classmethod def check_column_names(cls, diff --git a/kgtk/join/kgtkreader.py b/kgtk/join/kgtkreader.py index a040891cd..710703ded 100644 --- a/kgtk/join/kgtkreader.py +++ b/kgtk/join/kgtkreader.py @@ -20,6 +20,7 @@ from kgtk.join.enumnameaction import EnumNameAction from kgtk.join.gzipprocess import GunzipProcess from kgtk.join.kgtkformat import KgtkFormat +from kgtk.join.kgtkvalue import KgtkValue from kgtk.join.validationaction import ValidationAction @attr.s(slots=True, frozen=False) @@ -79,6 +80,9 @@ class KgtkReader(KgtkFormat, ClosableIter[typing.List[str]]): header_error_action: ValidationAction = attr.ib(validator=attr.validators.instance_of(ValidationAction), default=ValidationAction.EXIT) unsafe_column_name_action: ValidationAction = attr.ib(validator=attr.validators.instance_of(ValidationAction), default=ValidationAction.REPORT) + # Validate data cell values? 
+    invalid_value_action: ValidationAction = attr.ib(validator=attr.validators.instance_of(ValidationAction), default=ValidationAction.REPORT)
+
     # Repair records with too many or too few fields?
     fill_short_lines: bool = attr.ib(validator=attr.validators.instance_of(bool), default=False)
     truncate_long_lines: bool = attr.ib(validator=attr.validators.instance_of(bool), default=False)
@@ -122,6 +126,7 @@ def open(cls,
              blank_id_line_action: typing.Optional[ValidationAction] = None,
              short_line_action: ValidationAction = ValidationAction.EXCLUDE,
              long_line_action: ValidationAction = ValidationAction.EXCLUDE,
+             invalid_value_action: ValidationAction = ValidationAction.REPORT,
              header_error_action: ValidationAction = ValidationAction.EXIT,
              unsafe_column_name_action: ValidationAction = ValidationAction.REPORT,
              compression_type: typing.Optional[str] = None,
@@ -238,6 +243,7 @@ def open(cls,
                        blank_id_line_action=blank_id_line_action,
                        short_line_action=short_line_action,
                        long_line_action=long_line_action,
+                       invalid_value_action=invalid_value_action,
                        header_error_action=header_error_action,
                        unsafe_column_name_action=unsafe_column_name_action,
                        compression_type=compression_type,
@@ -291,6 +297,7 @@ def open(cls,
                        blank_id_line_action=blank_id_line_action,
                        short_line_action=short_line_action,
                        long_line_action=long_line_action,
+                       invalid_value_action=invalid_value_action,
                        header_error_action=header_error_action,
                        unsafe_column_name_action=unsafe_column_name_action,
                        compression_type=compression_type,
@@ -330,6 +337,7 @@ def open(cls,
                        blank_id_line_action=blank_id_line_action,
                        short_line_action=short_line_action,
                        long_line_action=long_line_action,
+                       invalid_value_action=invalid_value_action,
                        header_error_action=header_error_action,
                        unsafe_column_name_action=unsafe_column_name_action,
                        compression_type=compression_type,
@@ -549,6 +557,10 @@ def __next__(self)-> typing.List[str]:
             if self._ignore_if_blank_fields(values, line):
                 continue
 
+            if self.invalid_value_action != ValidationAction.PASS:
+                if self._ignore_invalid_values(values, line):
+                    continue
+
             self.data_lines_passed += 1
             if self.very_verbose:
                 sys.stdout.write(".")
@@ -556,12 +568,26 @@ def __next__(self)-> typing.List[str]:
 
             return values
 
+    def _ignore_invalid_values(self, values: typing.List[str], line: str)->bool:
+        value: str
+        idx: int
+        problems: typing.List[str] = [ ]
+        for idx, value in enumerate(values):
+            kv: KgtkValue = KgtkValue(value)
+            if not kv.is_valid():
+                problems.append("%s: %s" % (self.column_names[idx], kv.describe()))
+        if len(problems) > 0 and self.exclude_line(self.invalid_value_action,
+                                                   "; ".join(problems),
+                                                   line):
+            return True
+        return False
+
     # May be overridden
-    def _ignore_if_blank_fields(self, values: typing.List[str], line: str):
+    def _ignore_if_blank_fields(self, values: typing.List[str], line: str)->bool:
         return False
 
     # May be overridden
-    def _skip_reserved_fields(self, column_name):
+    def _skip_reserved_fields(self, column_name)->bool:
         return False
 
     def additional_column_names(self)->typing.List[str]:
@@ -642,6 +668,10 @@ def add_shared_arguments(cls, parser: ArgumentParser):
                             help="The action to take when a header error is detected Only ERROR or EXIT are supported.",
                             type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXIT)
 
+        parser.add_argument( "--invalid-value-action", dest="invalid_value_action",
+                             help="The action to take when a data cell value is invalid.",
+                             type=ValidationAction, action=EnumNameAction, default=ValidationAction.REPORT)
+
        parser.add_argument( "--long-line-action", dest="long_line_action",
help="The action to take when a long line is detected.", type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXCLUDE) @@ -709,6 +739,7 @@ def main(): blank_id_line_action=args.blank_id_line_action, short_line_action=args.short_line_action, long_line_action=args.long_line_action, + invalid_value_action=args.invalid_value_action, header_error_action=args.header_error_action, unsafe_column_name_action=args.unsafe_column_name_action, compression_type=args.compression_type, diff --git a/kgtk/join/nodereader.py b/kgtk/join/nodereader.py index 327150fb1..a6fdcae07 100644 --- a/kgtk/join/nodereader.py +++ b/kgtk/join/nodereader.py @@ -33,6 +33,7 @@ def open_node_file(cls, blank_id_line_action: ValidationAction = ValidationAction.EXCLUDE, short_line_action: ValidationAction = ValidationAction.EXCLUDE, long_line_action: ValidationAction = ValidationAction.EXCLUDE, + invalid_value_action: ValidationAction = ValidationAction.REPORT, header_error_action: ValidationAction = ValidationAction.EXIT, unsafe_column_name_action: ValidationAction = ValidationAction.REPORT, compression_type: typing.Optional[str] = None, @@ -95,6 +96,7 @@ def open_node_file(cls, blank_id_line_action=blank_id_line_action, short_line_action=short_line_action, long_line_action=long_line_action, + invalid_value_action=invalid_value_action, header_error_action=header_error_action, unsafe_column_name_action=unsafe_column_name_action, compression_type=compression_type, @@ -106,7 +108,7 @@ def open_node_file(cls, very_verbose=very_verbose, ) - def _ignore_if_blank_fields(self, values: typing.List[str], line: str): + def _ignore_if_blank_fields(self, values: typing.List[str], line: str)->bool: # Ignore line_action with blank id fields. This code comes after # filling missing trailing columns, although it could be reworked # to come first. @@ -116,7 +118,7 @@ def _ignore_if_blank_fields(self, values: typing.List[str], line: str): return self.exclude_line(self.blank_id_line_action, "id is blank", line) return False # Do not ignore this line - def _skip_reserved_fields(self, column_name): + def _skip_reserved_fields(self, column_name)->bool: if self.id_column_idx >= 0 and column_name in self.ID_COLUMN_NAMES: return True return False @@ -153,6 +155,7 @@ def main(): blank_id_line_action=args.blank_id_line_action, short_line_action=args.short_line_action, long_line_action=args.long_line_action, + invalid_value_action=args.invalid_value_action, header_error_action=args.header_error_action, unsafe_column_name_action=args.unsafe_column_name_action, compression_type=args.compression_type, From 4570aa94b4c45b28ed79666bf85ff6c23b80cba5 Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Thu, 30 Apr 2020 14:02:55 -0700 Subject: [PATCH 030/278] Split kgtkformat.py into kgtkformat.py and kgtkbase.py to avoid a circular import. --- kgtk/join/kgtkbase.py | 204 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 204 insertions(+) create mode 100644 kgtk/join/kgtkbase.py diff --git a/kgtk/join/kgtkbase.py b/kgtk/join/kgtkbase.py new file mode 100644 index 000000000..9aaab4e11 --- /dev/null +++ b/kgtk/join/kgtkbase.py @@ -0,0 +1,204 @@ +""" +Constants and helpers for the KGTK file format. 
+
+"""
+
+from enum import Enum
+import sys
+import typing
+
+from kgtk.join.validationaction import ValidationAction
+from kgtk.join.kgtkformat import KgtkFormat
+from kgtk.join.kgtkvalue import KgtkValue
+
+class KgtkBase(KgtkFormat):
+    @classmethod
+    def _yelp(cls,
+              msg: str,
+              header_line: str,
+              error_action: ValidationAction,
+              error_file: typing.TextIO = sys.stderr)->bool:
+        """
+        Take a validation action. Only ERROR is special, all other values are treated as EXIT.
+        """
+        result: bool
+        if error_action == ValidationAction.ERROR:
+            # Immediately raise an exception.
+            raise ValueError("In input header '%s': %s" % (header_line, msg))
+
+        if (error_action in [ValidationAction.REPORT, ValidationAction.COMPLAIN, ValidationAction.EXIT ]):
+            print("In input header '%s': %s" % (header_line, msg), file=error_file)
+        if error_action == ValidationAction.EXIT:
+            sys.exit(1)
+        return error_action in [ValidationAction.PASS, ValidationAction.REPORT]
+
+    @classmethod
+    def get_column_idx(cls,
+                       name_or_aliases: typing.List[str],
+                       column_name_map: typing.Mapping[str, int],
+                       header_line: str,
+                       error_action: ValidationAction,
+                       error_file: typing.TextIO = sys.stderr,
+                       is_optional: bool = False,
+    )->int:
+        """
+        Get the indices of the required column using one of its allowable names.
+        Return -1 if the column is not found and is optional.
+        """
+        found_column_name: str = ""
+        column_idx: int = -1
+        col_name: str
+        for col_name in name_or_aliases:
+            if col_name in column_name_map:
+                if column_idx >= 0:
+                    cls._yelp("Ambiguous required column names %s and %s" % (found_column_name, col_name),
+                              header_line=header_line, error_action=error_action, error_file=error_file)
+                column_idx = column_name_map[col_name]
+                found_column_name = col_name
+        if column_idx < 0 and not is_optional:
+            # TODO: throw a better exception:
+            cls._yelp("Missing required column: %s" % " | ".join(name_or_aliases),
+                      header_line=header_line, error_action=error_action, error_file=error_file)
+        return column_idx
+
+    @classmethod
+    def check_column_name(cls,
+                          column_name: str,
+                          header_line: str,
+                          error_action: ValidationAction,
+                          error_file: typing.TextIO = sys.stderr)->typing.List[str]:
+        # Returns a list of complaints.
+        # Check for valid column names.
+        # 1) Check for leading white space
+        # 2) Check for trailing white space
+        # 3) Check for internal white space
+        # 1) except inside "" and '' quoted strings
+        # 4) Check for commas
+        # 5) Check for vertical bars
+        # 6) Check for semicolons
+        #
+        # TODO: It might be possible to make some of these checks more efficient.
+        results: typing.List[str] = [ ]
+        if column_name.lstrip() != column_name:
+            results.append("Column name '%s' starts with leading white space" % column_name)
+        if column_name.rstrip() != column_name:
+            results.append("Column name '%s' ends with trailing white space" % column_name)
+        if not (column_name.startswith('"') or column_name.startswith("'")):
+            if ''.join(column_name.split()) != column_name.strip():
+                results.append("Column name '%s' contains internal white space" % column_name)
+        if "," in column_name:
+            results.append("Warning: Column name '%s' contains a comma (,)" % column_name)
+        if "|" in column_name:
+            results.append("Warning: Column name '%s' contains a vertical bar (|)" % column_name)
+        if ";" in column_name:
+            results.append("Warning: Column name '%s' contains a semicolon (;)" % column_name)
+        kv: KgtkValue = KgtkValue(column_name)
+        if not kv.is_valid():
+            results.append(kv.describe())
+        return results
+
+    @classmethod
+    def check_column_names(cls,
+                           column_names: typing.List[str],
+                           header_line: str,
+                           error_action: ValidationAction,
+                           error_file: typing.TextIO = sys.stderr)->bool:
+        """
+        Returns True if the column names are OK.
+        """
+        complaints: typing.List[str] = [ ]
+        column_name: str
+        for column_name in column_names:
+            gripes: typing.List[str] = cls.check_column_name(column_name, header_line, error_action, error_file)
+            complaints.extend(gripes)
+        if len(complaints) == 0:
+            return True
+        # take the error action, joining the complaints into a single message.
+        msg = ", ".join(complaints)
+        cls._yelp(msg, header_line=header_line, error_action=error_action, error_file=error_file)
+        return False
+
+    @classmethod
+    def build_column_name_map(cls,
+                              column_names: typing.List[str],
+                              header_line: str,
+                              error_action: ValidationAction,
+                              error_file: typing.TextIO = sys.stderr
+    )->typing.Mapping[str, int]:
+        # Validate the column names and build a map from column name
+        # to column index.
+ column_name_map: typing.MutableMapping[str, int] = { } + column_idx: int = 0 # There may be a more pythonic way to do this + column_name: str + for column_name in column_names: + if column_name is None or len(column_name) == 0: + cls._yelp("Column %d has an invalid name in the file header" % column_idx, + header_line=header_line, error_action=error_action, error_file=error_file) + + # Ensure that columns names are not duplicated: + if column_name in column_name_map: + cls._yelp("Column %d (%s) is a duplicate of column %d" % (column_idx, column_name, column_name_map[column_name]), + header_line=header_line, error_action=error_action, error_file=error_file) + + column_name_map[column_name] = column_idx + column_idx += 1 + return column_name_map + + @classmethod + def required_edge_columns(cls, + column_name_map: typing.Mapping[str, int], + header_line: str, + error_action: ValidationAction, + error_file: typing.TextIO = sys.stderr + )->typing.Tuple[int, int, int]: + # Ensure that the three required columns are present: + node1_column_idx: int = cls.get_column_idx(cls.NODE1_COLUMN_NAMES, column_name_map, + header_line=header_line, error_action=error_action, error_file=error_file) + + node2_column_idx: int = cls.get_column_idx(cls.NODE2_COLUMN_NAMES, column_name_map, + header_line=header_line, error_action=error_action, error_file=error_file) + + label_column_idx: int = cls.get_column_idx(cls.LABEL_COLUMN_NAMES, column_name_map, + header_line=header_line, error_action=error_action, error_file=error_file) + + return (node1_column_idx, node2_column_idx, label_column_idx) + + @classmethod + def required_node_column(cls, + column_name_map: typing.Mapping[str, int], + header_line: str, + error_action: ValidationAction, + error_file: typing.TextIO = sys.stderr + )->int: + # Ensure that the required column is present: + return cls.get_column_idx(cls.ID_COLUMN_NAMES, column_name_map, + header_line=header_line, error_action=error_action, error_file=error_file) + + @classmethod + def additional_edge_columns(cls, column_names: typing.List[str])->typing.List[str]: + """ + Return a list of column names in this file excluding the required columns. + """ + additional_columns: typing.List[str] = [ ] + column_name: str + for column_name in column_names: + if column_name not in KgtkFormat.NODE1_COLUMN_NAMES and \ + column_name not in KgtkFormat.NODE2_COLUMN_NAMES and \ + column_name not in KgtkFormat.LABEL_COLUMN_NAMES: + additional_columns.append(column_name) + return additional_columns + + @classmethod + def additional_node_columns(cls, + column_names: typing.List[str], + )->typing.List[str]: + """ + Return a list of column names in this file excluding the required columns. + """ + additional_columns: typing.List[str] = [ ] + column_name: str + for column_name in column_names: + if column_name not in KgtkFormat.ID_COLUMN_NAMES: + additional_columns.append(column_name) + return additional_columns + From 74b2b0b545e3f535867786a712bb937af64ce976 Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Thu, 30 Apr 2020 14:06:56 -0700 Subject: [PATCH 031/278] Add missing column index. 
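
A hedged sketch of how the KgtkBase header helpers introduced by this split
fit together; the module paths and ValidationAction.EXIT come from the diffs
above, while the sample header line is illustrative:

    from kgtk.join.kgtkbase import KgtkBase
    from kgtk.join.validationaction import ValidationAction

    header = "node1\tlabel\tnode2"
    columns = header.split("\t")
    # Map each column name to its index; duplicates trigger the error action.
    name_map = KgtkBase.build_column_name_map(columns, header_line=header,
                                              error_action=ValidationAction.EXIT)
    # Locate the three required edge-file columns.
    node1_idx, node2_idx, label_idx = KgtkBase.required_edge_columns(
        name_map, header_line=header, error_action=ValidationAction.EXIT)
    print(node1_idx, node2_idx, label_idx)  # 0 2 1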
--- kgtk/join/kgtkformat.py | 194 ---------------------------------------- kgtk/join/kgtkreader.py | 8 +- kgtk/join/kgtkwriter.py | 3 +- 3 files changed, 7 insertions(+), 198 deletions(-) diff --git a/kgtk/join/kgtkformat.py b/kgtk/join/kgtkformat.py index 9ab4612fd..6c392fe82 100644 --- a/kgtk/join/kgtkformat.py +++ b/kgtk/join/kgtkformat.py @@ -7,9 +7,6 @@ import sys import typing -from kgtk.join.validationaction import ValidationAction -from kgtk.join.kgtkvalue import KgtkValue - class KgtkFormat: COLUMN_SEPARATOR: str = "\t" COMMENT_INDICATOR: str = "#" @@ -31,194 +28,3 @@ class DataTypes(Enum): TRUE_SYMBOL: str = "True" FALSE_SYMBOL: str = "False" - - @classmethod - def _yelp(cls, - msg: str, - header_line: str, - error_action: ValidationAction, - error_file: typing.TextIO = sys.stderr)->bool: - """ - Take a validation action. Only ERROR is special, all other values are treated as EXIT. - """ - result: bool - if error_action == ValidationAction.ERROR: - # Immediately raise an exception. - raise ValueError("In input header'%s': %s" % (header_line, msg)) - - if (error_action in [ValidationAction.REPORT, ValidationAction.COMPLAIN, ValidationAction.EXIT ]): - print("In input header '%s': %s" % (header_line, msg), file=error_file) - if error_action == ValidationAction.EXIT: - sys.exit(1) - return error_action in [ValidationAction.PASS, ValidationAction.REPORT] - - @classmethod - def get_column_idx(cls, - name_or_aliases: typing.List[str], - column_name_map: typing.Mapping[str, int], - header_line: str, - error_action: ValidationAction, - error_file: typing.TextIO = sys.stderr, - is_optional: bool = False, - )->int: - """ - Get the indices of the required column using one of its allowable names. - Return -1 if the column is not found and is optional. - """ - found_column_name: str = "" - column_idx: int = -1 - col_name: str - for col_name in name_or_aliases: - if col_name in column_name_map: - if column_idx >= 0: - cls._yelp("Ambiguous required column names %s and %s" % (found_column_name, col_name), - header_line=header_line, error_action=error_action, error_file=error_file) - column_idx = column_name_map[col_name] - found_column_name = col_name - if column_idx < 0 and not is_optional: - # TODO: throw a better exception: - cls._yelp("Missing required column: %s" % " | ".join(name_or_aliases), - header_line=header_line, error_action=error_action, error_file=error_file) - return column_idx - - @classmethod - def check_column_name(cls, - column_name: str, - header_line: str, - error_action: ValidationAction, - error_file: typing.TextIO = sys.stderr)->typing.List[str]: - # Returns a list of complaints. - # Check for valid column names. - # 1) Check for leading white space - # 2) Check for trailing white space - # 3) Check for internal white space - # 1) except inside "" and '' quoted strings - # 4) Check for commas - # 5) Check for vertical bars - # 6) Check for semicolons - # - # TODO: It might be possible to make some of these checks more efficient. 
- results: typing.List[str] = [ ] - if column_name.lstrip() != column_name: - results.append("Column name '%s' starts with leading white space" % column_name) - if column_name.rstrip() != column_name: - results.append("Column name '%s' ends with leading white space" % column_name) - if not (column_name.startswith('"') or column_name.startswith("'")): - if ''.join(column_name.split()) != column_name.strip(): - results.append("Column name '%s' contains internal white space" % column_name) - if "," in column_name: - results.append("Warning: Column name '%s' contains a comma (,)" % column_name) - if "|" in column_name: - results.append("Warning: Column name '%s' contains a vertical bar (|)" % column_name) - if ";" in column_name: - results.append("Warning: Column name '%s' contains a semicolon (;)" % column_name) - kv: KgtkValue = KgtkValue(column_name) - if not kv.is_valid(): - results.append(kv.describe()) - return results - - @classmethod - def check_column_names(cls, - column_names: typing.List[str], - header_line: str, - error_action: ValidationAction, - error_file: typing.TextIO = sys.stderr)->bool: - """ - Returns True if the column names are OK. - """ - complaints: typing.List[str] = [ ] - column_name: str - for column_name in column_names: - gripes: typing.List[str] = cls.check_column_name(column_name, header_line, error_action, error_file) - complaints.extend(gripes) - if len(complaints) == 0: - return True - # take the error action, joining the complaints into a single message. - msg = ", ".join(complaints) - cls._yelp(msg, header_line=header_line, error_action=error_action, error_file=error_file) - return False - - @classmethod - def build_column_name_map(cls, - column_names: typing.List[str], - header_line: str, - error_action: ValidationAction, - error_file: typing.TextIO = sys.stderr - )->typing.Mapping[str, int]: - # Validate the column names and build a map from column name - # to column index. 
- column_name_map: typing.MutableMapping[str, int] = { } - column_idx: int = 0 # There may be a more pythonic way to do this - column_name: str - for column_name in column_names: - if column_name is None or len(column_name) == 0: - cls._yelp("Column %d has an invalid name in the file header" % column_idx, - header_line=header_line, error_action=error_action, error_file=error_file) - - # Ensure that columns names are not duplicated: - if column_name in column_name_map: - cls._yelp("Column %d (%s) is a duplicate of column %d" % (column_idx, column_name, column_name_map[column_name]), - header_line=header_line, error_action=error_action, error_file=error_file) - - column_name_map[column_name] = column_idx - column_idx += 1 - return column_name_map - - @classmethod - def required_edge_columns(cls, - column_name_map: typing.Mapping[str, int], - header_line: str, - error_action: ValidationAction, - error_file: typing.TextIO = sys.stderr - )->typing.Tuple[int, int, int]: - # Ensure that the three required columns are present: - node1_column_idx: int = cls.get_column_idx(cls.NODE1_COLUMN_NAMES, column_name_map, - header_line=header_line, error_action=error_action, error_file=error_file) - - node2_column_idx: int = cls.get_column_idx(cls.NODE2_COLUMN_NAMES, column_name_map, - header_line=header_line, error_action=error_action, error_file=error_file) - - label_column_idx: int = cls.get_column_idx(cls.LABEL_COLUMN_NAMES, column_name_map, - header_line=header_line, error_action=error_action, error_file=error_file) - - return (node1_column_idx, node2_column_idx, label_column_idx) - - @classmethod - def required_node_column(cls, - column_name_map: typing.Mapping[str, int], - header_line: str, - error_action: ValidationAction, - error_file: typing.TextIO = sys.stderr - )->int: - # Ensure that the required column is present: - return cls.get_column_idx(cls.ID_COLUMN_NAMES, column_name_map, - header_line=header_line, error_action=error_action, error_file=error_file) - - @classmethod - def additional_edge_columns(cls, column_names: typing.List[str])->typing.List[str]: - """ - Return a list of column names in this file excluding the required columns. - """ - additional_columns: typing.List[str] = [ ] - column_name: str - for column_name in column_names: - if column_name not in KgtkFormat.NODE1_COLUMN_NAMES and \ - column_name not in KgtkFormat.NODE2_COLUMN_NAMES and \ - column_name not in KgtkFormat.LABEL_COLUMN_NAMES: - additional_columns.append(column_name) - return additional_columns - - @classmethod - def additional_node_columns(cls, - column_names: typing.List[str], - )->typing.List[str]: - """ - Return a list of column names in this file excluding the required columns. 
- """ - additional_columns: typing.List[str] = [ ] - column_name: str - for column_name in column_names: - if column_name not in KgtkFormat.ID_COLUMN_NAMES: - additional_columns.append(column_name) - return additional_columns - diff --git a/kgtk/join/kgtkreader.py b/kgtk/join/kgtkreader.py index 710703ded..0ee76ef03 100644 --- a/kgtk/join/kgtkreader.py +++ b/kgtk/join/kgtkreader.py @@ -19,12 +19,13 @@ from kgtk.join.closableiter import ClosableIter, ClosableIterTextIOWrapper from kgtk.join.enumnameaction import EnumNameAction from kgtk.join.gzipprocess import GunzipProcess +from kgtk.join.kgtkbase import KgtkBase from kgtk.join.kgtkformat import KgtkFormat from kgtk.join.kgtkvalue import KgtkValue from kgtk.join.validationaction import ValidationAction @attr.s(slots=True, frozen=False) -class KgtkReader(KgtkFormat, ClosableIter[typing.List[str]]): +class KgtkReader(KgtkBase, ClosableIter[typing.List[str]]): ERROR_LIMIT_DEFAULT: int = 1000 GZIP_QUEUE_SIZE_DEFAULT: int = GunzipProcess.GZIP_QUEUE_SIZE_DEFAULT @@ -576,6 +577,7 @@ def _ignore_invalid_values(self, values: typing.List[str], line: str)->bool: kv: KgtkValue = KgtkValue(value) if not kv.is_valid(): problems.append("%s: %s" % (self.column_names[idx], kv.describe())) + idx += 1 if len(problems) > 0 and self.exclude_line(self.invalid_value_action, "; ".join(problems), line): @@ -592,9 +594,9 @@ def _skip_reserved_fields(self, column_name)->bool: def additional_column_names(self)->typing.List[str]: if self.is_edge_file: - return KgtkFormat.additional_edge_columns(self.column_names) + return KgtkBase.additional_edge_columns(self.column_names) elif self.is_node_file: - return KgtkFormat.additional_node_columns(self.column_names) + return KgtkBase.additional_node_columns(self.column_names) else: # TODO: throw a better exception. raise ValueError("KgtkReader: Unknown Kgtk file type.") diff --git a/kgtk/join/kgtkwriter.py b/kgtk/join/kgtkwriter.py index 20c55ff4b..2aac2f7e1 100644 --- a/kgtk/join/kgtkwriter.py +++ b/kgtk/join/kgtkwriter.py @@ -18,11 +18,12 @@ from kgtk.join.kgtkreader import KgtkReader from kgtk.join.enumnameaction import EnumNameAction from kgtk.join.gzipprocess import GzipProcess +from kgtk.join.kgtkbase import KgtkBase from kgtk.join.kgtkformat import KgtkFormat from kgtk.join.validationaction import ValidationAction @attr.s(slots=True, frozen=False) -class KgtkWriter(KgtkFormat): +class KgtkWriter(KgtkBase): GZIP_QUEUE_SIZE_DEFAULT: int = GzipProcess.GZIP_QUEUE_SIZE_DEFAULT file_path: typing.Optional[Path] = attr.ib(validator=attr.validators.optional(attr.validators.instance_of(Path))) From c8c5602933429e25289b5b0e8b13042bdff9e9ad Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Thu, 30 Apr 2020 14:20:37 -0700 Subject: [PATCH 032/278] Better routing of feedback and error messages. 
--- kgtk/cli/validate.py | 10 +++++----- kgtk/join/edgereader.py | 2 ++ kgtk/join/kgtkbase.py | 2 +- kgtk/join/kgtkreader.py | 42 ++++++++++++++++++++++------------------- kgtk/join/nodereader.py | 2 ++ 5 files changed, 33 insertions(+), 25 deletions(-) diff --git a/kgtk/cli/validate.py b/kgtk/cli/validate.py index 13c91484f..9d97293e9 100644 --- a/kgtk/cli/validate.py +++ b/kgtk/cli/validate.py @@ -165,11 +165,11 @@ def run(kgtk_files: typing.Optional[typing.List[typing.Optional[Path]]], kgtk_file: typing.Optional[Path] for kgtk_file in kgtk_files: if verbose: - print("\n====================================================") + print("\n====================================================", flush=True) if kgtk_file is not None: - print("Validating '%s'" % str(kgtk_file), file=error_file) + print("Validating '%s'" % str(kgtk_file), file=error_file, flush=True) else: - print ("Validating from stdin", file=error_file) + print ("Validating from stdin", file=error_file, flush=True) kr: KgtkReader = KgtkReader.open(kgtk_file, force_column_names=force_column_names, @@ -200,14 +200,14 @@ def run(kgtk_files: typing.Optional[typing.List[typing.Optional[Path]]], if header_only: kr.close() if verbose: - print("Validated the header only.", file=error_file) + print("Validated the header only.", file=error_file, flush=True) else: line_count: int = 0 row: typing.List[str] for row in kr: line_count += 1 if verbose: - print("Validated %d data lines" % line_count, file=error_file) + print("Validated %d data lines" % line_count, file=error_file, flush=True) return 0 except SystemExit as e: diff --git a/kgtk/join/edgereader.py b/kgtk/join/edgereader.py index b5ff87ff5..b15c464dc 100644 --- a/kgtk/join/edgereader.py +++ b/kgtk/join/edgereader.py @@ -48,6 +48,7 @@ def open_edge_file(cls, compression_type=compression_type, gzip_in_parallel=gzip_in_parallel, gzip_queue_size=gzip_queue_size, + error_file=error_file, verbose=verbose) # Read the edge file header and split it into column names. @@ -57,6 +58,7 @@ def open_edge_file(cls, force_column_names=force_column_names, skip_first_record=skip_first_record, column_separator=column_separator, + error_file=error_file, verbose=verbose) # Check for unsafe column names. diff --git a/kgtk/join/kgtkbase.py b/kgtk/join/kgtkbase.py index 9aaab4e11..7f6ed4afd 100644 --- a/kgtk/join/kgtkbase.py +++ b/kgtk/join/kgtkbase.py @@ -27,7 +27,7 @@ def _yelp(cls, raise ValueError("In input header'%s': %s" % (header_line, msg)) if (error_action in [ValidationAction.REPORT, ValidationAction.COMPLAIN, ValidationAction.EXIT ]): - print("In input header '%s': %s" % (header_line, msg), file=error_file) + print("In input header '%s': %s" % (header_line, msg), file=error_file, flush=True) if error_action == ValidationAction.EXIT: sys.exit(1) return error_action in [ValidationAction.PASS, ValidationAction.REPORT] diff --git a/kgtk/join/kgtkreader.py b/kgtk/join/kgtkreader.py index 0ee76ef03..54dfed634 100644 --- a/kgtk/join/kgtkreader.py +++ b/kgtk/join/kgtkreader.py @@ -144,6 +144,7 @@ def open(cls, compression_type=compression_type, gzip_in_parallel=gzip_in_parallel, gzip_queue_size=gzip_queue_size, + error_file=error_file, verbose=verbose) # Read the kgtk file header and split it into column names. We get the @@ -154,6 +155,7 @@ def open(cls, force_column_names=force_column_names, skip_first_record=skip_first_record, column_separator=column_separator, + error_file=error_file, verbose=verbose) # Check for unsafe column names. 
cls.check_column_names(column_names, @@ -182,12 +184,12 @@ def open(cls, is_edge_file = True is_node_file = False if verbose: - print("%s column found, this is a KGTK edge file" % column_names[node1_idx], file=error_file) + print("%s column found, this is a KGTK edge file" % column_names[node1_idx], file=error_file, flush=True) else: is_edge_file = False is_node_file = True if verbose: - print("node1 column not found, assuming this is a KGTK node file", file=error_file) + print("node1 column not found, assuming this is a KGTK node file", file=error_file, flush=True) elif mode is KgtkReader.Mode.EDGE: is_edge_file = True @@ -211,7 +213,7 @@ def open(cls, error_file=error_file) if verbose: - print("KgtkReader: Reading an edge file. node1=%d label=%d node2=%d" % (node1_column_idx, label_column_idx, node2_column_idx), file=error_file) + print("KgtkReader: Reading an edge file. node1=%d label=%d node2=%d" % (node1_column_idx, label_column_idx, node2_column_idx), file=error_file, flush=True) # Apply the proper defaults to the blank node1, node2, and id actions: if blank_node1_line_action is None: @@ -267,7 +269,7 @@ def open(cls, error_file=error_file) if verbose: - print("KgtkReader: Reading an node file. id=%d" % (id_column_idx), file=error_file) + print("KgtkReader: Reading an node file. id=%d" % (id_column_idx), file=error_file, flush=True) # Apply the proper defaults to the blank node1, node2, and id actions: if blank_node1_line_action is None: @@ -356,27 +358,28 @@ def _open_compressed_file(cls, file_name: str, file_or_path: typing.Union[Path, typing.TextIO], who: str, + error_file: typing.TextIO, verbose: bool)->typing.TextIO: # TODO: find a better way to coerce typing.IO[Any] to typing.TextIO if compression_type in [".gz", "gz"]: if verbose: - print("%s: reading gzip %s" % (who, file_name)) + print("%s: reading gzip %s" % (who, file_name), file=error_file, flush=True) return gzip.open(file_or_path, mode="rt") # type: ignore elif compression_type in [".bz2", "bz2"]: if verbose: - print("%s: reading bz2 %s" % (who, file_name)) + print("%s: reading bz2 %s" % (who, file_name), file=error_file, flush=True) return bz2.open(file_or_path, mode="rt") # type: ignore elif compression_type in [".xz", "xz"]: if verbose: - print("%s: reading lzma %s" % (who, file_name)) + print("%s: reading lzma %s" % (who, file_name), file=error_file, flush=True) return lzma.open(file_or_path, mode="rt") # type: ignore elif compression_type in [".lz4", "lz4"]: if verbose: - print("%s: reading lz4 %s" % (who, file_name)) + print("%s: reading lz4 %s" % (who, file_name), file=error_file, flush=True) return lz4.frame.open(file_or_path, mode="rt") # type: ignore else: # TODO: throw a better exception. 
@@ -387,24 +390,25 @@ def _openfile(cls, file_path: typing.Optional[Path], compression_type: typing.Optional[str], gzip_in_parallel: bool, gzip_queue_size: int, + error_file: typing.TextIO, verbose: bool)->ClosableIter[str]: who: str = cls.__name__ if file_path is None or str(file_path) == "-": if compression_type is not None and len(compression_type) > 0: - return ClosableIterTextIOWrapper(cls._open_compressed_file(compression_type, "-", sys.stdin, who, verbose)) + return ClosableIterTextIOWrapper(cls._open_compressed_file(compression_type, "-", sys.stdin, who, error_file, verbose)) else: if verbose: - print("%s: reading stdin" % who) + print("%s: reading stdin" % who, file=error_file, flush=True) return ClosableIterTextIOWrapper(sys.stdin) if verbose: - print("%s: File_path.suffix: %s" % (who, file_path.suffix)) + print("%s: File_path.suffix: %s" % (who, file_path.suffix), file=error_file, flush=True) gzip_file: typing.TextIO if compression_type is not None and len(compression_type) > 0: - gzip_file = cls._open_compressed_file(compression_type, str(file_path), file_path, who, verbose) + gzip_file = cls._open_compressed_file(compression_type, str(file_path), file_path, who, error_file, verbose) elif file_path.suffix in [".bz2", ".gz", ".lz4", ".xz"]: - gzip_file = cls._open_compressed_file(file_path.suffix, str(file_path), file_path, who, verbose) + gzip_file = cls._open_compressed_file(file_path.suffix, str(file_path), file_path, who, error_file, verbose) else: if verbose: print("%s: reading file %s" % (who, str(file_path))) @@ -424,6 +428,7 @@ def _build_column_names(cls, force_column_names: typing.Optional[typing.List[str]], skip_first_record: bool, column_separator: str, + error_file: typing.TextIO, verbose: bool = False, )->typing.Tuple[str, typing.List[str]]: """ @@ -436,8 +441,7 @@ def _build_column_names(cls, # TODO: if the read fails, throw a more useful exception with the line number. header: str = next(source).rstrip("\r\n") if verbose: - print("header: %s" % header) - + print("header: %s" % header, file=error_file, flush=True) # Split the first line into column names. return header, header.split(column_separator) @@ -469,10 +473,10 @@ def exclude_line(self, action: ValidationAction, msg: str, line: str)->bool: # Immediately raise an exception. raise ValueError("In input data line %d, %s: %s" % (self.data_lines_read, msg, line)) elif action == ValidationAction.EXIT: - print("In input data line %d, %s: %s" % (self.data_lines_read, msg, line), file=self.error_file) + print("In input data line %d, %s: %s" % (self.data_lines_read, msg, line), file=self.error_file, flush=True) sys.exit(1) - print("In input data line %d, %s: %s" % (self.data_lines_read, msg, line), file=self.error_file) + print("In input data line %d, %s: %s" % (self.data_lines_read, msg, line), file=self.error_file, flush=True) self.data_errors_reported += 1 if self.error_limit > 0 and self.data_errors_reported >= self.error_limit: raise ValueError("Too many data errors.") @@ -507,7 +511,7 @@ def __next__(self)-> typing.List[str]: line = line.rstrip("\r\n") if self.very_verbose: - print("'%s'" % line) + print("'%s'" % line, file=self.error_file, flush=True) # Ignore empty lines. 
         if self.empty_line_action != ValidationAction.PASS and len(line) == 0:
@@ -755,7 +759,7 @@ def main():
     row: typing.List[str]
     for row in kr:
         line_count += 1
-    print("Read %d lines" % line_count)
+    print("Read %d lines" % line_count, file=error_file, flush=True)

 if __name__ == "__main__":
     main()

diff --git a/kgtk/join/nodereader.py b/kgtk/join/nodereader.py
index a6fdcae07..4b3de9587 100644
--- a/kgtk/join/nodereader.py
+++ b/kgtk/join/nodereader.py
@@ -47,6 +47,7 @@ def open_node_file(cls,
                                    compression_type=compression_type,
                                    gzip_in_parallel=gzip_in_parallel,
                                    gzip_queue_size=gzip_queue_size,
+                                   error_file=error_file,
                                    verbose=verbose)

         # Read the node file header and split it into column names.
@@ -56,6 +57,7 @@ def open_node_file(cls,
                                    force_column_names=force_column_names,
                                    skip_first_record=skip_first_record,
                                    column_separator=column_separator,
+                                   error_file=error_file,
                                    verbose=verbose)

         # Check for unsafe column names.
         cls.check_column_names(column_names,

From 1673789a7c38823cee9af5d5dd09c5da9dc77081 Mon Sep 17 00:00:00 2001
From: Filip Ilievski <6437407+filievski@users.noreply.github.com>
Date: Thu, 30 Apr 2020 14:53:40 -0700
Subject: [PATCH 033/278] Update README.md

---
 README.md | 46 +++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 41 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index f386ae6d5..40ba1e138 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,27 @@
-# kgtk
+# kgtk: Knowledge Graph Toolkit
+
+KGTK is a Python library for easy manipulation of knowledge graphs. It provides a flexible framework that allows chaining of common graph operations, such as: extraction of subgraphs, filtering, computation of graph metrics, validation, cleaning, generating embeddings, and so on. Its principal format is TSV, though we do support a number of other inputs.
+
+### Documentation
+
+To-do.
+
+### Features
+
+* Computation of class instances
+* Computation of reachable nodes
+* Filtering based on property values
+* Removal of columns
+* Sorting
+* Computation of various embeddings
+* Cleaning and validation
+* Computation of graph metrics
+* Joining and concatenation of graphs
+* Manipulation of Wikidata data
+
+### Releases
+
+* [Source code](https://github.com/usc-isi-i2/kgtk/releases)

 ### Installation

@@ -10,7 +33,7 @@ conda activate kgtk-env
 ```
 **Note:** Installing Graph-tool is problematic on python 3.8 and out of a virtual environment. Thus: **the advised installation path is by using a virtual environment.**

-2. Install (the dev branch at this point): `pip install git+https://github.com/usc-isi-i2/kgtk.git@dev`
+2. Install (the dev branch at this point): `pip install kgtk`

 You can test if `kgtk` is installed properly now with: `kgtk -h`.

@@ -25,7 +48,20 @@ You can test if `kgtk` is installed properly now with: `kgtk -h`.

 More installation options for `mlr` can be found [here](https://johnkerl.org/miller/doc/build.html).

-### The Miller Package
+### Running KGTK commands
+
+To list all the available KGTK commands, run:
+`kgtk -h`
+
+To see the arguments of a particular command, run:
+`kgtk -h`
+
+An example command that computes instances of the subclasses of two classes:
+`kgtk instances --transitive --class Q13442814,Q12345678`
+
+### Additional information
+
+#### The Miller Package

 1. Our code uses the "miller" package to manipulate formatted data.

@@ -40,7 +76,7 @@ https://www.mankier.com/1/mlr

 4. You may need to install the miller command (mlr) on your system.
 * OpenSUSE Tumbleweed Linux: install package `miller` from Main Repository (OSS)

-### List of supported tools
+#### List of supported tools
 * `instances`
 * `reachable_nodes`
 * `filter`
@@ -57,6 +93,6 @@ To get information on how to use each of them, run:

 More detailed description of the arguments will be added here promptly.

-### Developer Instruction
+#### Developer Instruction

 Please refer to [this](README_dev.md)

From 995792fdf15dea8b4190da341d8212c3390a0743 Mon Sep 17 00:00:00 2001
From: Filip Ilievski <6437407+filievski@users.noreply.github.com>
Date: Thu, 30 Apr 2020 14:53:56 -0700
Subject: [PATCH 034/278] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 40ba1e138..fd2e15a5d 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# kgtk: Knowledge Graph Toolkit
+# KGTK: Knowledge Graph Toolkit

 KGTK is a Python library for easy manipulation of knowledge graphs. It provides a flexible framework that allows chaining of common graph operations, such as: extraction of subgraphs, filtering, computation of graph metrics, validation, cleaning, generating embeddings, and so on. Its principal format is TSV, though we do support a number of other inputs.

From d1cb0bb62e6e7639863cac330a859dd5cda5cf2c Mon Sep 17 00:00:00 2001
From: Filip Ilievski <6437407+filievski@users.noreply.github.com>
Date: Thu, 30 Apr 2020 14:54:35 -0700
Subject: [PATCH 035/278] Update README.md

---
 README.md | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index fd2e15a5d..1d9247fba 100644
--- a/README.md
+++ b/README.md
@@ -2,11 +2,11 @@

 KGTK is a Python library for easy manipulation of knowledge graphs. It provides a flexible framework that allows chaining of common graph operations, such as: extraction of subgraphs, filtering, computation of graph metrics, validation, cleaning, generating embeddings, and so on. Its principal format is TSV, though we do support a number of other inputs.

-### Documentation
+## Documentation

 To-do.

-### Features
+## Features

 * Computation of class instances
 * Computation of reachable nodes
@@ -19,11 +19,11 @@ To-do.
 * Joining and concatenation of graphs
 * Manipulation of Wikidata data

-### Releases
+## Releases

 * [Source code](https://github.com/usc-isi-i2/kgtk/releases)

-### Installation
+## Installation

 0. Our installations will be in a conda environment. If you don't have a conda installed, follow [link](https://docs.conda.io/projects/conda/en/latest/user-guide/install/) to install it.
 1. Set up your own conda environment:
@@ -48,7 +48,7 @@ You can test if `kgtk` is installed properly now with: `kgtk -h`.

 More installation options for `mlr` can be found [here](https://johnkerl.org/miller/doc/build.html).

-### Running KGTK commands
+## Running KGTK commands

 To list all the available KGTK commands, run:
 `kgtk -h`

@@ -59,9 +59,9 @@ To see the arguments of a particular command, run:
 `kgtk -h`

 An example command that computes instances of the subclasses of two classes:
 `kgtk instances --transitive --class Q13442814,Q12345678`

-### Additional information
+## Additional information

-#### The Miller Package
+### The Miller Package

 1. Our code uses the "miller" package to manipulate formatted data.

@@ -76,7 +76,7 @@ https://www.mankier.com/1/mlr

 4. You may need to install the miller command (mlr) on your system.
 * OpenSUSE Tumbleweed Linux: install package `miller` from Main Repository (OSS)

-#### List of supported tools
+### List of supported tools
 * `instances`
 * `reachable_nodes`
 * `filter`
@@ -93,6 +93,6 @@ To get information on how to use each of them, run:

 More detailed description of the arguments will be added here promptly.

-#### Developer Instruction
+### Developer Instructions

 Please refer to [this](README_dev.md)

From 17b311938728de5ca14da5b8443da08f218bfc47 Mon Sep 17 00:00:00 2001
From: Filip Ilievski <6437407+filievski@users.noreply.github.com>
Date: Thu, 30 Apr 2020 14:55:37 -0700
Subject: [PATCH 036/278] Update README.md

---
 README.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/README.md b/README.md
index 1d9247fba..4110d54ac 100644
--- a/README.md
+++ b/README.md
@@ -51,12 +51,15 @@ More installation options for `mlr` can be found [here](https://johnkerl.org/mil
 ## Running KGTK commands

 To list all the available KGTK commands, run:
+
 `kgtk -h`

 To see the arguments of a particular command, run:
+
 `kgtk -h`

 An example command that computes instances of the subclasses of two classes:
+
 `kgtk instances --transitive --class Q13442814,Q12345678`

From 5c3d6ab9a748724c5117c0621d12cc27635441da Mon Sep 17 00:00:00 2001
From: ckxz105
Date: Thu, 30 Apr 2020 15:50:01 -0700
Subject: [PATCH 037/278] temporary push for text-embedding, bug fix
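read_input groups edges by Q node, but when the first edge of a new node
arrived, the sentence assembled for the previous node was stored under the
new node's id. The fix keys the stored attributes by the node that was just
finished. A minimal sketch of the corrected flow (condensed and simplified
from the hunk below; the surrounding parsing code is omitted):

    if current_process_node_id != node_id:
        if current_process_node_id is not None:
            # Finish the node we were accumulating, keyed by its own id,
            # not by the id of the node that starts on this line.
            sentence = self.attribute_to_sentence(each_node_attributes, current_process_node_id)
            each_node_attributes["sentence"] = sentence
            self.candidates[current_process_node_id] = each_node_attributes
            # Reset the accumulator, then switch to the new node.
            each_node_attributes = {"has_properties": [], "isa_properties": [],
                                    "label_properties": [], "description_properties": []}
        current_process_node_id = node_id

The logging setup in main() is also collapsed: any logging level other than
"none" now enables DEBUG logging to a time-stamped file under the user's
home directory.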
---
 kgtk/cli/text_embedding.py | 32 +++++++++++++-------------------
 1 file changed, 13 insertions(+), 19 deletions(-)

diff --git a/kgtk/cli/text_embedding.py b/kgtk/cli/text_embedding.py
index 3a439c8ce..420f4f9e0 100644
--- a/kgtk/cli/text_embedding.py
+++ b/kgtk/cli/text_embedding.py
@@ -384,12 +384,12 @@ def read_input(self, file_path: str, skip_nodes_set: set=None,
                 else:
                     # if we get to next id
                     # concate all properties into one sentence to represent the Q node
-                    concated_sentence = self.attribute_to_sentence(each_node_attributes, node_id)
+                    concated_sentence = self.attribute_to_sentence(each_node_attributes, current_process_node_id)
                     each_node_attributes["sentence"] = concated_sentence
-                    self.candidates[node_id] = each_node_attributes
-                    self._logger.debug("{} --> {}".format(node_id, concated_sentence))
+                    self.candidates[current_process_node_id] = each_node_attributes
                     # after write down finish, we can clear and start parsing next one
                     each_node_attributes = {"has_properties":[], "isa_properties":[], "label_properties":[], "description_properties": []}
+                    # update to new id
                     current_process_node_id = node_id

                 if node_property in target_properties:
@@ -725,24 +725,16 @@ def main(**kwargs):
     import argparse
     import pickle

-    logging_level = kwargs.get("logging_level", "warning")
-    if logging_level == "info":
-        logging_level_class = logging.INFO
-    elif logging_level == "debug":
+    do_logging = kwargs.get("logging_level", None)
+    if do_logging and do_logging.lower() != "none":
         logging_level_class = logging.DEBUG
-    elif logging_level == "warning":
-        logging_level_class = logging.WARNING
-    elif logging_level == "error":
-        logging_level_class = logging.ERROR
-    else:
-        logging_level_class = logging.WARNING
-    if logging_level != "none":
         logger_path = os.path.join(os.environ.get("HOME"), "kgtk_text_embedding_log_{}.log".format(strftime("%Y-%m-%d-%H-%M")))
         logging.basicConfig(level=logging_level_class,
-                        format="%(asctime)s [%(levelname)s] %(name)s %(lineno)d -- %(message)s",
-                        datefmt='%m-%d %H:%M:%S',
-                        filename=logger_path,
-                        filemode='w')
+                            format="%(asctime)s [%(levelname)s] %(name)s %(lineno)d -- %(message)s",
+                            datefmt='%m-%d %H:%M:%S',
+                            filename=logger_path,
+                            filemode='w')
+
     _logger = logging.getLogger(__name__)
     _logger.warning("Running with logging level {}".format(_logger.getEffectiveLevel()))

@@ -832,10 +824,12 @@ def str2bool(v):
             return False
         else:
             raise argparse.ArgumentTypeError('Boolean value expected.')
-    # logging level
+    # logging level, no longer needed as there is a global choice for it
     parser.add_argument('-l', '--logging-level', action='store', dest='logging_level',
                         default="info", choices=("error", "warning", "info", "debug", "none"),
                         help="set up the logging level, default is INFO level")
+    # parser.add_argument('--debug', action='store_true', dest='logging_level',
+    #                     help='set up to make logging and store at home directory.')
     # model name
     all_models_names = ALL_EMBEDDING_MODELS_NAMES
     parser.add_argument('-m', '--model', action='store', nargs='+', dest='all_models_names',

From a74599f6c116e78326e7af32000e55727d345046 Mon Sep 17 00:00:00 2001
From: Craig Milo Rogers
Date: Thu, 30 Apr 2020 16:56:43 -0700
Subject: [PATCH 038/278] Optimize for the case of empty columns.

---
 kgtk/join/kgtkreader.py | 33 +++++++++++++++++++++------------
 1 file changed, 21 insertions(+), 12 deletions(-)

diff --git a/kgtk/join/kgtkreader.py b/kgtk/join/kgtkreader.py
index 54dfed634..d6a30a209 100644
--- a/kgtk/join/kgtkreader.py
+++ b/kgtk/join/kgtkreader.py
@@ -574,19 +574,28 @@ def __next__(self)-> typing.List[str]:
         return values

     def _ignore_invalid_values(self, values: typing.List[str], line: str)->bool:
+        """Given a row of values, validate each value. If we find one or more
+        validation problems, we might want to emit error messages and we might
+        want to ignore the entire row.
+
+        Returns True to indicate that the row should be ignored (skipped).
+
+        """
+        problems: typing.List[str] = [ ] # Build a list of problems.
+        idx: int
         value: str
-        idx: int = 0
-        problems: typing.List[str] = [ ]
-        for value in values:
-            kv: KgtkValue = KgtkValue(value)
-            if not kv.is_valid():
-                problems.append("%s: %s" % (self.column_names[idx], kv.describe()))
-            idx += 1
-        if len(problems) > 0 and self.exclude_line(self.invalid_value_action,
-                                                   "; ".join(problems),
-                                                   line):
-            return True
-        return False
+        for idx, value in enumerate(values):
+            if len(value) > 0: # Optimize the common case of empty columns.
+ kv: KgtkValue = KgtkValue(value) + if not kv.is_valid(): + problems.append("%s: %s" % (self.column_names[idx], kv.describe())) + + if len(problems) == 0: + return False + + return self.exclude_line(self.invalid_value_action, + "; ".join(problems), + line) # May be overridden def _ignore_if_blank_fields(self, values: typing.List[str], line: str)->bool: From 92e36aaab2a38f92ccc865984d5680effeb40594 Mon Sep 17 00:00:00 2001 From: ckxz105 Date: Thu, 30 Apr 2020 18:06:06 -0700 Subject: [PATCH 039/278] embedding vector: code clean / bug fix / use --debug for debuging setting --- kgtk/cli/text_embedding.py | 333 +++++++++++++++++-------------------- 1 file changed, 152 insertions(+), 181 deletions(-) diff --git a/kgtk/cli/text_embedding.py b/kgtk/cli/text_embedding.py index 420f4f9e0..77004a2d9 100644 --- a/kgtk/cli/text_embedding.py +++ b/kgtk/cli/text_embedding.py @@ -1,30 +1,29 @@ -import sys import typing from kgtk.exceptions import KGTKException ALL_EMBEDDING_MODELS_NAMES = [ -"bert-base-nli-cls-token", -"bert-base-nli-max-tokens", -"bert-base-nli-mean-tokens", -"bert-base-nli-stsb-mean-tokens", -"bert-base-wikipedia-sections-mean-tokens", -"bert-large-nli-cls-token", -"bert-large-nli-max-tokens", -"bert-large-nli-mean-tokens", -"bert-large-nli-stsb-mean-tokens", -"distilbert-base-nli-mean-tokens", -"distilbert-base-nli-stsb-mean-tokens", -"distiluse-base-multilingual-cased", -"roberta-base-nli-mean-tokens", -"roberta-base-nli-stsb-mean-tokens", -"roberta-large-nli-mean-tokens", -"roberta-large-nli-stsb-mean-tokens" + "bert-base-nli-cls-token", + "bert-base-nli-max-tokens", + "bert-base-nli-mean-tokens", + "bert-base-nli-stsb-mean-tokens", + "bert-base-wikipedia-sections-mean-tokens", + "bert-large-nli-cls-token", + "bert-large-nli-max-tokens", + "bert-large-nli-mean-tokens", + "bert-large-nli-stsb-mean-tokens", + "distilbert-base-nli-mean-tokens", + "distilbert-base-nli-stsb-mean-tokens", + "distiluse-base-multilingual-cased", + "roberta-base-nli-mean-tokens", + "roberta-base-nli-stsb-mean-tokens", + "roberta-large-nli-mean-tokens", + "roberta-large-nli-stsb-mean-tokens" ] class EmbeddingVector: - def __init__(self, model_name=None, query_server=None, cache_config:dict={}): - from sentence_transformers import SentenceTransformer, SentencesDataset, LoggingHandler, losses, models # type: ignore + def __init__(self, model_name=None, query_server=None, cache_config: dict = {}): + from sentence_transformers import SentenceTransformer, SentencesDataset, LoggingHandler, losses, models # type: ignore import logging import re self._logger = logging.getLogger(__name__) @@ -65,43 +64,21 @@ def __init__(self, model_name=None, query_server=None, cache_config:dict={}): self.redis_server = None self.qnodes_descriptions = dict() self.vectors_map = dict() + self.property_labels_dict = dict() self.vectors_2D = None self.gt_nodes = set() self.candidates = defaultdict(dict) - self.embedding_cache = dict() self.vector_dump_file = None self.q_node_to_label = dict() self.metadata = [] self.gt_indexes = set() self.input_format = "" - self.token_patern = re.compile(r"(?u)\b\w\w+\b") - - @staticmethod - def minDistance(word1, word2): - """Dynamic programming solution""" - m = len(word1) - n = len(word2) - table = [[0] * (n + 1) for _ in range(m + 1)] - for i in range(m + 1): - table[i][0] = i - for j in range(n + 1): - table[0][j] = j - for i in range(1, m + 1): - for j in range(1, n + 1): - if word1[i - 1] == word2[j - 1]: - table[i][j] = table[i - 1][j - 1] - else: - table[i][j] = 1 + min(table[i - 1][j], 
table[i][j - 1], table[i - 1][j - 1]) - return table[-1][-1] - + self.token_pattern = re.compile(r"(?u)\b\w\w+\b") def get_sentences_embedding(self, sentences: typing.List[str], qnodes: typing.List[str]): """ transform a list of sentences to embedding vectors """ - # if sentences in self.embedding_cache: - # return self.embedding_cache[sentences] - # else: from ast import literal_eval if self.redis_server is not None: sentence_embeddings = [] @@ -119,14 +96,13 @@ def get_sentences_embedding(self, sentences: typing.List[str], qnodes: typing.Li self.redis_server.set(query_cache_key, str(each_embedding[0].tolist())) else: sentence_embeddings = self.model.encode(sentences, show_progress_bar=False) - # self.embedding_cache[sentences] = sentence_embeddings return sentence_embeddings - def send_sparql_query(self, query_body:str): + def send_sparql_query(self, query_body: str): """ a simple wrap to send the query and return the returned results """ - from SPARQLWrapper import SPARQLWrapper, JSON, POST, URLENCODED # type: ignore + from SPARQLWrapper import SPARQLWrapper, JSON, POST, URLENCODED # type: ignore qm = SPARQLWrapper(self.wikidata_server) qm.setReturnFormat(JSON) qm.setMethod(POST) @@ -138,9 +114,9 @@ def send_sparql_query(self, query_body:str): results = qm.query().convert()['results']['bindings'] return results except: - raise KGTKException("Sending Sparl query to {} failed!".format(self.wikidata_server)) + raise KGTKException("Sending Sparql query to {} failed!".format(self.wikidata_server)) - def get_item_description(self, qnodes: typing.List[str]=None, target_properties:dict={}, gt_label:str=""): + def get_item_description(self, qnodes: typing.List[str] = None, target_properties: dict = {}): """ use sparql query to get the descriptions of given Q nodes """ @@ -166,7 +142,7 @@ def get_item_description(self, qnodes: typing.List[str]=None, target_properties: sentences_cache_dict = {} if self.redis_server is not None: for each_node in qnodes: - cache_res = self.redis_server.get(each_node+str(properties_list)) + cache_res = self.redis_server.get(each_node + str(properties_list)) if cache_res is not None: sentences_cache_dict[each_node] = cache_res.decode("utf-8") @@ -199,12 +175,6 @@ def get_item_description(self, qnodes: typing.List[str]=None, target_properties: description = "" if "itemLabel" in each: label = each['itemLabel']['value'] - # if each_node == self.gt[gt_label]: - # if self.minDistance(label, gt_label) > len(gt_label): - # a = "".join(self.token_patern.findall(label.lower())) - # b = "".join(self.token_patern.findall(gt_label.lower())) - # if a not in b and b not in a: - # self._logger.error("{} with {} --> {} edit distance too larger!!!".format(each_node, label, gt_label)) else: label = "" if need_find_label: @@ -223,7 +193,7 @@ def get_item_description(self, qnodes: typing.List[str]=None, target_properties: part2 += """?item wdt:{} ?{}_{}. \n""".format(each, name, i) query_body2 += """ where { - values ?item {""" + query_qnodes + "}" + values ?item {""" + query_qnodes + "}" query_body2 += part2 + """ SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". } @@ -246,7 +216,7 @@ def get_item_description(self, qnodes: typing.List[str]=None, target_properties: query_body3 = """ select DISTINCT ?item ?p_entity ?p_entityLabel where { - values ?item {"""+ query_qnodes + """} + values ?item {""" + query_qnodes + """} ?item ?p ?o. 
FILTER regex(str(?p), "^http://www.wikidata.org/prop/P", "i") BIND (IRI(REPLACE(STR(?p), "http://www.wikidata.org/prop", "http://www.wikidata.org/entity")) AS ?p_entity) . @@ -262,27 +232,26 @@ def get_item_description(self, qnodes: typing.List[str]=None, target_properties: if "has_properties" in self.candidates[node_name]: self.candidates[node_name]["has_properties"].add(p_node_label) else: - self.candidates[node_name]["has_properties"] = set([p_node_label]) + self.candidates[node_name]["has_properties"] = {p_node_label} for each_node_id in qnodes: each_sentence = self.attribute_to_sentence(self.candidates[each_node_id], each_node_id) self.candidates[each_node_id]["sentence"] = each_sentence if self.redis_server is not None: - self.redis_server.set(each_node+str(properties_list), each_sentence) - + self.redis_server.set(each_node_id + str(properties_list), each_sentence) + for each_node_id, sentence in sentences_cache_dict.items(): self.candidates[each_node_id]["sentence"] = sentence - - def read_input(self, file_path: str, skip_nodes_set: set=None, - input_format: str="kgtk_format",target_properties: dict={}, - property_labels_dict:dict={}, black_list_set:set=set() + def read_input(self, file_path: str, skip_nodes_set: set = None, + input_format: str = "kgtk_format", target_properties: dict = {}, + property_labels_dict: dict = {}, black_list_set: set = set() ): """ load the input candidates files """ from collections import defaultdict - import pandas as pd # type: ignore + import pandas as pd # type: ignore import numpy as np import math @@ -306,7 +275,7 @@ def read_input(self, file_path: str, skip_nodes_set: set=None, temp = str(each['candidates']).split("|") elif each['candidates'] is np.nan or math.isnan(each['candidates']): temp = [] - + to_remove_q = set() if each[gt_column_id] is np.nan: self._logger.warning("Ignore NaN gt value form {}".format(str(each))) @@ -345,22 +314,25 @@ def read_input(self, file_path: str, skip_nodes_set: set=None, # get header headers = f.readline().replace("\n", "").split("\t") if len(headers) < 3: - raise KGTKException("No enough columns found on given input file. Only {} columns given but at least 3 needed.".format(len(headers))) + raise KGTKException( + "No enough columns found on given input file. 
Only {} columns given but at least 3 needed.".format( + len(headers))) elif "node" in headers and "property" in headers and "value" in headers: - column_references = {"node": headers.index("node"), + column_references = {"node": headers.index("node"), "property": headers.index("property"), "value": headers.index("value")} elif len(headers) == 3: - column_references = {"node": 0, + column_references = {"node": 0, "property": 1, "value": 2} else: - missing_column = set(["node", "property", "value"]) - set(headers) + missing_column = {"node", "property", "value"} - set(headers) raise KGTKException("Missing column {}".format(missing_column)) self._logger.debug("column index information: ") self._logger.debug(str(column_references)) # read contents - each_node_attributes = {"has_properties":[], "isa_properties":[], "label_properties":[], "description_properties": []} + each_node_attributes = {"has_properties": [], "isa_properties": [], "label_properties": [], + "description_properties": []} current_process_node_id = None for each_line in f: each_line = each_line.replace("\n", "").split("\t") @@ -373,9 +345,9 @@ def read_input(self, file_path: str, skip_nodes_set: set=None, node_value = node_value[:node_value.index("@")] # remove extra double quote " and single quote ' - if node_value[0]== '"' and node_value[-1] == '"': + if node_value[0] == '"' and node_value[-1] == '"': node_value = node_value[1:-1] - if node_value[0]== "'" and node_value[-1] == "'": + if node_value[0] == "'" and node_value[-1] == "'": node_value = node_value[1:-1] if current_process_node_id != node_id: @@ -388,7 +360,8 @@ def read_input(self, file_path: str, skip_nodes_set: set=None, each_node_attributes["sentence"] = concated_sentence self.candidates[current_process_node_id] = each_node_attributes # after write down finish, we can cleaer and start parsing next one - each_node_attributes = {"has_properties":[], "isa_properties":[], "label_properties":[], "description_properties": []} + each_node_attributes = {"has_properties": [], "isa_properties": [], "label_properties": [], + "description_properties": []} # update to new id current_process_node_id = node_id @@ -396,12 +369,12 @@ def read_input(self, file_path: str, skip_nodes_set: set=None, each_node_attributes[target_properties[node_property]].append(node_value) if add_all_properties and each_line[column_references["value"]][0] == "P": each_node_attributes["has_properties"].append(node_value) - + else: raise KGTKException("Unkonwn input format {}".format(input_format)) self._logger.info("Totally {} Q nodes loaded.".format(len(self.candidates))) - self.vector_dump_file = "dump_vectors_{}_{}.pkl".format(file_path[:file_path.rfind(".")], self. 
model_name) + self.vector_dump_file = "dump_vectors_{}_{}.pkl".format(file_path[:file_path.rfind(".")], self.model_name) # self._logger.debug("The cache file name will be {}".format(self.vector_dump_file)) def get_real_label_name(self, node): @@ -410,8 +383,9 @@ def get_real_label_name(self, node): else: return node - def attribute_to_sentence(self, v, node_id = None): + def attribute_to_sentence(self, v, node_id=None): concated_sentence = "" + have_isa_properties = False # sort the properties to ensure the sentence always same v = {key: sorted(list(value)) for key, value in v.items() if len(value) > 0} if "label_properties" in v and len(v["label_properties"]) > 0: @@ -421,39 +395,35 @@ def attribute_to_sentence(self, v, node_id = None): concated_sentence += ", " concated_sentence += self.get_real_label_name(v["description_properties"][0]) if "isa_properties" in v and len(v["isa_properties"]) > 0: + have_isa_properties = True temp = [self.get_real_label_name(each) for each in v["isa_properties"]] if concated_sentence != "" and temp[0] != "": - concated_sentence += " is a " + concated_sentence += " is a " elif temp[0] != "": concated_sentence += "It is a " concated_sentence += ", ".join(temp) if "has_properties" in v and len(v["has_properties"]) > 0: temp = [self.get_real_label_name(each) for each in v["has_properties"]] if concated_sentence != "" and temp[0] != "": - concated_sentence += ", and has " + if have_isa_properties: + concated_sentence += ", and has " + else: + concated_sentence += " has " elif temp[0] != "": concated_sentence += "It has " concated_sentence += " and ".join(temp) self._logger.debug("Transform node {} --> {}".format(node_id, concated_sentence)) return concated_sentence - def get_vetors(self, use_cache=True, vector_dump_file=None): + def get_vetors(self): """ main function to get the vector representations of the descriptions """ import os import time - from tqdm import tqdm # type: ignore - if vector_dump_file is None: - vector_dump_file = self.vector_dump_file - if use_cache and os.path.exists(vector_dump_file): - self._logger.info("Using cached vector file!") - self.load_vectors(vector_dump_file) - return - + from tqdm import tqdm # type: ignore + start_all = time.time() - jobs_count = 0 - counter = 0 self._logger.info("Now generating embedding vector.") for q_node, each_item in tqdm(self.candidates.items()): # do process for each row(one target) @@ -465,6 +435,7 @@ def get_vetors(self, use_cache=True, vector_dump_file=None): self._logger.info("Totally used {} seconds.".format(str(time.time() - start_all))) def dump_vectors(self, file_name, type_=None): + import pickle if file_name.endswith(".pkl"): file_name = file_name.replace(".pkl", "") if type_ == "2D": @@ -491,15 +462,7 @@ def dump_vectors(self, file_name, type_=None): _ = f.write(str(i) + "\t") _ = f.write("\n") - def load_vectors(self, file_name, type_=None): - if type_ == "2D": - with open(file_name, "rb") as f: - self.vectors_2D = pickle.load(f) - else: - with open(file_name, "rb") as f: - self.vectors_map = pickle.load(f) - - def print_vector(self, vectors, output_properties:str="text_embedding", output_format="kgtk_format"): + def print_vector(self, vectors, output_properties: str = "text_embedding", output_format="kgtk_format"): if output_format == "kgtk_format": print("node\tproperty\tvalue\n", end="") if self.input_format == "kgtk_format": @@ -530,10 +493,8 @@ def print_vector(self, vectors, output_properties:str="text_embedding", output_f else: print(str(each_dimension) + "\n", end="") - - def 
plot_result(self, use_cache=True, vector_dump_file=None, - output_properties={}, input_format="kgtk_format", - output_uri:str="", output_format="kgtk_format", + def plot_result(self, output_properties={}, input_format="kgtk_format", + output_uri: str = "", output_format="kgtk_format", run_TSNE=True ): """ @@ -542,7 +503,7 @@ def plot_result(self, use_cache=True, vector_dump_file=None, """ import os import time - from sklearn.manifold import TSNE # type: ignore + from sklearn.manifold import TSNE # type: ignore self.vectors_map = {k: v for k, v in sorted(self.vectors_map.items(), key=lambda item: item[0], reverse=True)} vectors = list(self.vectors_map.values()) @@ -551,7 +512,6 @@ def plot_result(self, use_cache=True, vector_dump_file=None, self._logger.warning("Start running TSNE to reduce dimension. It will take a long time.") start = time.time() self.vectors_2D = TSNE(n_components=2, random_state=0).fit_transform(vectors) - # self.dump_vectors(vector_dump_file, "2D") self._logger.info("Totally used {} seconds.".format(time.time() - start)) if input_format == "test_format": @@ -616,7 +576,7 @@ def evaluate_result(self): centroid += np.array(self.vectors_map[each]) gt_nodes_vectors.append(self.vectors_map[each]) centroid = centroid / len(points) - + distance_sum = 0 for each in gt_nodes_vectors: distance_sum += self.calculate_distance(each, centroid) @@ -627,8 +587,8 @@ def calculate_distance(a, b): if len(a) != len(b): raise KGTKException("Vector dimension are different!") dist = 0 - for v1, v2 in zip(a,b): - dist += (v1 - v2) **2 + for v1, v2 in zip(a, b): + dist += (v1 - v2) ** 2 dist = dist ** 0.5 return dist @@ -643,12 +603,14 @@ def load_property_labels_file(input_files: typing.List[str]): if headers is None: headers = each_line if len(headers) < 2: - raise KGTKException("No enough columns found on given input file. Only {} columns given but at least 2 needed.".format(len(headers))) + raise KGTKException( + "No enough columns found on given input file. 
Only {} columns given but at least 2 needed.".format( + len(headers))) elif "predicate" in headers and "label" in headers: - column_references = {"predicate": headers.index("predicate"), + column_references = {"predicate": headers.index("predicate"), "label": headers.index("label")} elif "label" in headers: - column_references = {"predicate": 0, + column_references = {"predicate": 0, "label": headers.index("label"), } else: @@ -669,19 +631,22 @@ def load_black_list_files(file_path): import tarfile import zipfile import gzip + import logging import re - token_patern = re.compile(r"(?u)\b\w\w+\b") + import numpy as np + token_pattern = re.compile(r"(?u)\b\w\w+\b") qnodes_set = set() + _logger = logging.getLogger(__name__) for each_file in file_path: try: # tar.gz file if each_file.endswith("tar.gz"): tar = tarfile.open("filename.tar.gz", "r:gz") for member in tar.getmembers(): - f = tar.extractfile(member) - if f: - content = f.read() - Data = np.loadtxt(content) + f = tar.extractfile(member) + if f: + content = f.read() + input_data = np.loadtxt(content) # gz file elif each_file.endswith(".gz"): with gzip.open('big_file.txt.gz', 'rb') as f: @@ -695,10 +660,9 @@ def load_black_list_files(file_path): with open(each_file, "r") as f: input_data = f.readlines() - for each in input_data: each = each.replace("\n", "") - for each_part in token_patern.findall(each): + for each_part in token_pattern.findall(each): if each_part[0] == "Q" and each_part[1:].isnumeric(): qnodes_set.add(each_part) except Exception as e: @@ -725,15 +689,16 @@ def main(**kwargs): import argparse import pickle - do_logging = kwargs.get("logging_level", None) - if do_logging and do_logging.lower() != "none": + do_logging = kwargs.get("_debug", False) + if do_logging: logging_level_class = logging.DEBUG - logger_path = os.path.join(os.environ.get("HOME"), "kgtk_text_embedding_log_{}.log".format(strftime("%Y-%m-%d-%H-%M"))) + logger_path = os.path.join(os.environ.get("HOME"), + "kgtk_text_embedding_log_{}.log".format(strftime("%Y-%m-%d-%H-%M"))) logging.basicConfig(level=logging_level_class, - format="%(asctime)s [%(levelname)s] %(name)s %(lineno)d -- %(message)s", - datefmt='%m-%d %H:%M:%S', - filename=logger_path, - filemode='w') + format="%(asctime)s [%(levelname)s] %(name)s %(lineno)d -- %(message)s", + datefmt='%m-%d %H:%M:%S', + filename=logger_path, + filemode='w') _logger = logging.getLogger(__name__) _logger.warning("Running with logging level {}".format(_logger.getEffectiveLevel())) @@ -748,21 +713,21 @@ def main(**kwargs): property_labels_files = kwargs.get("property_labels_file_uri", "") query_server = kwargs.get("query_server") properties = dict() - all_property_relate_inputs = [kwargs.get("label_properties", ["label"]), + all_property_relate_inputs = [kwargs.get("label_properties", ["label"]), kwargs.get("description_properties", ["description"]), kwargs.get("isa_properties", ["P31"]), kwargs.get("has_properties", ["all"]), - ] - all_required_properties = ["label_properties", "description_properties", + ] + all_required_properties = ["label_properties", "description_properties", "isa_properties", "has_properties"] - cache_config = {"use_cache": kwargs.get("use_cache", False), + cache_config = {"use_cache": kwargs.get("use_cache", False), "host": kwargs.get("cache_host", "dsbox01.isi.edu"), "port": kwargs.get("cache_port", 6379) } for each_property, each_input in zip(all_required_properties, all_property_relate_inputs): for each in each_input: properties[each] = each_property - + output_properties = { 
"metatada_properties": kwargs.get("metatada_properties", []), "output_properties": kwargs.get("output_properties", "text_embedding") @@ -778,7 +743,7 @@ def main(**kwargs): raise KGTKException("No input file path given!") if output_uri == "": - output_uri = os.getenv("HOME") # os.getcwd() + output_uri = os.getenv("HOME") # os.getcwd() if black_list_files != "": black_list_set = load_black_list_files(black_list_files) else: @@ -788,19 +753,19 @@ def main(**kwargs): _logger.info("Totally {} property labels loaded.".format(len(property_labels_dict))) else: property_labels_dict = {} - + run_TSNE = kwargs.get("run_TSNE", True) for each_model_name in all_models_names: for each_input_file in input_uris: _logger.info("Running {} model on {}".format(each_model_name, each_input_file)) process = EmbeddingVector(each_model_name, query_server=query_server, cache_config=cache_config) - process.read_input(file_path=each_input_file, skip_nodes_set=black_list_set, + process.read_input(file_path=each_input_file, skip_nodes_set=black_list_set, input_format=input_format, target_properties=properties, property_labels_dict=property_labels_dict) - process.get_vetors(use_cache=True) - process.plot_result(use_cache=True, output_properties=output_properties, - input_format=input_format, output_uri=output_uri, + process.get_vetors() + process.plot_result(output_properties=output_properties, + input_format=input_format, output_uri=output_uri, run_TSNE=run_TSNE, output_format=output_format) process.evaluate_result() _logger.info("*" * 20 + "finished" + "*" * 20) @@ -808,88 +773,95 @@ def main(**kwargs): _logger.debug(e, exc_info=True) raise KGTKException(str(e)) + def parser(): return { 'help': """Produce embedding vectors on given file's nodes.""" } + def add_arguments(parser): import argparse def str2bool(v): if isinstance(v, bool): - return v + return v if v.lower() in ('yes', 'true', 't', 'y', '1'): return True elif v.lower() in ('no', 'false', 'f', 'n', '0'): return False else: raise argparse.ArgumentTypeError('Boolean value expected.') - # logging level, no longer need as there is a global choice for it - parser.add_argument('-l', '--logging-level', action='store', dest='logging_level', - default="info", choices=("error", "warning", "info", "debug", "none"), - help="set up the logging level, default is INFO level") - # parser.add_argument('--debug', action='store_true', dest='logging_level', - # help='set up to make logging and store at home directory.') + + parser.accept_shared_argument('_debug') + # logging level, no longer need as there is a global --debug choice for it + # parser.add_argument('-l', '--logging-level', action='store', dest='logging_level', + # default="info", choices=("error", "warning", "info", "debug", "none"), + # help="set up the logging level, default is INFO level") + # parser.add_argument('--debug', dest='_debug', action='store_true', default=False, help='enable debug mode') + # model name all_models_names = ALL_EMBEDDING_MODELS_NAMES parser.add_argument('-m', '--model', action='store', nargs='+', dest='all_models_names', - default="bert-base-wikipedia-sections-mean-tokens", choices=all_models_names, - help="the model to used for embedding") + default="bert-base-wikipedia-sections-mean-tokens", choices=all_models_names, + help="the model to used for embedding") # input file parser.add_argument('-i', '--input', action='store', nargs='+', dest='input_uris', - help="input path",) - parser.add_argument('-f', '--input-format', action='store', dest='input_format', - choices=("test_format", 
"kgtk_format"), default = "kgtk_format", - help="the input file format, could either be `test_format` or `kgtk_format`, default is `kgtk_format`",) - parser.add_argument('-p', '--property-labels-file', action='store', nargs='+', - dest='property_labels_file_uri', help="the path to the property labels file.",) + help="input path", ) + parser.add_argument('-f', '--input-format', action='store', dest='input_format', + choices=("test_format", "kgtk_format"), default="kgtk_format", + help="the input file format, could either be `test_format` or `kgtk_format`, default is `kgtk_format`", ) + parser.add_argument('-p', '--property-labels-file', action='store', nargs='+', + dest='property_labels_file_uri', help="the path to the property labels file.", ) # properties (only valid for kgtk format input/output data) - parser.add_argument('--label-properties', action='store', nargs='+', - dest='label_properties',default= ["label"], - help="""The names of the eges for label properties, Default is ["label"]. \n This argument is only valid for input in kgtk format.""") - parser.add_argument('--description-properties', action='store', nargs='+', - dest='description_properties', default= ["description"], - help="""The names of the eges for description properties, Default is ["description"].\n This argument is only valid for input in kgtk format.""") - parser.add_argument('--isa-properties', action='store', nargs='+', - dest='isa_properties', default= ["P31"], - help="""The names of the eges for `isa` properties, Default is ["P31"] (the `instance of` node in wikidata).\n This argument is only valid for input in kgtk format.""") - parser.add_argument('--has-properties', action='store', nargs='+', - dest='has_properties', default= ["all"], - help="""The names of the eges for `has` properties, Default is ["all"] (will automatically append all properties found for each node).\n This argument is only valid for input in kgtk format.""") - parser.add_argument('--output-property', action='store', - dest='output_properties', default= "text_embedding", - help="""The output property name used to record the embedding. Default is `output_properties`. \nThis argument is only valid for output in kgtk format.""") + parser.add_argument('--label-properties', action='store', nargs='+', + dest='label_properties', default=["label"], + help="""The names of the eges for label properties, Default is ["label"]. \n + This argument is only valid for input in kgtk format.""") + parser.add_argument('--description-properties', action='store', nargs='+', + dest='description_properties', default=["description"], + help="""The names of the eges for description properties, Default is ["description"].\n + This argument is only valid for input in kgtk format.""") + parser.add_argument('--isa-properties', action='store', nargs='+', + dest='isa_properties', default=["P31"], + help="""The names of the eges for `isa` properties, Default is ["P31"] (the `instance of` node in wikidata).\n + This argument is only valid for input in kgtk format.""") + parser.add_argument('--has-properties', action='store', nargs='+', + dest='has_properties', default=["all"], + help="""The names of the eges for `has` properties, Default is ["all"] (will automatically append all properties found for each node).\n This argument is only valid for input in kgtk format.""") + parser.add_argument('--output-property', action='store', + dest='output_properties', default="text_embedding", + help="""The output property name used to record the embedding. 
Default is `text_embedding`. \nThis argument is only valid for output in kgtk format.""") # output parser.add_argument('-o', '--embedding-projector-metadata-path', action='store', dest='output_uri', default="", - help="output path for the metadata file, default will be current user's home directory") - parser.add_argument('--output-format', action='store', dest='output_format', - default="kgtk", choices=("tsv_format", "kgtk_format"), - help="output format, can either be `tsv_format` or `kgtk_format`. \nIf choose `tsv_format`, the output will be a tsv file, with each row contains only the vector representation of a node. Each dimension is separated by a tab") - parser.add_argument('--embedding-projector-metatada', action='store', nargs='+', - dest='metatada_properties', default= [], - help="""list of properties used to construct a metadata file for use in the Google Embedding Projector: http://projector.tensorflow.org. \n Default: the label and description of each node.""") + help="output path for the metadata file, default will be current user's home directory") + parser.add_argument('--output-format', action='store', dest='output_format', + default="kgtk", choices=("tsv_format", "kgtk_format"), + help="output format, can either be `tsv_format` or `kgtk_format`. \nIf `tsv_format` is chosen, the output will be a tsv file where each row contains only the vector representation of a node. Each dimension is separated by a tab") + parser.add_argument('--embedding-projector-metatada', action='store', nargs='+', + dest='metatada_properties', default=[], + help="""list of properties used to construct a metadata file for use in the Google Embedding Projector: http://projector.tensorflow.org. \n Default: the label and description of each node.""") # black list file parser.add_argument('-b', '--black-list', nargs='+', action='store', dest='black_list_files', - default= "", - help="the black list file, contains the Q nodes which should not consider as candidates.") + default="", + help="the black list file, containing the Q nodes which should not be considered as candidates.") # run tsne or not - parser.add_argument("--run-TSNE", type=str2bool, nargs='?', action='store', + parser.add_argument("--run-TSNE", type=str2bool, nargs='?', action='store', default=True, dest="run_TSNE", help="whether to run TSNE or not after the embedding, default is true.") # cache config - parser.add_argument("--use-cache", type=str2bool, nargs='?', action='store', + parser.add_argument("--use-cache", type=str2bool, nargs='?', action='store', default=False, dest="use_cache", help="whether to use cache to get some embedding vectors quicker, default is False") - parser.add_argument("--cache-host", nargs='?', action='store', + parser.add_argument("--cache-host", nargs='?', action='store', default="dsbox01.isi.edu", dest="cache_host", help="cache host address, default is `dsbox01.isi.edu`" ) - parser.add_argument("--cache-port", nargs='?', action='store', + parser.add_argument("--cache-port", nargs='?', action='store', default="6379", dest="cache_port", help="cache server port, default is `6379`" ) # query server - parser.add_argument("--query-server", nargs='?', action='store', + parser.add_argument("--query-server", nargs='?', action='store', default="", dest="query_server", help="sparql query endpoint used for test_format input files, default is https://query.wikidata.org/sparql" ) @@ -897,4 +869,3 @@ def str2bool(v): def run(**kwargs): main(**kwargs) - From 9f41543403ad6bfc6b25d186546aefc4534cdef0 Mon Sep 17 00:00:00 2001 From: 
ckxz105 Date: Thu, 30 Apr 2020 18:08:53 -0700 Subject: [PATCH 040/278] update requirement.txt --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 439daf31e..7d600bae5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,3 +13,4 @@ etk==2.2.1 simplejson pyrallel.lib attrs +redis \ No newline at end of file From 2634adc433dfeaeb811be28866554d3eba296a60 Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Thu, 30 Apr 2020 20:22:38 -0700 Subject: [PATCH 041/278] Print with system-specific end-of-line. --- kgtk/exceptions.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kgtk/exceptions.py b/kgtk/exceptions.py index d6fea9582..c70e03797 100644 --- a/kgtk/exceptions.py +++ b/kgtk/exceptions.py @@ -45,11 +45,11 @@ def handle_exception(self, type_, exc_val, exc_tb): traceback.print_exception(type_, exc_val, exc_tb) # the output goes to sys.stderr if isinstance(exc_val, KGTKException): - sys.stderr.write(exc_val.message) + print("%s" % exc_val.message, file=sys.stderr) return exc_val.return_code warnings.warn('Please raise KGTKException instead of {}'.format(type_)) - sys.stderr.write(KGTKException.message) + print("%s" % KGTKException.message, file=sys.stderr) return KGTKException.return_code From a81358521a1eb80860bd2fbd13a6b40708748182 Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Thu, 30 Apr 2020 20:32:10 -0700 Subject: [PATCH 042/278] Improve date/time parsing. Improve error feedback. --- kgtk/cli/validate.py | 2 +- kgtk/join/kgtkreader.py | 15 ++++++++++---- kgtk/join/kgtkvalue.py | 43 ++++++++++++++++++++++++++++++++++++++++- 3 files changed, 54 insertions(+), 6 deletions(-) diff --git a/kgtk/cli/validate.py b/kgtk/cli/validate.py index 9d97293e9..3593ed784 100644 --- a/kgtk/cli/validate.py +++ b/kgtk/cli/validate.py @@ -213,5 +213,5 @@ def run(kgtk_files: typing.Optional[typing.List[typing.Optional[Path]]], except SystemExit as e: raise KGTKException("Exit requested") except Exception as e: - raise KGTKException(e) + raise KGTKException(str(e)) diff --git a/kgtk/join/kgtkreader.py b/kgtk/join/kgtkreader.py index d6a30a209..916a46abf 100644 --- a/kgtk/join/kgtkreader.py +++ b/kgtk/join/kgtkreader.py @@ -439,7 +439,10 @@ def _build_column_names(cls, # Read the column names from the first line, stripping end-of-line characters. # # TODO: if the read fails, throw a more useful exception with the line number. - header: str = next(source).rstrip("\r\n") + try: + header: str = next(source).rstrip("\r\n") + except StopIteration: + raise ValueError("No header line in file") if verbose: print("header: %s" % header, file=error_file, flush=True) @@ -449,7 +452,11 @@ def _build_column_names(cls, # Skip the first record to override the column names in the file. # Do not skip the first record if the file does not have a header record. if skip_first_record: - next(source) + try: + next(source) + except StopIteration: + raise ValueError("No header line to skip") + # Use the forced column names. 
return column_separator.join(force_column_names), force_column_names @@ -479,7 +486,7 @@ def exclude_line(self, action: ValidationAction, msg: str, line: str)->bool: print("In input data line %d, %s: %s" % (self.data_lines_read, msg, line), file=self.error_file, flush=True) self.data_errors_reported += 1 if self.error_limit > 0 and self.data_errors_reported >= self.error_limit: - raise ValueError("Too many data errors.") + raise ValueError("Too many data errors, exiting.") return result # This is both an iterable and an iterator object. @@ -588,7 +595,7 @@ def _ignore_invalid_values(self, values: typing.List[str], line: str)->bool: if len(value) > 0: # Optimize the common case of empty columns. kv: KgtkValue = KgtkValue(value) if not kv.is_valid(): - problems.append("%s: %s" % (self.column_names[idx], kv.describe())) + problems.append("col %d (%s) value '%s' is an %s" % (idx, self.column_names[idx], value, kv.describe())) if len(problems) == 0: return False diff --git a/kgtk/join/kgtkvalue.py b/kgtk/join/kgtkvalue.py index c9369efd6..f03390101 100644 --- a/kgtk/join/kgtkvalue.py +++ b/kgtk/join/kgtkvalue.py @@ -274,7 +274,7 @@ def is_date_and_times(self, idx: typing.Optional[int] = None)->bool: v: str = self.get_item(idx) return v.startswith("^") - date_and_times_re: typing.Pattern = re.compile(r"^\^(?P<year>[0-9]{4})(?P<hyphen>-)?(?P<month>1[0-2]|0[1-9])(?(hyphen)-)(?P<day>3[01]|0[1-9]|[12][0-9])T(?P<hour>2[0-3]|[01][0-9])(?(hyphen):)(?P<minutes>[0-5][0-9])(?(hyphen):)(?P<seconds>[0-5][0-9])(?P<zone>Z|\+[0-9][0-9](?::[0-9][0-9])?)?(?P<precision>/[0-9])?$") + date_and_times_re: typing.Pattern = re.compile(r"^\^(?P<year>[0-9]{4})(?:(?P<hyphen>-)?(?P<month>1[0-2]|0[1-9])(?:(?(hyphen)-)(?P<day>3[01]|0[1-9]|[12][0-9])))T(?P<hour>2[0-3]|[01][0-9])(?:(?(hyphen):)(?P<minutes>[0-5][0-9])(?:(?(hyphen):)(?P<seconds>[0-5][0-9])))(?P<zone>Z|[-+][0-9][0-9](?::[0-9][0-9])?)?(?P<precision>/[0-1]?[0-9])?$") def is_valid_date_and_times(self, idx: typing.Optional[int] = None)->bool: """ @@ -282,6 +282,47 @@ def is_valid_date_and_times(self, idx: typing.Optional[int] = None)->bool: Otherwise, return True if the value looks like valid date and times literal based on ISO-8601. + Valid date formats: + YYYY + YYYY-MM + YYYYMMDD + YYYY-MM-DD + + Valid date and time formats: + YYYYMMDDTHH + YYYY-MM-DDTHH + YYYYMMDDTHHMM + YYYY-MM-DDTHH:MM + YYYYMMDDTHHMMSS + YYYY-MM-DDTHH:MM:SS + + Optional Time Zone suffix for date and time: + Z + +HH + -HH + +HHMM + -HHMM + +HH:MM + -HH:MM + + NOTE: This code also accepts the following, which are disallowed by the standard: + YYYYT... + YYYYMM + YYYYMMT... + YYYY-MMT... + + Note: ISO-8601 disallows 0 for month or day, e.g.: + Invalid Correct + 1960-00-00T00:00:00Z/9 1960-01-01T00:00:00Z/9 + + TODO: Support fractional time elements + + TODO: Support week dates. + + TODO: Support ordinal dates + + TODO: Support Unicode minus sign as well as ASCII minus sign. + TODO: validate the calendar date, e.g. fail if 31-Apr-2020. 
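A calendar check for that last TODO could be layered on top of the regex match; the sketch below uses only the standard library, and the helper name is hypothetical rather than taken from this module:

import datetime

def is_real_calendar_date(year: int, month: int, day: int) -> bool:
    # datetime.date raises ValueError for impossible dates such as 31-Apr-2020,
    # which the pattern above cannot reject on its own.
    try:
        datetime.date(year, month, day)
        return True
    except ValueError:
        return False

assert is_real_calendar_date(2020, 4, 30)
assert not is_real_calendar_date(2020, 4, 31)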
""" if self.is_list() and idx is None: From cdeb056faa6a559c59700bc32529c70ca8ec701e Mon Sep 17 00:00:00 2001 From: filievski Date: Fri, 1 May 2020 09:54:42 -0700 Subject: [PATCH 043/278] gt_loader is way cleaner now --- kgtk/cli/gt_loader.py | 39 ++++++++++++++++++++++++++++----------- kgtk/gt/analysis_utils.py | 4 ++-- 2 files changed, 30 insertions(+), 13 deletions(-) diff --git a/kgtk/cli/gt_loader.py b/kgtk/cli/gt_loader.py index e1121cd5e..650a1c06e 100644 --- a/kgtk/cli/gt_loader.py +++ b/kgtk/cli/gt_loader.py @@ -16,10 +16,6 @@ def add_arguments(parser): parser (argparse.ArgumentParser) """ parser.add_argument(action="store", type=str, dest="filename", metavar='filename', help='filename here') - parser.add_argument("--header", action="store_true", dest="header_bool", help="Does the file contain a header in its first row") - parser.add_argument("--subj", action="store", type=int, dest="sub", help='Column in which the subject is given, default 0', default=0) - parser.add_argument("--obj", action="store", type=int, dest="obj", help='Column in which the subject is given, default 2', default=2) - parser.add_argument('--pred', action='store', type=str, dest="props", help="Edge properties to store in their order of appearance - comma-separated string.") parser.add_argument('--directed', action='store_true', dest="directed", help="Is the graph directed or not?") parser.add_argument('--degrees', action='store_true', dest='compute_degrees', help="Whether or not to compute degree distribution.") parser.add_argument('--pagerank', action='store_true', dest='compute_pagerank', help="Whether or not to compute PageRank centraility.") @@ -27,7 +23,19 @@ def add_arguments(parser): parser.add_argument('--log', action='store', type=str, dest='log_file', help='Log file for summarized statistics of the graph.', default="./log.txt") parser.add_argument('-o', '--out', action='store', type=str, dest='output', help='Graph tool file to dump the graph too - if empty, it will not be saved.') -def run(filename, header_bool, sub, obj, props, directed, compute_degrees, compute_pagerank, compute_hits, log_file, output): +def run(filename, directed, compute_degrees, compute_pagerank, compute_hits, log_file, output): + + def infer_index(h, options=[]): + for o in options: + if o in h: + return h.index(o) + return -1 + + def infer_predicate(h, options=[]): + for o in options: + if o in h: + return o + return '' try: # import modules locally @@ -42,22 +50,31 @@ def run(filename, header_bool, sub, obj, props, directed, compute_degrees, compu directions=['in', 'out', 'total'] id_col='name' - p=props.split(',') - predicate=p[0] + with open(filename, 'r') as f: + header=next(f).split('\t') + subj_index=infer_index(header, options=['node1', 'subject']) + obj_index=infer_index(header, options=['node2', 'object', 'value']) + predicate=infer_predicate(header, options=['property', 'predicate', 'label']) + + p=[] + for i, header_col in enumerate(header): + if i in [subj_index, obj_index]: continue + p.append(header_col) + with open(log_file, 'w') as writer: writer.write('loading the TSV graph now ...\n') G2 = load_graph_from_csv(filename, - skip_first=header_bool, + skip_first=True, directed=directed, hashed=True, - ecols=[sub,obj], - eprop_names=props.split(','), + ecols=[subj_index,obj_index], + eprop_names=p, csv_options={'delimiter': '\t'}) writer.write('graph loaded! 
It has %d nodes and %d edges\n' % (G2.num_vertices(), G2.num_edges())) writer.write('\n###Top relations:\n') - for rel, freq in gtanalysis.get_topN_relations(G2): + for rel, freq in gtanalysis.get_topN_relations(G2, pred_property=predicate): writer.write('%s\t%d\n' % (rel, freq)) if compute_degrees: diff --git a/kgtk/gt/analysis_utils.py b/kgtk/gt/analysis_utils.py index f4f9e89c7..d2f3da9c1 100644 --- a/kgtk/gt/analysis_utils.py +++ b/kgtk/gt/analysis_utils.py @@ -81,9 +81,9 @@ def compute_stats(g, direction): 'stdev_degree': stdev_degree } -def get_topN_relations(g, N=10): +def get_topN_relations(g, N=10, pred_property='predicate'): rel_freq=defaultdict(int) for i, e in enumerate(g.edges()): - r=g.edge_properties['predicate'][e] + r=g.edge_properties[pred_property][e] rel_freq[r]+=1 return sorted(rel_freq.items(), key=lambda x: x[1], reverse=True)[:N] From c1bb5f757a5c95021b7039b50c0422b8c8728104 Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Fri, 1 May 2020 14:51:20 -0700 Subject: [PATCH 044/278] Support two or three character language codes. Relax the constraints on location coordinates. --- kgtk/join/kgtkvalue.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/kgtk/join/kgtkvalue.py b/kgtk/join/kgtkvalue.py index f03390101..b0b83b040 100644 --- a/kgtk/join/kgtkvalue.py +++ b/kgtk/join/kgtkvalue.py @@ -188,7 +188,8 @@ def is_language_qualified_string(self, idx: typing.Optional[int] = None)->bool: v: str = self.get_item(idx) return v.startswith("'") - language_qualified_string_re: typing.Pattern = re.compile(r"^(?P<text>'(?:[^']|\\.)*')@(?P<lang>[a-zA-Z][a-zA-Z])$") + # Support two or three character language codes. + language_qualified_string_re: typing.Pattern = re.compile(r"^(?P<text>'(?:[^']|\\.)*')@(?P<lang>[a-zA-Z]{2,3})$") def is_valid_language_qualified_string(self, idx: typing.Optional[int] = None)->bool: """ @@ -207,7 +208,12 @@ def is_valid_language_qualified_string(self, idx: typing.Optional[int] = None)-> lang: str = m.group("lang") # print("lang: %s" % lang) try: - languages.get(alpha2=lang.lower()) + if len(lang) == 2: + # Two-character language codes. + languages.get(alpha2=lang.lower()) + else: + # Three-character language codes. 
+ languages.get(bibliographic=lang.lower()) return True except KeyError: return False @@ -223,7 +229,8 @@ def is_location_coordinates(self, idx: typing.Optional[int] = None)->bool: v: str = self.get_item(idx) return v.startswith("@") - location_coordinates_re: typing.Pattern = re.compile(r"^@(?P<lat>[-+]?\d{3}\.\d{5})/(?P<lon>[-+]?\d{3}\.\d{5})$") + #location_coordinates_re: typing.Pattern = re.compile(r"^@(?P<lat>[-+]?\d{3}\.\d{5})/(?P<lon>[-+]?\d{3}\.\d{5})$") + location_coordinates_re: typing.Pattern = re.compile(r"^@(?P<lat>[-+]?(?:(?:\d+(?:\.\d*)?)|(?:\.\d+)))/(?P<lon>[-+]?(?:(?:\d+(?:\.\d*)?)|(?:\.\d+)))$") def is_valid_location_coordinates(self, idx: typing.Optional[int] = None)->bool: """ From 624ec231331aec3c224d27e25fee94b28b09c489 Mon Sep 17 00:00:00 2001 From: ckxz105 Date: Fri, 1 May 2020 16:29:38 -0700 Subject: [PATCH 045/278] update sentence generating algorithm --- kgtk/cli/text_embedding.py | 203 ++++++++++++++++++++++--------------- 1 file changed, 123 insertions(+), 80 deletions(-) diff --git a/kgtk/cli/text_embedding.py b/kgtk/cli/text_embedding.py index 77004a2d9..35f8e8c35 100644 --- a/kgtk/cli/text_embedding.py +++ b/kgtk/cli/text_embedding.py @@ -65,11 +65,12 @@ def __init__(self, model_name=None, query_server=None, cache_config: dict = {}): self.qnodes_descriptions = dict() self.vectors_map = dict() self.property_labels_dict = dict() + self.q_node_to_label = dict() + self.node_labels = dict() self.vectors_2D = None + self.vector_dump_file = None self.gt_nodes = set() self.candidates = defaultdict(dict) - self.vector_dump_file = None - self.q_node_to_label = dict() self.metadata = [] self.gt_indexes = set() self.input_format = "" @@ -116,6 +117,99 @@ def send_sparql_query(self, query_body: str): except: raise KGTKException("Sending Sparql query to {} failed!".format(self.wikidata_server)) + def _get_labels(self, nodes: typing.List[str]): + query_nodes = " ".join(["wd:{}".format(each) for each in nodes]) + query = """ + select ?item ?nodeLabel + where { + values ?item {""" + query_nodes + """} + ?item rdfs:label ?nodeLabel. + FILTER(LANG(?nodeLabel) = "en"). + } + """ + results2 = self.send_sparql_query(query) + for each_res in results2: + node_id = each_res['item']['value'].split("/")[-1] + value = each_res['nodeLabel']['value'] + self.node_labels[node_id] = value + + def _get_labels_and_descriptions(self, query_qnodes: str, need_find_label: bool, need_find_description: bool): + query_body = """ + select ?item ?itemDescription ?itemLabel + where { + values ?item {""" + query_qnodes + """ } + SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". 
} + } + """ + results = self.send_sparql_query(query_body) + for each in results: + each_node = each['item']['value'].split("/")[-1] + if 'itemDescription' in each: + description = each['itemDescription']['value'] + else: + description = "" + if "itemLabel" in each: + label = each['itemLabel']['value'] + else: + label = "" + if need_find_label: + self.candidates[each_node]["label_properties"] = [label] + if need_find_description: + self.candidates[each_node]["description_properties"] = [description] + + def _get_property_values(self, query_qnodes, query_part_names, query_part_properties): + used_p_node_ids = set() + for part_name, part in zip(query_part_names, query_part_properties): + if part_name == "isa_properties": + self._get_labels(part) + for i, each in enumerate(part): + if each not in {"label", "description", "all"}: + query_body2 = """ + select ?item ?eachPropertyLabel + where {{ + values ?item {{{all_nodes}}} + ?item wdt:{qnode} ?eachProperty. + SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }} + }} + """.format(all_nodes=query_qnodes, qnode=each) + results2 = self.send_sparql_query(query_body2) + + for each_res in results2: + node_id = each_res['item']['value'].split("/")[-1] + value = each_res['eachPropertyLabel']['value'] + if part_name == "isa_properties" and self.node_labels[each].endswith("of"): + value = self.node_labels[each] + "||" + value + used_p_node_ids.add(node_id) + if part_name in self.candidates[node_id]: + self.candidates[node_id][part_name] = value + else: + self.candidates[node_id][part_name] = {value} + return used_p_node_ids + + def _get_all_properties(self, query_qnodes, used_p_node_ids, properties_list): + has_properties_set = set(properties_list[3]) + query_body3 = """ + select DISTINCT ?item ?p_entity ?p_entityLabel + where { + values ?item {""" + query_qnodes + """} + ?item ?p ?o. + FILTER regex(str(?p), "^http://www.wikidata.org/prop/P", "i") + BIND (IRI(REPLACE(STR(?p), "http://www.wikidata.org/prop", "http://www.wikidata.org/entity")) AS ?p_entity) . + SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". 
} + } + """ + results3 = self.send_sparql_query(query_body3) + for each in results3: + node_name = each['item']['value'].split("/")[-1] + p_node_id = each['p_entity']['value'].split("/")[-1] + p_node_label = each['p_entityLabel']['value'] + if p_node_id not in used_p_node_ids: + if properties_list[3] == ["all"] or p_node_id in has_properties_set: + if "has_properties" in self.candidates[node_name]: + self.candidates[node_name]["has_properties"].add(p_node_label) + else: + self.candidates[node_name]["has_properties"] = {p_node_label} + def get_item_description(self, qnodes: typing.List[str] = None, target_properties: dict = {}): """ use sparql query to get the descriptions of given Q nodes @@ -127,7 +221,6 @@ def get_item_description(self, qnodes: typing.List[str] = None, target_propertie else: find_all_properties = False properties_list = [[] for _ in range(4)] - used_p_node_ids = set() names = ["labels", "descriptions", "isa_properties", "has_properties"] for k, v in target_properties.items(): if v == "label_properties": @@ -159,80 +252,23 @@ def get_item_description(self, qnodes: typing.List[str] = None, target_propertie # this is used to get corresponding labels / descriptions if need_find_label or need_find_description: - query_body = """ - select ?item ?itemDescription ?itemLabel - where { - values ?item {""" + query_qnodes + """ } - SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". } - } - """ - results = self.send_sparql_query(query_body) - for each in results: - each_node = each['item']['value'].split("/")[-1] - if 'itemDescription' in each: - description = each['itemDescription']['value'] - else: - description = "" - if "itemLabel" in each: - label = each['itemLabel']['value'] - else: - label = "" - if need_find_label: - self.candidates[each_node]["label_properties"] = [label] - if need_find_description: - self.candidates[each_node]["description_properties"] = [description] - - # this is used to get corresponding P node labels - query_body2 = "select ?item" - part2 = "" - for name, part in zip(names, properties_list): - for i, each in enumerate(part): - if each not in {"label", "description", "all"}: - used_p_node_ids.add(each) - query_body2 += " ?{}_{}Label".format(name, i) - part2 += """?item wdt:{} ?{}_{}. \n""".format(each, name, i) - query_body2 += """ - where { - values ?item {""" + query_qnodes + "}" - - query_body2 += part2 + """ - SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". 
} - } - """ - results2 = self.send_sparql_query(query_body2) - for each in results2: - node_name = each['item']['value'].split("/")[-1] - for name, part in zip(names, properties_list): - if len(part) > 0: - properties_res = set() - for i in range(len(part)): - property_key = '{}_{}Label'.format(name, i) - if property_key in each: - properties_res.add(each[property_key]['value']) - self.candidates[node_name][name] = properties_res + self._get_labels_and_descriptions(query_qnodes, need_find_label, need_find_description) + + if len(properties_list[3]) > len(qnodes): + # in this condition, we have too many properties need to be queried, it will waste time + # query to get all properties then filtering would save more times + find_all_properties = True + query_part2_names = names[:3] + query_part2_properties = properties_list[:3] + else: + query_part2_names = names + query_part2_properties = properties_list + # this is used to get corresponding labels of properties values + used_p_node_ids = self._get_property_values(query_qnodes, query_part2_names, query_part2_properties) # if need get all properties, we need to run extra query if find_all_properties: - query_body3 = """ - select DISTINCT ?item ?p_entity ?p_entityLabel - where { - values ?item {""" + query_qnodes + """} - ?item ?p ?o. - FILTER regex(str(?p), "^http://www.wikidata.org/prop/P", "i") - BIND (IRI(REPLACE(STR(?p), "http://www.wikidata.org/prop", "http://www.wikidata.org/entity")) AS ?p_entity) . - SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". } - } - """ - results3 = self.send_sparql_query(query_body3) - for each in results3: - node_name = each['item']['value'].split("/")[-1] - p_node_id = each['p_entity']['value'].split("/")[-1] - p_node_label = each['p_entityLabel']['value'] - if p_node_id not in used_p_node_ids: - if "has_properties" in self.candidates[node_name]: - self.candidates[node_name]["has_properties"].add(p_node_label) - else: - self.candidates[node_name]["has_properties"] = {p_node_label} + self._get_all_properties(query_qnodes, used_p_node_ids, properties_list) for each_node_id in qnodes: each_sentence = self.attribute_to_sentence(self.candidates[each_node_id], each_node_id) @@ -260,7 +296,6 @@ def read_input(self, file_path: str, skip_nodes_set: set = None, if input_format == "test_format": self.input_format = input_format input_df = pd.read_csv(file_path) - candidates = {} gt = {} count = 0 if "GT_kg_id" in input_df.columns: @@ -297,7 +332,7 @@ def read_input(self, file_path: str, skip_nodes_set: set = None, temp = set(temp) - to_remove_q count += len(temp) self.gt_nodes.add(each[gt_column_id]) - self.get_item_description(temp, target_properties, label) + self.get_item_description(temp, target_properties) self._logger.info("Totally {} rows with {} candidates loaded.".format(str(len(gt)), str(count))) @@ -396,12 +431,20 @@ def attribute_to_sentence(self, v, node_id=None): concated_sentence += self.get_real_label_name(v["description_properties"][0]) if "isa_properties" in v and len(v["isa_properties"]) > 0: have_isa_properties = True - temp = [self.get_real_label_name(each) for each in v["isa_properties"]] - if concated_sentence != "" and temp[0] != "": + temp = "" + for each in v["isa_properties"]: + each = self.get_real_label_name(each) + if "||" in each: + if "instance of" in each: + each = each.split("||")[1] + else: + each = each.replace("||", " ") + temp += each + ", " + if concated_sentence != "" and temp != "": concated_sentence += " is a " - elif temp[0] != "": + elif 
concated_sentence == "": concated_sentence += "It is a " - concated_sentence += ", ".join(temp) + concated_sentence += temp[:-2] if "has_properties" in v and len(v["has_properties"]) > 0: temp = [self.get_real_label_name(each) for each in v["has_properties"]] if concated_sentence != "" and temp[0] != "": From 37572a41202ef022d218cb9e86c915601290a37e Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Fri, 1 May 2020 17:34:34 -0700 Subject: [PATCH 046/278] Validate numbers and quantities. --- kgtk/join/kgtkvalue.py | 188 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 179 insertions(+), 9 deletions(-) diff --git a/kgtk/join/kgtkvalue.py b/kgtk/join/kgtkvalue.py index b0b83b040..cdf235316 100644 --- a/kgtk/join/kgtkvalue.py +++ b/kgtk/join/kgtkvalue.py @@ -62,10 +62,10 @@ def is_empty(self, idx: typing.Optional[int] = None)->bool: v: str = self.get_item(idx) return len(v) == 0 - def is_number(self, idx: typing.Optional[int] = None)->bool: + def is_number_old(self, idx: typing.Optional[int] = None)->bool: """ Return False if this value is a list and idx is None. - Otherwise, return True if the first character is 0-9,_,-,. . + Otherwise, return True if the first character is 0-9,+,-,. . """ if self.is_list() and idx is None: return False @@ -73,7 +73,7 @@ def is_number(self, idx: typing.Optional[int] = None)->bool: v: str = self.get_item(idx) return v.startswith(("0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "+", "-", ".")) - def is_valid_number(self, idx: typing.Optional[int] = None)->bool: + def is_valid_number_old(self, idx: typing.Optional[int] = None)->bool: """ Return False if this value is a list and idx is None. Otherwise, return True if the first character is 0-9,_,-,. @@ -110,6 +110,174 @@ def is_valid_number(self, idx: typing.Optional[int] = None)->bool: return False + def is_number_or_quantity(self, idx: typing.Optional[int] = None)->bool: + """ + Return False if this value is a list and idx is None. + Otherwise, return True if the first character is 0-9,+,-,. . + """ + if self.is_list() and idx is None: + return False + + v: str = self.get_item(idx) + return v.startswith(("0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "+", "-", ".")) + + # The following lexical analysis is based on: + # https://docs.python.org/3/reference/lexical_analysis.html + + # The long integer suffix was part of Python 2. It was dropped in Python 3. + long_suffix_pat: str = r'[lL]' + + plus_or_minus_pat: str = r'[-+]' + + # Integer literals. + # + # Decimal integers, allowing leading zeros. + digit_pat: str = r'[0-9]' + decinteger_pat: str = r'(?:{digit}(?:_?{digit})*{long_suffix}?)'.format(digit=digit_pat, + long_suffix=long_suffix_pat) + bindigit_pat: str = r'[01]' + bininteger_pat: str = r'(?:0[bB](":_?{bindigit})+{long_suffix})'.format(bindigit=bindigit_pat, + long_suffix=long_suffix_pat) + octdigit_pat: str = r'[0-7]' + octinteger_pat: str = r'(?:0[oO](":_?{octdigit})+{long_suffix})'.format(octdigit=octdigit_pat, + long_suffix=long_suffix_pat) + hexdigit_pat: str = r'[0-7a-fA-F]' + hexinteger_pat: str = r'(?:0[xX](":_?{hexdigit})+{long_suffix})'.format(hexdigit=hexdigit_pat, + long_suffix=long_suffix_pat) + + integer_pat: str = r'(?:{decinteger}|{bininteger}|{octinteger}|{hexinteger})'.format(decinteger=decinteger_pat, + bininteger=bininteger_pat, + octinteger=octinteger_pat, + hexinteger=hexinteger_pat) + + # Floating point literals. 
+ digitpart_pat: str = r'(?:{digit}(?:_?{digit})*)'.format(digit=digit_pat) + fraction_pat: str = r'(?:\.{digitpart})'.format(digitpart=digitpart_pat) + pointfloat_pat: str = r'(?:{digitpart}?{fraction})|(?:{digitpart}\.)'.format(digitpart=digitpart_pat, + fraction=fraction_pat) + exponent_pat: str = r'(?:[eE]{plus_or_minus}?{digitpart})'.format(plus_or_minus=plus_or_minus_pat, + digitpart=digitpart_pat) + exponentfloat_pat: str = r'(?:{digitpart}|{pointfloat}){exponent}'.format(digitpart=digitpart_pat, + pointfloat=pointfloat_pat, + exponent=exponent_pat) + floatnumber_pat: str = r'(?:{pointfloat}|{exponentfloat})'.format(pointfloat=pointfloat_pat, + exponentfloat=exponentfloat_pat) + + # Imaginary literals. + imagnumber_pat: str = r'(?:{floatnumber}|{digitpart})[jJ]'.format(floatnumber=floatnumber_pat, + digitpart=digitpart_pat) + + # Numeric literals. + numeric_pat: str = r'(?:{plus_or_minus}?(?:{integer}|{floatnumber}|{imagnumber}))'.format(plus_or_minus=plus_or_minus_pat, + integer=integer_pat, + floatnumber=floatnumber_pat, + imagnumber=imagnumber_pat) + + # Tolerances + tolerance_pat: str = r'(?:\[{numeric},{numeric}\])'.format(numeric=numeric_pat) + + # SI units taken from: + # http://www.csun.edu/~vceed002/ref/measurement/units/units.pdf + # + # Note: if Q were in this list, it would conflict with Wikidata nodes (below). + si_unit_pat: str = r'(?:m|kg|s|C|K|mol|cd|F|M|A|N|ohms|V|J|Hz|lx|H|Wb|V\W|Pa)' + si_power_pat: str = r'(?:-1|2|3)' # Might need more. + si_combiner_pat: str = r'[./]' + si_pat: str = r'(?:{si_unit}{si_power}?(?:{si_combiner}{si_unit}{si_power}?)*)'.format(si_unit=si_unit_pat, + si_combiner=si_combiner_pat, + si_power=si_power_pat) + # Wikidata nodes (for units): + nonzero_digit_pat: str = r'[1-9]' + wikidata_node_pat: str = r'(?:Q{nonzero_digit}{digit}*)'.format(nonzero_digit=nonzero_digit_pat, + digit=digit_pat) + + units_pat: str = r'(?:{si}|{wikidata_node})'.format(si=si_pat, + wikidata_node=wikidata_node_pat) + + + # This definition matches numbers or quantities. + number_or_quantity_pat: str = r'{numeric}{tolerance}?{units}?'.format(numeric=numeric_pat, + tolerance=tolerance_pat, + units=units_pat) + # This definition for quantity excludes plain numbers. + quantity_pat: str = r'{numeric}(?:(?:{tolerance}{units}?)|{units})'.format(numeric=numeric_pat, + tolerance=tolerance_pat, + units=units_pat) + # This matches numbers or quantities. + number_or_quantity_re: typing.Pattern = re.compile(r'^' + number_or_quantity_pat + r'$') + + # This matches numbers but not quantities. + number_re: typing.Pattern = re.compile(r'^' + numeric_pat + r'$') + + # This matches quantities excluding numbers. + quantity_re: typing.Pattern = re.compile(r'^' + quantity_pat + r'$') + + def is_valid_number_or_quantity(self, idx: typing.Optional[int] = None)->bool: + """ + Return False if this value is a list and idx is None. + Otherwise, return True if the first character is 0-9,_,-,. + and it is either a Python-compatible number or an enhanced + quantity. + """ + if self.is_list() and idx is None: + return False + + v: str = self.get_item(idx) + if not v.startswith(("0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "+", "-", ".")): + return False + + m: typing.Optional[typing.Match] = KgtkValue.number_or_quantity_re.match(v) + return m is not None + + + def is_valid_number(self, idx: typing.Optional[int] = None)->bool: + """ + Return False if this value is a list and idx is None. + Otherwise, return True if the first character is 0-9,_,-,. 
+ and it is a Python-compatible number (with optional limited enhancements). + + Examples: + 1 + 123 + -123 + +123 + 0b101 + 0o277 + 0x24F + .4 + 0.4 + 10. + 10.4 + 10.4e10 + """ + if self.is_list() and idx is None: + return False + + v: str = self.get_item(idx) + if not v.startswith(("0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "+", "-", ".")): + return False + + m: typing.Optional[typing.Match] = KgtkValue.number_re.match(v) + return m is not None + + + def is_valid_quantity(self, idx: typing.Optional[int] = None)->bool: + """ + Return False if this value is a list and idx is None. + Otherwise, return True if the first character is 0-9,_,-,. + and it is an enhanced quantity. + """ + if self.is_list() and idx is None: + return False + + v: str = self.get_item(idx) + if not v.startswith(("0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "+", "-", ".")): + return False + + m: typing.Optional[typing.Match] = KgtkValue.quantity_re.match(v) + return m is not None + + def is_string(self, idx: typing.Optional[int] = None)->bool: """ Return False if this value is a list and idx is None. @@ -163,7 +331,7 @@ def is_symbol(self, idx: typing.Optional[int] = None)->bool: if self.is_list() and idx is None: return False - return not (self.is_number(idx) or self.is_string(idx) or self.is_structured_literal(idx)) + return not (self.is_number_or_quantity(idx) or self.is_string(idx) or self.is_structured_literal(idx)) def is_boolean(self, idx: typing.Optional[int] = None)->bool: """ @@ -361,8 +529,8 @@ def is_valid_literal(self, idx: typing.Optional[int] = None)->bool: if self.is_string(idx): return self.is_valid_string(idx) - elif self.is_number(idx): - return self.is_valid_number(idx) + elif self.is_number_or_quantity(idx): + return self.is_valid_number_or_quantity(idx) elif self.is_structured_literal(idx): if self.is_language_qualified_string(idx): return self.is_valid_language_qualified_string(idx) @@ -373,7 +541,7 @@ def is_valid_literal(self, idx: typing.Optional[int] = None)->bool: elif self.is_extension(idx): return False # no validation presently available. else: - return False # Quantities will reach here at present. + return False # Shouldn't get here. 
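To see how the composed patterns above behave, here is a simplified, self-contained sketch; it reimplements only the decimal part of the grammar (the real patterns also cover binary/octal/hex integers, underscore separators, and a much longer unit list), so treat it as an illustration rather than the module's exact behavior:

import re

# Simplified building blocks: a signed decimal number with optional exponent,
# an optional [low,high] tolerance, and optional units (SI symbol or Q-node).
numeric = r'[-+]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?'
tolerance = r'\[{n},{n}\]'.format(n=numeric)
units = r'(?:m|kg|s|A|K|mol|cd|Q[1-9]\d*)'
# A number is bare; a quantity additionally carries a tolerance and/or units.
number_re = re.compile(r'^{n}$'.format(n=numeric))
quantity_re = re.compile(r'^{n}(?:{t}{u}?|{u})$'.format(n=numeric, t=tolerance, u=units))

for sample in ['10.4e10', '123kg', '12[11.5,12.5]m', '5Q12345']:
    print(sample, bool(number_re.match(sample)), bool(quantity_re.match(sample)))
# '10.4e10' matches number_re but not quantity_re; the other three are quantities.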
else: return False @@ -422,11 +590,13 @@ def describe(self, idx: typing.Optional[int] = None)->str: return "String" else: return "Invalid String" - elif self.is_number(idx): + elif self.is_number_or_quantity(idx): if self.is_valid_number(idx): return "Number" + elif self.is_valid_quantity(idx): + return "Quantity" else: - return "Invalid Number" + return "Invalid Number or Quantity" elif self.is_structured_literal(idx): if self.is_language_qualified_string(idx): if self.is_valid_language_qualified_string(idx): From c0762dc609af5fd0021b50a3af3ad79471a634ae Mon Sep 17 00:00:00 2001 From: saggu Date: Fri, 1 May 2020 17:43:17 -0700 Subject: [PATCH 047/278] add option to output stats only --- kgtk/cli/gt_loader.py | 224 ++++++++++++++++++++++-------------------- 1 file changed, 117 insertions(+), 107 deletions(-) diff --git a/kgtk/cli/gt_loader.py b/kgtk/cli/gt_loader.py index 650a1c06e..bfc5da2b9 100644 --- a/kgtk/cli/gt_loader.py +++ b/kgtk/cli/gt_loader.py @@ -17,110 +17,120 @@ def add_arguments(parser): """ parser.add_argument(action="store", type=str, dest="filename", metavar='filename', help='filename here') parser.add_argument('--directed', action='store_true', dest="directed", help="Is the graph directed or not?") - parser.add_argument('--degrees', action='store_true', dest='compute_degrees', help="Whether or not to compute degree distribution.") - parser.add_argument('--pagerank', action='store_true', dest='compute_pagerank', help="Whether or not to compute PageRank centraility.") - parser.add_argument('--hits', action='store_true', dest='compute_hits', help="Whether or not to compute HITS centraility.") - parser.add_argument('--log', action='store', type=str, dest='log_file', help='Log file for summarized statistics of the graph.', default="./log.txt") - parser.add_argument('-o', '--out', action='store', type=str, dest='output', help='Graph tool file to dump the graph too - if empty, it will not be saved.') + parser.add_argument('--degrees', action='store_true', dest='compute_degrees', + help="Whether or not to compute degree distribution.") + parser.add_argument('--pagerank', action='store_true', dest='compute_pagerank', + help="Whether or not to compute PageRank centrality.") + parser.add_argument('--hits', action='store_true', dest='compute_hits', + help="Whether or not to compute HITS centrality.") + parser.add_argument('--log', action='store', type=str, dest='log_file', + help='Log file for summarized statistics of the graph.', default="./log.txt") + parser.add_argument('-o', '--out', action='store', type=str, dest='output', + help='Graph tool file to dump the graph to - if empty, it will not be saved.') + parser.add_argument('--output-stats', action='store_true', dest='output_stats', + help='do not output the graph but statistics only') + + + def run(filename, directed, compute_degrees, compute_pagerank, compute_hits, log_file, output, output_stats): from kgtk.exceptions import KGTKException def infer_index(h, options=[]): for o in options: if o in h: return h.index(o) return -1 def infer_predicate(h, options=[]): for o in options: if o in h: return o return '' try: # import modules locally import socket from graph_tool import load_graph_from_csv from graph_tool import centrality import kgtk.gt.analysis_utils as gtanalysis import sys # hardcoded values useful for the script. 
It has %d nodes and %d edges\n' % (G2.num_vertices(), G2.num_edges())) - writer.write('\n###Top relations:\n') - for rel, freq in gtanalysis.get_topN_relations(G2, pred_property=predicate): - writer.write('%s\t%d\n' % (rel, freq)) - - if compute_degrees: - writer.write('\n###Degrees:\n') - for direction in directions: - degree_data=gtanalysis.compute_node_degree_hist(G2, direction) - max_degree=len(degree_data)-1 - mean_degree, std_degree= gtanalysis.compute_avg_node_degree(G2, direction) - writer.write('%s degree stats: mean=%f, std=%f, max=%d\n' % (direction, mean_degree, std_degree, max_degree)) - - if compute_pagerank: - writer.write('\n###PageRank\n') - v_pr = G2.new_vertex_property('float') - centrality.pagerank(G2, prop=v_pr) - G2.properties[('v', 'vertex_pagerank')] = v_pr - writer.write('Max pageranks\n') - result=gtanalysis.get_topn_indices(G2, 'vertex_pagerank', 5, id_col) - for n_id, n_label, pr in result: - writer.write('%s\t%s\t%f\n' % (n_id, n_label, pr)) - - if compute_hits: - writer.write('\n###HITS\n') - hits_eig, G2.vp['vertex_hubs'], G2.vp['vertex_auth']=gtanalysis.compute_hits(G2) - writer.write('HITS hubs\n') - main_hubs=gtanalysis.get_topn_indices(G2, 'vertex_hubs', 5, id_col) - for n_id, n_label, hubness in main_hubs: - writer.write('%s\t%s\t%f\n' % (n_id, n_label, hubness)) - writer.write('HITS auth\n') - main_auth=gtanalysis.get_topn_indices(G2, 'vertex_auth', 5, id_col) - for n_id, n_label, authority in main_auth: - writer.write('%s\t%s\t%f\n' % (n_id, n_label, authority)) - - for e in G2.edges(): - sid, oid=e - lbl=G2.ep[predicate][e] - sys.stdout.write('%s\t%s\t%s\n' % (G2.vp[id_col][sid], lbl, G2.vp[id_col][oid])) - - for v in G2.vertices(): - v_id=G2.vp[id_col][v] - for vprop in G2.vertex_properties.keys(): - if vprop==id_col: continue - sys.stdout.write('%s\t%s\t%s\n' % (v_id, vprop, G2.vp[vprop][v])) - - if output: - writer.write('now saving the graph to %s\n' % output) - G2.save(output) - except Exception as e: - raise KGTKException('Error: ' + str(e)) - + parser.add_argument('--degrees', action='store_true', dest='compute_degrees', + help="Whether or not to compute degree distribution.") + parser.add_argument('--pagerank', action='store_true', dest='compute_pagerank', + help="Whether or not to compute PageRank centraility.") + parser.add_argument('--hits', action='store_true', dest='compute_hits', + help="Whether or not to compute HITS centraility.") + parser.add_argument('--log', action='store', type=str, dest='log_file', + help='Log file for summarized statistics of the graph.', default="./log.txt") + parser.add_argument('-o', '--out', action='store', type=str, dest='output', + help='Graph tool file to dump the graph too - if empty, it will not be saved.') + parser.add_argument('--output-stats', action='store_true', dest='output_stats', + help='do not output the graph but statistics only') + + + +def run(filename, directed, compute_degrees, compute_pagerank, compute_hits, log_file, output, output_stats): + from kgtk.exceptions import KGTKException + def infer_index(h, options=[]): + for o in options: + if o in h: + return h.index(o) + return -1 + + def infer_predicate(h, options=[]): + for o in options: + if o in h: + return o + return '' + + try: + # import modules locally + import socket + from graph_tool import load_graph_from_csv + from graph_tool import centrality + import kgtk.gt.analysis_utils as gtanalysis + import sys + + # hardcoded values useful for the script. 
Perhaps some of them should be exposed as arguments later + directions = ['in', 'out', 'total'] + id_col = 'name' + + with open(filename, 'r') as f: + header = next(f).split('\t') + subj_index = infer_index(header, options=['node1', 'subject']) + obj_index = infer_index(header, options=['node2', 'object', 'value']) + predicate = infer_predicate(header, options=['property', 'predicate', 'label']) + + p = [] + for i, header_col in enumerate(header): + if i in [subj_index, obj_index]: continue + p.append(header_col) + + with open(log_file, 'w') as writer: + + writer.write('loading the TSV graph now ...\n') + G2 = load_graph_from_csv(filename, + skip_first=True, + directed=directed, + hashed=True, + ecols=[subj_index, obj_index], + eprop_names=p, + csv_options={'delimiter': '\t'}) + + writer.write('graph loaded! It has %d nodes and %d edges\n' % (G2.num_vertices(), G2.num_edges())) + writer.write('\n###Top relations:\n') + for rel, freq in gtanalysis.get_topN_relations(G2, pred_property=predicate): + writer.write('%s\t%d\n' % (rel, freq)) + + if compute_degrees: + writer.write('\n###Degrees:\n') + for direction in directions: + degree_data = gtanalysis.compute_node_degree_hist(G2, direction) + max_degree = len(degree_data) - 1 + mean_degree, std_degree = gtanalysis.compute_avg_node_degree(G2, direction) + writer.write( + '%s degree stats: mean=%f, std=%f, max=%d\n' % (direction, mean_degree, std_degree, max_degree)) + + if compute_pagerank: + writer.write('\n###PageRank\n') + v_pr = G2.new_vertex_property('float') + centrality.pagerank(G2, prop=v_pr) + G2.properties[('v', 'vertex_pagerank')] = v_pr + writer.write('Max pageranks\n') + result = gtanalysis.get_topn_indices(G2, 'vertex_pagerank', 5, id_col) + for n_id, n_label, pr in result: + writer.write('%s\t%s\t%f\n' % (n_id, n_label, pr)) + + if compute_hits: + writer.write('\n###HITS\n') + hits_eig, G2.vp['vertex_hubs'], G2.vp['vertex_auth'] = gtanalysis.compute_hits(G2) + writer.write('HITS hubs\n') + main_hubs = gtanalysis.get_topn_indices(G2, 'vertex_hubs', 5, id_col) + for n_id, n_label, hubness in main_hubs: + writer.write('%s\t%s\t%f\n' % (n_id, n_label, hubness)) + writer.write('HITS auth\n') + main_auth = gtanalysis.get_topn_indices(G2, 'vertex_auth', 5, id_col) + for n_id, n_label, authority in main_auth: + writer.write('%s\t%s\t%f\n' % (n_id, n_label, authority)) + + if not output_stats: + for e in G2.edges(): + sid, oid = e + lbl = G2.ep[predicate][e] + sys.stdout.write('THIS IS EDGES\n') + sys.stdout.write('%s\t%s\t%s\n' % (G2.vp[id_col][sid], lbl, G2.vp[id_col][oid])) + + for v in G2.vertices(): + v_id = G2.vp[id_col][v] + for vprop in G2.vertex_properties.keys(): + if vprop == id_col: continue + sys.stdout.write('%s\t%s\t%s\n' % (v_id, vprop, G2.vp[vprop][v])) + + if output: + writer.write('now saving the graph to %s\n' % output) + G2.save(output) + except Exception as e: + raise KGTKException('Error: ' + str(e)) From 3664ba0cf0baca6dc48ab38b39b0b7d09eada890 Mon Sep 17 00:00:00 2001 From: saggu Date: Fri, 1 May 2020 17:43:44 -0700 Subject: [PATCH 048/278] tabs v spaces --- kgtk/gt/analysis_utils.py | 77 +++++++++++++++++++++++---------------- 1 file changed, 46 insertions(+), 31 deletions(-) diff --git a/kgtk/gt/analysis_utils.py b/kgtk/gt/analysis_utils.py index d2f3da9c1..c0f224084 100644 --- a/kgtk/gt/analysis_utils.py +++ b/kgtk/gt/analysis_utils.py @@ -1,33 +1,41 @@ import graph_tool as gtmain import graph_tool.all as gtall -import numpy as np # type: ignore +import numpy as np # type: ignore from collections import 
defaultdict -import matplotlib.pyplot as plt # type: ignore +import matplotlib.pyplot as plt # type: ignore + plt.rcParams.update({'font.size': 12}) -import seaborn as sns # type: ignore +import seaborn as sns # type: ignore + sns.set_style("whitegrid") + #### BASIC STATS #### def get_num_nodes(g): return g.num_vertices() + def get_num_edges(g): return g.num_edges() + #### DEGREES #### def compute_avg_node_degree(g, direction): return gtmain.stats.vertex_average(g, direction) + def compute_node_degree_hist(g, direction): return gtall.vertex_hist(g, direction, float_count=False) + def get_degree_maxn_counts(g, direction): return list(compute_node_degree_hist(g, direction)[0])[:10] + def plot_degrees(degrees, plottype='loglog', base=10, xlabel='', ylabel='', title=''): plt.loglog(degrees, basex=base, basey=base) plt.ylabel(ylabel) @@ -35,55 +43,62 @@ def plot_degrees(degrees, plottype='loglog', base=10, xlabel='', ylabel='', titl plt.title(title) plt.show() + #### CENTRALITY #### - + def compute_betweenness(g): - bn, be=gtmain.centrality.betweenness(g) + bn, be = gtmain.centrality.betweenness(g) return bn, be + def compute_pagerank(g): v_pr = g.new_vertex_property('float') gtmain.centrality.pagerank(g, prop=v_pr) return v_pr + def compute_hits(g): - hits_eig, v_hubs, v_auth=gtmain.centrality.hits(g) + hits_eig, v_hubs, v_auth = gtmain.centrality.hits(g) return hits_eig, v_hubs, v_auth - + + def get_max_node(g, prop): - max_pr=0.0 - max_pr_vertex=None + max_pr = 0.0 + max_pr_vertex = None for v in g.vertices(): - vertex_pr=g.vp[prop][v] - if vertex_pr>max_pr: - max_pr=vertex_pr - max_pr_vertex=g.vp['_graphml_vertex_id'][v] - + vertex_pr = g.vp[prop][v] + if vertex_pr > max_pr: + max_pr = vertex_pr + max_pr_vertex = g.vp['_graphml_vertex_id'][v] + return max_pr, max_pr_vertex + def get_topn_indices(g, prop, n, print_prop): - a=g.vp[prop].a - ind = np.argpartition(a, -n)[-n:] - result=[] - for i in ind: - result.append([i, g.vp[print_prop][i], g.vp[prop][i]]) - return result + a = g.vp[prop].a + ind = np.argpartition(a, -n)[-n:] + result = [] + for i in ind: + result.append([i, g.vp[print_prop][i], g.vp[prop][i]]) + return result + #### RUN ALL STATS #### - + def compute_stats(g, direction): - avg_degree, stdev_degree=compute_avg_node_degree(g, direction) + avg_degree, stdev_degree = compute_avg_node_degree(g, direction) return { - 'num_nodes': get_num_nodes(g), - 'num_edges': get_num_edges(g), - 'avg_degree': avg_degree, - 'degree_maxn_counts': get_degree_maxn_counts(g, direction), - 'stdev_degree': stdev_degree - } + 'num_nodes': get_num_nodes(g), + 'num_edges': get_num_edges(g), + 'avg_degree': avg_degree, + 'degree_maxn_counts': get_degree_maxn_counts(g, direction), + 'stdev_degree': stdev_degree + } + def get_topN_relations(g, N=10, pred_property='predicate'): - rel_freq=defaultdict(int) + rel_freq = defaultdict(int) for i, e in enumerate(g.edges()): - r=g.edge_properties[pred_property][e] - rel_freq[r]+=1 + r = g.edge_properties[pred_property][e] + rel_freq[r] += 1 return sorted(rel_freq.items(), key=lambda x: x[1], reverse=True)[:N] From 47126cfbc7a450b3d40558bd343efd52c38a468c Mon Sep 17 00:00:00 2001 From: saggu Date: Fri, 1 May 2020 18:32:15 -0700 Subject: [PATCH 049/278] add in and out degrees as well --- kgtk/cli/gt_loader.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kgtk/cli/gt_loader.py b/kgtk/cli/gt_loader.py index bfc5da2b9..c65514849 100644 --- a/kgtk/cli/gt_loader.py +++ b/kgtk/cli/gt_loader.py @@ -31,7 +31,6 @@ def add_arguments(parser): help='do 
not output the graph but statistics only') - def run(filename, directed, compute_degrees, compute_pagerank, compute_hits, log_file, output, output_stats): from kgtk.exceptions import KGTKException def infer_index(h, options=[]): @@ -116,15 +115,18 @@ def infer_predicate(h, options=[]): for n_id, n_label, authority in main_auth: writer.write('%s\t%s\t%f\n' % (n_id, n_label, authority)) + sys.stdout.write('node1\tproperty\tnode2\n') if not output_stats: for e in G2.edges(): sid, oid = e lbl = G2.ep[predicate][e] - sys.stdout.write('THIS IS EDGES\n') sys.stdout.write('%s\t%s\t%s\n' % (G2.vp[id_col][sid], lbl, G2.vp[id_col][oid])) for v in G2.vertices(): v_id = G2.vp[id_col][v] + + sys.stdout.write('{}\t{}\t{}\n'.format(v_id, 'vertex_in_degree', v.in_degree())) + sys.stdout.write('{}\t{}\t{}\n'.format(v_id, 'vertex_out_degree', v.out_degree())) for vprop in G2.vertex_properties.keys(): if vprop == id_col: continue sys.stdout.write('%s\t%s\t%s\n' % (v_id, vprop, G2.vp[vprop][v])) From a2ba02ae5658f1a5da4d2a253d1549e3671a9989 Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Fri, 1 May 2020 22:33:55 -0700 Subject: [PATCH 050/278] Use pycountry instead of iso-639, pycountry is better maintained. --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 42c81a774..bc68dd469 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,4 +13,4 @@ etk==2.2.1 simplejson pyrallel.lib attrs -iso-639 +pycountry From f65d3db32f5e86bad974513ee229d487a55e1b37 Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Sat, 2 May 2020 00:31:50 -0700 Subject: [PATCH 051/278] Use iso-639 and pycountry both. --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index bc68dd469..44d1e9524 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,3 +14,4 @@ simplejson pyrallel.lib attrs pycountry +iso-639 From 5df36217573556f02049b9466369c61b03f73c43 Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Sat, 2 May 2020 00:34:54 -0700 Subject: [PATCH 052/278] Support allowing month 0 and day 0 in dates. Support ignoring internal quotes in strings and language qualified strings. Check for two-character language codes, three-character language codes, retired language codes, language group codes, and two-character language codes with suffixes (typically country or dialect, unchecked). --- kgtk/join/kgtkvalue.py | 118 ++++++++++++++++++++++++++++++++++------- 1 file changed, 98 insertions(+), 20 deletions(-) diff --git a/kgtk/join/kgtkvalue.py b/kgtk/join/kgtkvalue.py index cdf235316..85a3010da 100644 --- a/kgtk/join/kgtkvalue.py +++ b/kgtk/join/kgtkvalue.py @@ -7,7 +7,8 @@ from argparse import ArgumentParser import attr -from iso639 import languages # type: ignore +import iso639 # type: ignore +import pycountry # type: ignore import re import sys import typing @@ -18,7 +19,23 @@ class KgtkValue(KgtkFormat): value: str = attr.ib(validator=attr.validators.instance_of(str)) + allow_month_or_day_zero: bool = attr.ib(validator=attr.validators.instance_of(bool), default=True) + allow_additional_language_codes: bool = attr.ib(validator=attr.validators.instance_of(bool), default=True) + + # When allow_lax_strings is true, strings will be checked to see if they + # start and end with double quote ("), but we won't check if internal + # double quotes are escaped by backslash. 
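To make the lax/strict distinction concrete: the two string patterns introduced by this patch differ only in whether internal double quotes must be escaped. A small demonstration (the two regular expressions are copied from the patch; everything else is illustrative):

import re

lax_string_re = re.compile(r'^".*"$')
strict_string_re = re.compile(r'^"(?:[^"\\]|\\.)*"$')

sample = '"say "hi""'                                  # internal quotes, unescaped
print(bool(lax_string_re.match(sample)))               # True: only the outer quotes are checked
print(bool(strict_string_re.match(sample)))            # False: inner quotes must be backslash-escaped
print(bool(strict_string_re.match(r'"say \"hi\""')))   # True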
+ allow_lax_strings: bool = attr.ib(validator=attr.validators.instance_of(bool), default=True) + + # When allow_lax_lq_strings is true, language qualified strings will be + # checked to see if they start and end with single quote ('), but we won't + # check if internal single quotes are escaped by backslash. + allow_lax_lq_strings: bool = attr.ib(validator=attr.validators.instance_of(bool), default=True) + additional_language_codes: typing.List[str] = [ + "mo", # Retired, replaced by the codes for Romanian, but still appearing in wikidata. + ] + split_list_re: typing.Pattern = re.compile(r"(?<!\\)\|") def is_string(self, idx: typing.Optional[int] = None)->bool: v: str = self.get_item(idx) return v.startswith('"') - string_re: typing.Pattern = re.compile(r'^"(?:[^"]|\\.)*"$') + lax_string_re: typing.Pattern = re.compile(r'^".*"$') + strict_string_re: typing.Pattern = re.compile(r'^"(?:[^"\\]|\\.)*"$') def is_valid_string(self, idx: typing.Optional[int] = None)->bool: @@ -309,7 +327,11 @@ def is_valid_string(self, idx: typing.Optional[int] = None)->bool: v: str = self.get_item(idx) if not v.startswith('"'): return False - m: typing.Optional[typing.Match] = KgtkValue.string_re.match(v) + m: typing.Optional[typing.Match] + if self.allow_lax_strings: + m = KgtkValue.lax_string_re.match(v) + else: + m = KgtkValue.strict_string_re.match(v) return m is not None def is_structured_literal(self, idx: typing.Optional[int] = None)->bool: @@ -356,35 +378,84 @@ def is_language_qualified_string(self, idx: typing.Optional[int] = None)->bool: v: str = self.get_item(idx) return v.startswith("'") - # Support two or three character language codes. - language_qualified_string_re: typing.Pattern = re.compile(r"^(?P<text>'(?:[^']|\\.)*')@(?P<lang>[a-zA-Z]{2,3})$") + # Support two or three character language codes. Supports hyphenated codes + # with country codes or dialect names after a language code. + lax_language_qualified_string_re: typing.Pattern = re.compile(r"^(?P<text>'.*')@(?P<lang>[a-zA-Z]{2,3}(?:-[a-zA-Z]+)?)$") + strict_language_qualified_string_re: typing.Pattern = re.compile(r"^(?P<text>'(?:[^'\\]|\\.)*')@(?P<lang>[a-zA-Z]{2,3}(?:-[a-zA-Z]+)?)$") def is_valid_language_qualified_string(self, idx: typing.Optional[int] = None)->bool: - """ - Return False if this value is a list and idx is None. + """Return False if this value is a list and idx is None. Otherwise, return True if the value looks like a language-qualified string. + + The language code may be a two- or three-character code from ISO + 639-3, which replaces ISO 639-1 and ISO 639-2. In addition, wikidata + may include language codes, such as 'mo', that have been retired. The + additional_language_codes table supports these codes, when allowed. + + Wikidata may also contain collective language codes, such as "nah", + referring to the Nahuatl languages. These codes from ISO 639-5 are + accepted as a fallback when ISO 639-3 lookup fails. 
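The fallback chain described in this docstring can be sketched as a standalone function; the pycountry and iso639 calls below are the same ones the method body uses, but the wrapper itself is illustrative:

import iso639
import pycountry

def lang_code_known(lang: str) -> bool:
    lang = lang.lower()
    if len(lang) == 2:
        # Two-character codes: ISO 639-1 via pycountry.
        return pycountry.languages.get(alpha_2=lang) is not None
    if len(lang) == 3:
        # Three-character codes: try ISO 639-3 first.
        if pycountry.languages.get(alpha_3=lang) is not None:
            return True
        # Fall back to ISO 639-5 collective codes such as "nah".
        try:
            iso639.languages.get(part5=lang)
            return True
        except KeyError:
            return False
    return False

print(lang_code_known("en"), lang_code_known("nah"), lang_code_known("zzz"))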
+ + https://meta.wikimedia.org/wiki/Special_language_codes + https://en.wikipedia.org/wiki/Template:ISO_639_name_be-tarask + """ if self.is_list() and idx is None: return False v: str = self.get_item(idx) - m: typing.Optional[typing.Match] = KgtkValue.language_qualified_string_re.match(v) + # print("checking %s" % v) + m: typing.Optional[typing.Match] + if self.allow_lax_lq_strings: + m = KgtkValue.lax_language_qualified_string_re.match(v) + else: + m = KgtkValue.strict_language_qualified_string_re.match(v) if m is None: + # print("match failed for %s" % v) return False # Validate the language code: - lang: str = m.group("lang") + lang: str = m.group("lang").lower() # print("lang: %s" % lang) - try: - if len(lang) == 2: - # Two-character language codes. - languages.get(alpha2=lang.lower()) - else: - # Three-character language codes. - languages.get(bibliographic=lang.lower()) + + if len(lang) == 2: + # Two-character language codes. + if pycountry.languages.get(alpha_2=lang) is not None: + return True + + elif len(lang) == 3: + # Three-character language codes. + if pycountry.languages.get(alpha_3=lang) is not None: + return True + + # Perhaps this is a collective code from ISO 639-5? + try: + iso639.languages.get(part5=lang) + return True + except KeyError: + pass + + # Wikidata contains entries such as: + # 'panamenha'@pt-br # language code followed by country code + # 'Ecuador'@es-formal # language code followed by dialect name + # + # If we see a dash, we'll check the language code by itself. + save_lang: str = lang # for the debug print below. + country_or_dialect: str = "" + if "-" in lang: + (lang, country_or_dialect) = lang.split("-", 1) + + # Assume that this is a two-character code. If necessary, + # we can try three-character codes, too. + if pycountry.languages.get(alpha_2=lang) is not None: + return True + + # If there's a table of additional language codes, check there: + if self.allow_additional_language_codes and lang in self.additional_language_codes: return True - except KeyError: - return False + + print("save_lang: %s lang: %s country_or_dialect: %s" % (save_lang, lang, country_or_dialect)) + return False def is_location_coordinates(self, idx: typing.Optional[int] = None)->bool: """ @@ -449,7 +520,10 @@ def is_date_and_times(self, idx: typing.Optional[int] = None)->bool: v: str = self.get_item(idx) return v.startswith("^") - date_and_times_re: typing.Pattern = re.compile(r"^\^(?P[0-9]{4})(?:(?P-)?(?P1[0-2]|0[1-9])(?:(?(hyphen)-)(?P3[01]|0[1-9]|[12][0-9])))T(?P2[0-3]|[01][0-9])(?:(?(hyphen):)(?P[0-5][0-9])(?:(?(hyphen):)(?P[0-5][0-9])))(?PZ|\[-+][0-9][0-9](?::[0-9][0-9])?)?(?P/[0-1]?[0-9])?$") + # This pattern allows month 00 and day 00, which are excluded by ISO 8601. 
+ date_and_times_re: typing.Pattern = re.compile(r"^\^(?P[0-9]{4})(?:(?P-)?(?P1[0-2]|0[0-9])(?:(?(hyphen)-)(?P3[01]|0[0-9]|[12][0-9])))T(?P2[0-3]|[01][0-9])(?:(?(hyphen):)(?P[0-5][0-9])(?:(?(hyphen):)(?P[0-5][0-9])))(?PZ|\[-+][0-9][0-9](?::[0-9][0-9])?)?(?P/[0-1]?[0-9])?$") + + strict_date_and_times_re: typing.Pattern = re.compile(r"^\^(?P[0-9]{4})(?:(?P-)?(?P1[0-2]|0[1-9])(?:(?(hyphen)-)(?P3[01]|0[1-9]|[12][0-9])))T(?P2[0-3]|[01][0-9])(?:(?(hyphen):)(?P[0-5][0-9])(?:(?(hyphen):)(?P[0-5][0-9])))(?PZ|\[-+][0-9][0-9](?::[0-9][0-9])?)?(?P/[0-1]?[0-9])?$") def is_valid_date_and_times(self, idx: typing.Optional[int] = None)->bool: """ @@ -504,7 +578,11 @@ def is_valid_date_and_times(self, idx: typing.Optional[int] = None)->bool: return False v: str = self.get_item(idx) - m: typing.Optional[typing.Match] = KgtkValue.date_and_times_re.match(v) + m: typing.Optional[typing.Match] + if self.allow_month_or_day_zero: + m = KgtkValue.date_and_times_re.match(v) + else: + m = KgtkValue.strict_date_and_times_re.match(v) return m is not None def is_extension(self, idx: typing.Optional[int] = None)->bool: From 245edca1d4d7b5e8cbccf9c8bc3bfdea4853739d Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Sat, 2 May 2020 01:40:42 -0700 Subject: [PATCH 053/278] Support the value options on the command line. --- kgtk/cli/validate.py | 30 +++++++++++++++++++ kgtk/join/edgereader.py | 11 +++++++ kgtk/join/kgtkreader.py | 32 ++++++++++++++++++-- kgtk/join/kgtkvalue.py | 66 +++++++++++++++++++++++++++-------------- kgtk/join/nodereader.py | 11 +++++++ 5 files changed, 126 insertions(+), 24 deletions(-) diff --git a/kgtk/cli/validate.py b/kgtk/cli/validate.py index 3593ed784..c84a7fb6f 100644 --- a/kgtk/cli/validate.py +++ b/kgtk/cli/validate.py @@ -18,6 +18,7 @@ from kgtk.join.enumnameaction import EnumNameAction from kgtk.join.kgtkformat import KgtkFormat from kgtk.join.kgtkreader import KgtkReader +from kgtk.join.kgtkvalue import DEFAULT_ADDITIONAL_LANGUAGE_CODES, KgtkValueOptions from kgtk.join.validationaction import ValidationAction def parser(): @@ -33,6 +34,21 @@ def add_arguments(parser): parser (argparse.ArgumentParser) """ parser.add_argument( "kgtk_files", nargs="*", help="The KGTK file(s) to validate. 
May be omitted or '-' for stdin.", type=Path) + + parser.add_argument( "--additional-language-codes", dest="additional_language_codes", + help="Additional language codes.", nargs="*", default=DEFAULT_ADDITIONAL_LANGUAGE_CODES) + + parser.add_argument( "--allow-additional-language-codes", dest="allow_additional_language_codes", + help="Allow certain language codes not found in the current version of ISO 639-3 or ISO 639-5.", action='store_true') + + parser.add_argument( "--allow-lax-strings", dest="allow_lax_strings", + help="Do not check if double quotes are backslashed inside strings.", action='store_true') + + parser.add_argument( "--allow-lax-lq-strings", dest="allow_lax_lq_strings", + help="Do not check if single quotes are backslashed inside language qualified strings.", action='store_true') + + parser.add_argument( "--allow-month-or-day-zero", dest="allow_month_or_day_zero", + help="Allow month or day zero in dates.", action='store_true') parser.add_argument( "--blank-id-line-action", dest="blank_id_line_action", help="The action to take when a blank id field is detected.", @@ -143,6 +159,11 @@ def run(kgtk_files: typing.Optional[typing.List[typing.Optional[Path]]], invalid_value_action: ValidationAction = ValidationAction.REPORT, header_error_action: ValidationAction = ValidationAction.EXIT, unsafe_column_name_action: ValidationAction = ValidationAction.REPORT, + additional_language_codes: typing.List[str] = DEFAULT_ADDITIONAL_LANGUAGE_CODES, + allow_additional_language_codes: bool = False, + allow_lax_strings: bool = False, + allow_lax_lq_strings: bool = False, + allow_month_or_day_zero: bool = False, compression_type: typing.Optional[str] = None, gzip_in_parallel: bool = False, gzip_queue_size: int = KgtkReader.GZIP_QUEUE_SIZE_DEFAULT, @@ -161,6 +182,14 @@ def run(kgtk_files: typing.Optional[typing.List[typing.Optional[Path]]], # Select where to send error messages, defaulting to stderr. error_file: typing.TextIO = sys.stderr if errors_to_stderr else sys.stdout + # Build the value parsing option structure. 
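+    # The options are bundled once here, then shared by every KgtkReader
+    # opened below and by each KgtkValue those readers construct.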
+ value_options: KgtkValueOptions = KgtkValueOptions(allow_month_or_day_zero=allow_month_or_day_zero, + allow_lax_strings=allow_lax_strings, + allow_lax_lq_strings=allow_lax_lq_strings, + allow_additional_language_codes=allow_additional_language_codes, + additional_language_codes=additional_language_codes) + print("value_options.allow_month_or_day_zero = %s" % str(value_options.allow_month_or_day_zero)) + try: kgtk_file: typing.Optional[Path] for kgtk_file in kgtk_files: @@ -191,6 +220,7 @@ def run(kgtk_files: typing.Optional[typing.List[typing.Optional[Path]]], header_error_action=header_error_action, unsafe_column_name_action=unsafe_column_name_action, compression_type=compression_type, + value_options=value_options, gzip_in_parallel=gzip_in_parallel, gzip_queue_size=gzip_queue_size, column_separator=column_separator, diff --git a/kgtk/join/edgereader.py b/kgtk/join/edgereader.py index b15c464dc..0bdb3d4fb 100644 --- a/kgtk/join/edgereader.py +++ b/kgtk/join/edgereader.py @@ -13,6 +13,7 @@ from kgtk.join.closableiter import ClosableIter from kgtk.join.enumnameaction import EnumNameAction from kgtk.join.kgtkreader import KgtkReader +from kgtk.join.kgtkvalue import KgtkValueOptions, DEFAULT_KGTK_VALUE_OPTIONS from kgtk.join.validationaction import ValidationAction @attr.s(slots=True, frozen=False) @@ -37,6 +38,7 @@ def open_edge_file(cls, invalid_value_action: ValidationAction = ValidationAction.REPORT, header_error_action: ValidationAction = ValidationAction.EXIT, unsafe_column_name_action: ValidationAction = ValidationAction.REPORT, + value_options: KgtkValueOptions = DEFAULT_KGTK_VALUE_OPTIONS, compression_type: typing.Optional[str] = None, gzip_in_parallel: bool = False, gzip_queue_size: int = KgtkReader.GZIP_QUEUE_SIZE_DEFAULT, @@ -110,6 +112,7 @@ def open_edge_file(cls, invalid_value_action=invalid_value_action, header_error_action=header_error_action, unsafe_column_name_action=unsafe_column_name_action, + value_options=value_options, compression_type=compression_type, gzip_in_parallel=gzip_in_parallel, gzip_queue_size=gzip_queue_size, @@ -166,6 +169,13 @@ def main(): error_file: typing.TextIO = sys.stdout if args.errors_to_stdout else sys.stderr + # Build the value parsing option structure. 
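+    # Here the settings come from the parsed command-line Namespace; the
+    # underlying flags are registered by KgtkReader.add_shared_arguments().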
+ value_options: KgtkValueOptions = KgtkValueOptions(allow_month_or_day_zero=args.allow_month_or_day_zero, + allow_lax_strings=args.allow_lax_strings, + allow_lax_lq_strings=args.allow_lax_lq_strings, + allow_additional_language_codes=args.allow_additional_language_codes, + additional_language_codes=args.additional_language_codes) + er: EdgeReader = EdgeReader.open(args.kgtk_file, force_column_names=args.force_column_names, skip_first_record=args.skip_first_record, @@ -183,6 +193,7 @@ def main(): invalid_value_action=args.invalid_value_action, header_error_action=args.header_error_action, unsafe_column_name_action=args.unsafe_column_name_action, + value_options=value_options, compression_type=args.compression_type, gzip_in_parallel=args.gzip_in_parallel, gzip_queue_size=args.gzip_queue_size, diff --git a/kgtk/join/kgtkreader.py b/kgtk/join/kgtkreader.py index 916a46abf..f0831c3d3 100644 --- a/kgtk/join/kgtkreader.py +++ b/kgtk/join/kgtkreader.py @@ -21,7 +21,7 @@ from kgtk.join.gzipprocess import GunzipProcess from kgtk.join.kgtkbase import KgtkBase from kgtk.join.kgtkformat import KgtkFormat -from kgtk.join.kgtkvalue import KgtkValue +from kgtk.join.kgtkvalue import KgtkValue, KgtkValueOptions, DEFAULT_KGTK_VALUE_OPTIONS, DEFAULT_ADDITIONAL_LANGUAGE_CODES from kgtk.join.validationaction import ValidationAction @attr.s(slots=True, frozen=False) @@ -83,6 +83,7 @@ class KgtkReader(KgtkBase, ClosableIter[typing.List[str]]): # Validate data cell values? invalid_value_action: ValidationAction = attr.ib(validator=attr.validators.instance_of(ValidationAction), default=ValidationAction.REPORT) + value_options: KgtkValueOptions = attr.ib(validator=attr.validators.instance_of(KgtkValueOptions), default=DEFAULT_KGTK_VALUE_OPTIONS) # Repair records with too many or too few fields? fill_short_lines: bool = attr.ib(validator=attr.validators.instance_of(bool), default=False) @@ -130,6 +131,7 @@ def open(cls, invalid_value_action: ValidationAction = ValidationAction.REPORT, header_error_action: ValidationAction = ValidationAction.EXIT, unsafe_column_name_action: ValidationAction = ValidationAction.REPORT, + value_options: KgtkValueOptions = DEFAULT_KGTK_VALUE_OPTIONS, compression_type: typing.Optional[str] = None, gzip_in_parallel: bool = False, gzip_queue_size: int = GZIP_QUEUE_SIZE_DEFAULT, @@ -249,6 +251,7 @@ def open(cls, invalid_value_action=invalid_value_action, header_error_action=header_error_action, unsafe_column_name_action=unsafe_column_name_action, + value_options=value_options, compression_type=compression_type, gzip_in_parallel=gzip_in_parallel, gzip_queue_size=gzip_queue_size, @@ -303,6 +306,7 @@ def open(cls, invalid_value_action=invalid_value_action, header_error_action=header_error_action, unsafe_column_name_action=unsafe_column_name_action, + value_options=value_options, compression_type=compression_type, gzip_in_parallel=gzip_in_parallel, gzip_queue_size=gzip_queue_size, @@ -343,6 +347,7 @@ def open(cls, invalid_value_action=invalid_value_action, header_error_action=header_error_action, unsafe_column_name_action=unsafe_column_name_action, + value_options=value_options, compression_type=compression_type, gzip_in_parallel=gzip_in_parallel, gzip_queue_size=gzip_queue_size, @@ -593,7 +598,7 @@ def _ignore_invalid_values(self, values: typing.List[str], line: str)->bool: value: str for idx, value in enumerate(values): if len(value) > 0: # Optimize the common case of empty columns. 
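                # Wrap the cell in a KgtkValue so that validation honors
                # the options selected on the command line.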
- kv: KgtkValue = KgtkValue(value) + kv: KgtkValue = KgtkValue(value, options=self.value_options) if not kv.is_valid(): problems.append("col %d (%s) value '%s'is an %s" % (idx, self.column_names[idx], value, kv.describe())) @@ -653,6 +658,21 @@ def to_map(self, row: typing.List[str])->typing.Mapping[str, str]: def add_shared_arguments(cls, parser: ArgumentParser): parser.add_argument(dest="kgtk_file", help="The KGTK file to read", type=Path, nargs="?") + parser.add_argument( "--additional-language-codes", dest="additional_language_codes", + help="Additional language codes.", nargs="*", default=DEFAULT_ADDITIONAL_LANGUAGE_CODES) + + parser.add_argument( "--allow-additional-language-codes", dest="allow_additional_language_codes", + help="Allow certain language codes not found in the current version of ISO 639-3 or ISO 639-5.", action='store_true') + + parser.add_argument( "--allow-lax-strings", dest="allow_lax_strings", + help="Do not check if double quotes are backslashed inside strings.", action='store_true') + + parser.add_argument( "--allow-lax-lq-strings", dest="allow_lax_lq_strings", + help="Do not check if single quotes are backslashed inside language qualified strings.", action='store_true') + + parser.add_argument( "--allow-month-or-day-zero", dest="allow_month_or_day_zero", + help="Allow month or day zero in dates.", action='store_true') + parser.add_argument( "--blank-required-field-line-action", dest="blank_line_action", help="The action to take when a line with a blank node1, node2, or id field (per mode) is detected.", type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXCLUDE) @@ -745,6 +765,13 @@ def main(): error_file: typing.TextIO = sys.stdout if args.errors_to_stdout else sys.stderr + # Build the value parsing option structure. + value_options: KgtkValueOptions = KgtkValueOptions(allow_month_or_day_zero=args.allow_month_or_day_zero, + allow_lax_strings=args.allow_lax_strings, + allow_lax_lq_strings=args.allow_lax_lq_strings, + allow_additional_language_codes=args.allow_additional_language_codes, + additional_language_codes=args.additional_language_codes) + kr: KgtkReader = KgtkReader.open(args.kgtk_file, force_column_names=args.force_column_names, skip_first_record=args.skip_first_record, @@ -764,6 +791,7 @@ def main(): invalid_value_action=args.invalid_value_action, header_error_action=args.header_error_action, unsafe_column_name_action=args.unsafe_column_name_action, + value_options=value_options, compression_type=args.compression_type, gzip_in_parallel=args.gzip_in_parallel, gzip_queue_size=args.gzip_queue_size, diff --git a/kgtk/join/kgtkvalue.py b/kgtk/join/kgtkvalue.py index 85a3010da..41dbd8b94 100644 --- a/kgtk/join/kgtkvalue.py +++ b/kgtk/join/kgtkvalue.py @@ -15,30 +15,51 @@ from kgtk.join.kgtkformat import KgtkFormat -@attr.s(slots=True, frozen=False) -class KgtkValue(KgtkFormat): - value: str = attr.ib(validator=attr.validators.instance_of(str)) +DEFAULT_ADDITIONAL_LANGUAGE_CODES: typing.List[str] = [ + "mo", # Retired, replaced by the codes for Romanian, but still appearing in wikidata. +] - allow_month_or_day_zero: bool = attr.ib(validator=attr.validators.instance_of(bool), default=True) - allow_additional_language_codes: bool = attr.ib(validator=attr.validators.instance_of(bool), default=True) + +@attr.s(slots=True, frozen=True) +class KgtkValueOptions: + """ + These options will affect some aspects of value processing. They are in a + seperate class for efficiency. + """ + + # Allow month 00 or day 00 in dates? 
This isn't really allowed by ISO + # 8601, but appears in wikidata. + allow_month_or_day_zero: bool = attr.ib(validator=attr.validators.instance_of(bool), default=False) # When allow_lax_strings is true, strings will be checked to see if they # start and end with double quote ("), but we won't check if internal # double quotes are excaped by backslash. - allow_lax_strings: bool = attr.ib(validator=attr.validators.instance_of(bool), default=True) + allow_lax_strings: bool = attr.ib(validator=attr.validators.instance_of(bool), default=False) # When allow_lax_lq_strings is true, language qualified strings will be # checked to see if they start and end with single quote ('), but we won't # check if internal single quotes are excaped by backslash. - allow_lax_lq_strings: bool = attr.ib(validator=attr.validators.instance_of(bool), default=True) + allow_lax_lq_strings: bool = attr.ib(validator=attr.validators.instance_of(bool), default=False) + + # Shall we allow additional language codes? + allow_additional_language_codes: bool = attr.ib(validator=attr.validators.instance_of(bool), default=False) + + # If this list gets long, we may want to turn it into a map to make lookup + # more efficient. + additional_language_codes: typing.List[str] = attr.ib(validator=attr.validators.deep_iterable(member_validator=attr.validators.instance_of(str), + iterable_validator=attr.validators.instance_of(list)), + default=DEFAULT_ADDITIONAL_LANGUAGE_CODES) - additional_language_codes: typing.List[str] = [ - "mo", # Retired, replaced by the codes for Romanian, but still appearing in wikidata. - ] +DEFAULT_KGTK_VALUE_OPTIONS: KgtkValueOptions = KgtkValueOptions() + +@attr.s(slots=True, frozen=False) +class KgtkValue(KgtkFormat): + value: str = attr.ib(validator=attr.validators.instance_of(str)) + options: KgtkValueOptions = attr.ib(validator=attr.validators.instance_of(KgtkValueOptions), default=DEFAULT_KGTK_VALUE_OPTIONS) split_list_re: typing.Pattern = re.compile(r"(?typing.List[str]: @@ -65,7 +86,7 @@ def get_values(self)->typing.List['KgtkValue']: result: typing.List['KgtkValue'] = [ ] v: str for v in self.get_list(): - result.append(KgtkValue(v)) + result.append(KgtkValue(v, options=self.options)) return result def is_empty(self, idx: typing.Optional[int] = None)->bool: @@ -328,7 +349,7 @@ def is_valid_string(self, idx: typing.Optional[int] = None)->bool: if not v.startswith('"'): return False m: typing.Optional[typing.Match] - if self.allow_lax_strings: + if self.options.allow_lax_strings: m = KgtkValue.lax_string_re.match(v) else: m = KgtkValue.strict_string_re.match(v) @@ -406,7 +427,7 @@ def is_valid_language_qualified_string(self, idx: typing.Optional[int] = None)-> v: str = self.get_item(idx) # print("checking %s" % v) m: typing.Optional[typing.Match] - if self.allow_lax_lq_strings: + if self.options.allow_lax_lq_strings: m = KgtkValue.lax_language_qualified_string_re.match(v) else: m = KgtkValue.strict_language_qualified_string_re.match(v) @@ -428,7 +449,7 @@ def is_valid_language_qualified_string(self, idx: typing.Optional[int] = None)-> if pycountry.languages.get(alpha_3=lang) is not None: return True - # Perhaps this is a collective code from ISO 639-5? + # Perhaps this is a collective (language family) code from ISO 639-5? try: iso639.languages.get(part5=lang) return True @@ -440,7 +461,7 @@ def is_valid_language_qualified_string(self, idx: typing.Optional[int] = None)-> # 'Ecuador'@es-formal # language code followed by dialect name # # If we see a dash, we'll check the language code by itself. 
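        # For example, "pt-br" is validated using just the base code "pt".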
- save_lang: str = lang # for the debug print below. + # save_lang: str = lang # for the debug print below. country_or_dialect: str = "" if "-" in lang: (lang, country_or_dialect) = lang.split("-", 1) @@ -448,13 +469,14 @@ def is_valid_language_qualified_string(self, idx: typing.Optional[int] = None)-> # Assume that this is a two-character code. If necessary, # we can try three-character codes, too. if pycountry.languages.get(alpha_2=lang) is not None: + # Note: we didn't check the country_or_dialect portion. return True # If there's a table of additional language codes, check there: - if self.allow_additional_language_codes and lang in self.additional_language_codes: + if self.options.allow_additional_language_codes and lang in self.options.additional_language_codes: return True - print("save_lang: %s lang: %s country_or_dialect: %s" % (save_lang, lang, country_or_dialect)) + # print("save_lang: %s lang: %s country_or_dialect: %s" % (save_lang, lang, country_or_dialect)) return False def is_location_coordinates(self, idx: typing.Optional[int] = None)->bool: @@ -521,7 +543,7 @@ def is_date_and_times(self, idx: typing.Optional[int] = None)->bool: return v.startswith("^") # This pattern allows month 00 and day 00, which are excluded by ISO 8601. - date_and_times_re: typing.Pattern = re.compile(r"^\^(?P[0-9]{4})(?:(?P-)?(?P1[0-2]|0[0-9])(?:(?(hyphen)-)(?P3[01]|0[0-9]|[12][0-9])))T(?P2[0-3]|[01][0-9])(?:(?(hyphen):)(?P[0-5][0-9])(?:(?(hyphen):)(?P[0-5][0-9])))(?PZ|\[-+][0-9][0-9](?::[0-9][0-9])?)?(?P/[0-1]?[0-9])?$") + lax_date_and_times_re: typing.Pattern = re.compile(r"^\^(?P[0-9]{4})(?:(?P-)?(?P1[0-2]|0[0-9])(?:(?(hyphen)-)(?P3[01]|0[0-9]|[12][0-9])))T(?P2[0-3]|[01][0-9])(?:(?(hyphen):)(?P[0-5][0-9])(?:(?(hyphen):)(?P[0-5][0-9])))(?PZ|\[-+][0-9][0-9](?::[0-9][0-9])?)?(?P/[0-1]?[0-9])?$") strict_date_and_times_re: typing.Pattern = re.compile(r"^\^(?P[0-9]{4})(?:(?P-)?(?P1[0-2]|0[1-9])(?:(?(hyphen)-)(?P3[01]|0[1-9]|[12][0-9])))T(?P2[0-3]|[01][0-9])(?:(?(hyphen):)(?P[0-5][0-9])(?:(?(hyphen):)(?P[0-5][0-9])))(?PZ|\[-+][0-9][0-9](?::[0-9][0-9])?)?(?P/[0-1]?[0-9])?$") @@ -579,8 +601,8 @@ def is_valid_date_and_times(self, idx: typing.Optional[int] = None)->bool: v: str = self.get_item(idx) m: typing.Optional[typing.Match] - if self.allow_month_or_day_zero: - m = KgtkValue.date_and_times_re.match(v) + if self.options.allow_month_or_day_zero: + m = KgtkValue.lax_date_and_times_re.match(v) else: m = KgtkValue.strict_date_and_times_re.match(v) return m is not None @@ -710,7 +732,7 @@ def main(): value: str for value in args.values: - print("%s: %s" % (value, KgtkValue(value).describe())) + print("%s: %s" % (value, KgtkValue(value).describe()), flush=True) if __name__ == "__main__": main() diff --git a/kgtk/join/nodereader.py b/kgtk/join/nodereader.py index 4b3de9587..668e8918a 100644 --- a/kgtk/join/nodereader.py +++ b/kgtk/join/nodereader.py @@ -13,6 +13,7 @@ from kgtk.join.closableiter import ClosableIter from kgtk.join.enumnameaction import EnumNameAction from kgtk.join.kgtkreader import KgtkReader +from kgtk.join.kgtkvalue import KgtkValueOptions, DEFAULT_KGTK_VALUE_OPTIONS from kgtk.join.validationaction import ValidationAction @attr.s(slots=True, frozen=False) @@ -36,6 +37,7 @@ def open_node_file(cls, invalid_value_action: ValidationAction = ValidationAction.REPORT, header_error_action: ValidationAction = ValidationAction.EXIT, unsafe_column_name_action: ValidationAction = ValidationAction.REPORT, + value_options: KgtkValueOptions = DEFAULT_KGTK_VALUE_OPTIONS, compression_type: 
typing.Optional[str] = None, gzip_in_parallel: bool = False, gzip_queue_size: int = KgtkReader.GZIP_QUEUE_SIZE_DEFAULT, @@ -101,6 +103,7 @@ def open_node_file(cls, invalid_value_action=invalid_value_action, header_error_action=header_error_action, unsafe_column_name_action=unsafe_column_name_action, + value_options=value_options, compression_type=compression_type, gzip_in_parallel=gzip_in_parallel, gzip_queue_size=gzip_queue_size, @@ -144,6 +147,13 @@ def main(): error_file: typing.TextIO = sys.stdout if args.errors_to_stdout else sys.stderr + # Build the value parsing option structure. + value_options: KgtkValueOptions = KgtkValueOptions(allow_month_or_day_zero=args.allow_month_or_day_zero, + allow_lax_strings=args.allow_lax_strings, + allow_lax_lq_strings=args.allow_lax_lq_strings, + allow_additional_language_codes=args.allow_additional_language_codes, + additional_language_codes=args.additional_language_codes) + er: NodeReader = NodeReader.open(args.kgtk_file, force_column_names=args.force_column_names, skip_first_record=args.skip_first_record, @@ -160,6 +170,7 @@ def main(): invalid_value_action=args.invalid_value_action, header_error_action=args.header_error_action, unsafe_column_name_action=args.unsafe_column_name_action, + value_options=value_options, compression_type=args.compression_type, gzip_in_parallel=args.gzip_in_parallel, gzip_queue_size=args.gzip_queue_size, From 443bdd2eca2629b9ab58cd0ac1f19ac5e1eeeb76 Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Sat, 2 May 2020 01:45:03 -0700 Subject: [PATCH 054/278] Remove debug write. --- kgtk/cli/validate.py | 1 - 1 file changed, 1 deletion(-) diff --git a/kgtk/cli/validate.py b/kgtk/cli/validate.py index c84a7fb6f..7c28d38b7 100644 --- a/kgtk/cli/validate.py +++ b/kgtk/cli/validate.py @@ -188,7 +188,6 @@ def run(kgtk_files: typing.Optional[typing.List[typing.Optional[Path]]], allow_lax_lq_strings=allow_lax_lq_strings, allow_additional_language_codes=allow_additional_language_codes, additional_language_codes=additional_language_codes) - print("value_options.allow_month_or_day_zero = %s" % str(value_options.allow_month_or_day_zero)) try: kgtk_file: typing.Optional[Path] From 44538259570bf25e1a3d067867e1c216ae7edce0 Mon Sep 17 00:00:00 2001 From: Naren Date: Sat, 2 May 2020 18:06:37 -0700 Subject: [PATCH 055/278] escape quotes when necessary --- kgtk/cli/import_ntriples.py | 2 +- kgtk/cli/import_wikidata.py | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/kgtk/cli/import_ntriples.py b/kgtk/cli/import_ntriples.py index 49f63a10d..49cabeb5a 100644 --- a/kgtk/cli/import_ntriples.py +++ b/kgtk/cli/import_ntriples.py @@ -111,7 +111,7 @@ def run(input_file, output_file, limit): if '@' in subject: str_parts = subject.split('@') final_value = '\'' + \ - str_parts[0] + '\'@' + str_parts[1] + str_parts[0].replace("'","\\'") + '\'@' + str_parts[1] else: final_value = '\"' + str(subject) + '\"' final_row.append(final_value) diff --git a/kgtk/cli/import_wikidata.py b/kgtk/cli/import_wikidata.py index 2c242f3e7..797bd3a90 100644 --- a/kgtk/cli/import_wikidata.py +++ b/kgtk/cli/import_wikidata.py @@ -160,7 +160,7 @@ def process(self,line,node_file,edge_file,qual_file,lang,doc_id): lang_label = labels.get(lang, None) if lang_label: row.append( - '\'' + lang_label['value'] + '\'' + "@" + lang) + '\'' + lang_label['value'].replace("'","\\'") + '\'' + "@" + lang) else: row.append("") else: @@ -173,7 +173,7 @@ def process(self,line,node_file,edge_file,qual_file,lang,doc_id): lang_descr = descriptions.get(lang, 
None) if lang_descr: row.append( - '\'' + lang_descr['value'] + '\'' + "@" + lang) + '\'' + lang_descr['value'].replace("'","\\'") + '\'' + "@" + lang) else: row.append("") else: @@ -187,7 +187,7 @@ def process(self,line,node_file,edge_file,qual_file,lang,doc_id): alias_list = [] for item in lang_aliases: alias_list.append( - '\'' + item['value'] + '\'' + "@" + lang) + '\'' + item['value'].replace("'","\\'") + '\'' + "@" + lang) row.append("|".join(alias_list)) else: row.append('') @@ -269,9 +269,9 @@ def process(self,line,node_file,edge_file,qual_file,lang,doc_id): val['time'][1:] + '/' + str(val['precision']) elif typ == 'monolingualtext': value = '\'' + \ - val['text'] + '\'' + '@' + val['language'] + val['text'].replace("'","\\'") + '\'' + '@' + val['language'] else: - value = '\"' + val + '\"' + value = '\"' + val.replace('"','\\"') + '\"' if edge_file: erows.append([sid, qnode, @@ -355,9 +355,9 @@ def process(self,line,node_file,edge_file,qual_file,lang,doc_id): val['time'][1:] + '/' + str(val['precision']) elif typ == 'monolingualtext': value = '\'' + \ - val['text'] + '\'' + '@' + val['language'] + val['text'].replace("'","\\'") + '\'' + '@' + val['language'] else: - value = '\"' + val + '\"' + value = '\"' + val.replace('"','\\"') + '\"' qrows.append( [ tempid, From a5032e5c36557d3e836b47e9bef0a5090ccdd469 Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Sun, 3 May 2020 10:33:30 -0700 Subject: [PATCH 056/278] Check for three-character language codes with suffixes. --- kgtk/join/kgtkvalue.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/kgtk/join/kgtkvalue.py b/kgtk/join/kgtkvalue.py index 41dbd8b94..a20fe47f2 100644 --- a/kgtk/join/kgtkvalue.py +++ b/kgtk/join/kgtkvalue.py @@ -466,11 +466,23 @@ def is_valid_language_qualified_string(self, idx: typing.Optional[int] = None)-> if "-" in lang: (lang, country_or_dialect) = lang.split("-", 1) - # Assume that this is a two-character code. If necessary, - # we can try three-character codes, too. - if pycountry.languages.get(alpha_2=lang) is not None: - # Note: we didn't check the country_or_dialect portion. + # TODO: refactor so this code isn't duplicated? + if len(lang) == 2: + # Two-character language codes. + if pycountry.languages.get(alpha_2=lang) is not None: + return True + + elif len(lang) == 3: + # Three-character language codes. + if pycountry.languages.get(alpha_3=lang) is not None: + return True + + # Perhaps this is a collective (language family) code from ISO 639-5? 
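+                # e.g. "nah", which names the Nahuatl language family.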
+ try: + iso639.languages.get(part5=lang) return True + except KeyError: + pass # If there's a table of additional language codes, check there: if self.options.allow_additional_language_codes and lang in self.options.additional_language_codes: From 1045c18f56630669fcb591efaf9fe67cf6663e1b Mon Sep 17 00:00:00 2001 From: Naren Date: Sun, 3 May 2020 14:48:01 -0700 Subject: [PATCH 057/278] escape | character --- kgtk/cli/import_wikidata.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kgtk/cli/import_wikidata.py b/kgtk/cli/import_wikidata.py index 797bd3a90..abf623746 100644 --- a/kgtk/cli/import_wikidata.py +++ b/kgtk/cli/import_wikidata.py @@ -56,7 +56,7 @@ def add_arguments(parser): type=str, dest="lang", default="en", - help='language to extract, default en') + help='languages to extract, comma separated, default en') parser.add_argument( "--source", action="store", @@ -159,6 +159,7 @@ def process(self,line,node_file,edge_file,qual_file,lang,doc_id): if labels: lang_label = labels.get(lang, None) if lang_label: + lang_label['value']=lang_label['value'].replace('|','\\|') row.append( '\'' + lang_label['value'].replace("'","\\'") + '\'' + "@" + lang) else: @@ -172,6 +173,7 @@ def process(self,line,node_file,edge_file,qual_file,lang,doc_id): if descriptions: lang_descr = descriptions.get(lang, None) if lang_descr: + lang_descr['value']=lang_descr['value'].replace('|','\\|') row.append( '\'' + lang_descr['value'].replace("'","\\'") + '\'' + "@" + lang) else: @@ -186,6 +188,7 @@ def process(self,line,node_file,edge_file,qual_file,lang,doc_id): if lang_aliases: alias_list = [] for item in lang_aliases: + item['value']=item['value'].replace('|','\\|') alias_list.append( '\'' + item['value'].replace("'","\\'") + '\'' + "@" + lang) row.append("|".join(alias_list)) From 4d8d015f282a8554d20d787f52434467c240c6f2 Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Sun, 3 May 2020 17:13:28 -0700 Subject: [PATCH 058/278] Refactor the KgtkValueOptions ArgumentParser arguments. --- kgtk/cli/validate.py | 19 ++++--------------- kgtk/join/edgereader.py | 7 ++----- kgtk/join/kgtkreader.py | 24 +++--------------------- kgtk/join/kgtkvalue.py | 40 ++++++++++++++++++++++++++++++++++++---- kgtk/join/nodereader.py | 7 ++----- 5 files changed, 47 insertions(+), 50 deletions(-) diff --git a/kgtk/cli/validate.py b/kgtk/cli/validate.py index 7c28d38b7..470304e0f 100644 --- a/kgtk/cli/validate.py +++ b/kgtk/cli/validate.py @@ -35,21 +35,6 @@ def add_arguments(parser): """ parser.add_argument( "kgtk_files", nargs="*", help="The KGTK file(s) to validate. 
May be omitted or '-' for stdin.", type=Path) - parser.add_argument( "--additional-language-codes", dest="additional_language_codes", - help="Additional language codes.", nargs="*", default=DEFAULT_ADDITIONAL_LANGUAGE_CODES) - - parser.add_argument( "--allow-additional-language-codes", dest="allow_additional_language_codes", - help="Allow certain language codes not found in the current version of ISO 639-3 or ISO 639-5.", action='store_true') - - parser.add_argument( "--allow-lax-strings", dest="allow_lax_strings", - help="Do not check if double quotes are backslashed inside strings.", action='store_true') - - parser.add_argument( "--allow-lax-lq-strings", dest="allow_lax_lq_strings", - help="Do not check if single quotes are backslashed inside language qualified strings.", action='store_true') - - parser.add_argument( "--allow-month-or-day-zero", dest="allow_month_or_day_zero", - help="Allow month or day zero in dates.", action='store_true') - parser.add_argument( "--blank-id-line-action", dest="blank_id_line_action", help="The action to take when a blank id field is detected.", type=ValidationAction, action=EnumNameAction, default=None) @@ -138,6 +123,10 @@ def add_arguments(parser): help="The action to take when a whitespace line is detected.", type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXCLUDE) + # Note: Any arguments described by KgtkValueOptions.add_arguments(...) + # need to be included in the arguments to run(...), below. + KgtkValueOptions.add_arguments(parser) + def run(kgtk_files: typing.Optional[typing.List[typing.Optional[Path]]], force_column_names: typing.Optional[typing.List[str]] = None, diff --git a/kgtk/join/edgereader.py b/kgtk/join/edgereader.py index 0bdb3d4fb..4b8865640 100644 --- a/kgtk/join/edgereader.py +++ b/kgtk/join/edgereader.py @@ -165,16 +165,13 @@ def main(): parser = ArgumentParser() KgtkReader.add_shared_arguments(parser) EdgeReader.add_arguments(parser) + KgtkValueOptions.add_arguments(parser) args = parser.parse_args() error_file: typing.TextIO = sys.stdout if args.errors_to_stdout else sys.stderr # Build the value parsing option structure. 
- value_options: KgtkValueOptions = KgtkValueOptions(allow_month_or_day_zero=args.allow_month_or_day_zero, - allow_lax_strings=args.allow_lax_strings, - allow_lax_lq_strings=args.allow_lax_lq_strings, - allow_additional_language_codes=args.allow_additional_language_codes, - additional_language_codes=args.additional_language_codes) + value_options: KgtkValueOptions = KgtkValueOptions.from_args(args) er: EdgeReader = EdgeReader.open(args.kgtk_file, force_column_names=args.force_column_names, diff --git a/kgtk/join/kgtkreader.py b/kgtk/join/kgtkreader.py index f0831c3d3..c6518daa0 100644 --- a/kgtk/join/kgtkreader.py +++ b/kgtk/join/kgtkreader.py @@ -21,7 +21,7 @@ from kgtk.join.gzipprocess import GunzipProcess from kgtk.join.kgtkbase import KgtkBase from kgtk.join.kgtkformat import KgtkFormat -from kgtk.join.kgtkvalue import KgtkValue, KgtkValueOptions, DEFAULT_KGTK_VALUE_OPTIONS, DEFAULT_ADDITIONAL_LANGUAGE_CODES +from kgtk.join.kgtkvalue import KgtkValue, KgtkValueOptions, DEFAULT_KGTK_VALUE_OPTIONS from kgtk.join.validationaction import ValidationAction @attr.s(slots=True, frozen=False) @@ -658,21 +658,6 @@ def to_map(self, row: typing.List[str])->typing.Mapping[str, str]: def add_shared_arguments(cls, parser: ArgumentParser): parser.add_argument(dest="kgtk_file", help="The KGTK file to read", type=Path, nargs="?") - parser.add_argument( "--additional-language-codes", dest="additional_language_codes", - help="Additional language codes.", nargs="*", default=DEFAULT_ADDITIONAL_LANGUAGE_CODES) - - parser.add_argument( "--allow-additional-language-codes", dest="allow_additional_language_codes", - help="Allow certain language codes not found in the current version of ISO 639-3 or ISO 639-5.", action='store_true') - - parser.add_argument( "--allow-lax-strings", dest="allow_lax_strings", - help="Do not check if double quotes are backslashed inside strings.", action='store_true') - - parser.add_argument( "--allow-lax-lq-strings", dest="allow_lax_lq_strings", - help="Do not check if single quotes are backslashed inside language qualified strings.", action='store_true') - - parser.add_argument( "--allow-month-or-day-zero", dest="allow_month_or_day_zero", - help="Allow month or day zero in dates.", action='store_true') - parser.add_argument( "--blank-required-field-line-action", dest="blank_line_action", help="The action to take when a line with a blank node1, node2, or id field (per mode) is detected.", type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXCLUDE) @@ -761,16 +746,13 @@ def main(): KgtkReader.add_arguments(parser) EdgeReader.add_arguments(parser) NodeReader.add_arguments(parser) + KgtkValueOptions.add_arguments(parser) args = parser.parse_args() error_file: typing.TextIO = sys.stdout if args.errors_to_stdout else sys.stderr # Build the value parsing option structure. 
- value_options: KgtkValueOptions = KgtkValueOptions(allow_month_or_day_zero=args.allow_month_or_day_zero, - allow_lax_strings=args.allow_lax_strings, - allow_lax_lq_strings=args.allow_lax_lq_strings, - allow_additional_language_codes=args.allow_additional_language_codes, - additional_language_codes=args.additional_language_codes) + value_options: KgtkValueOptions = KgtkValueOptions.from_args(args) kr: KgtkReader = KgtkReader.open(args.kgtk_file, force_column_names=args.force_column_names, diff --git a/kgtk/join/kgtkvalue.py b/kgtk/join/kgtkvalue.py index a20fe47f2..bbfa8c3b5 100644 --- a/kgtk/join/kgtkvalue.py +++ b/kgtk/join/kgtkvalue.py @@ -5,7 +5,7 @@ """ -from argparse import ArgumentParser +from argparse import ArgumentParser, Namespace import attr import iso639 # type: ignore import pycountry # type: ignore @@ -50,6 +50,33 @@ class KgtkValueOptions: iterable_validator=attr.validators.instance_of(list)), default=DEFAULT_ADDITIONAL_LANGUAGE_CODES) + + @classmethod + def add_arguments(cls, parser: ArgumentParser): + parser.add_argument( "--additional-language-codes", dest="additional_language_codes", + help="Additional language codes.", nargs="*", default=DEFAULT_ADDITIONAL_LANGUAGE_CODES) + + parser.add_argument( "--allow-additional-language-codes", dest="allow_additional_language_codes", + help="Allow certain language codes not found in the current version of ISO 639-3 or ISO 639-5.", action='store_true') + + parser.add_argument( "--allow-lax-strings", dest="allow_lax_strings", + help="Do not check if double quotes are backslashed inside strings.", action='store_true') + + parser.add_argument( "--allow-lax-lq-strings", dest="allow_lax_lq_strings", + help="Do not check if single quotes are backslashed inside language qualified strings.", action='store_true') + + parser.add_argument( "--allow-month-or-day-zero", dest="allow_month_or_day_zero", + help="Allow month or day zero in dates.", action='store_true') + + @classmethod + # Build the value parsing option structure. + def from_args(cls, args: Namespace)->'KgtkValueOptions': + return cls(allow_month_or_day_zero=args.allow_month_or_day_zero, + allow_lax_strings=args.allow_lax_strings, + allow_lax_lq_strings=args.allow_lax_lq_strings, + allow_additional_language_codes=args.allow_additional_language_codes, + additional_language_codes=args.additional_language_codes) + DEFAULT_KGTK_VALUE_OPTIONS: KgtkValueOptions = KgtkValueOptions() @attr.s(slots=True, frozen=False) @@ -732,19 +759,24 @@ def describe(self, idx: typing.Optional[int] = None)->str: else: return "Symbol" + def main(): """ Test the KGTK value vparser. """ - parser = ArgumentParser() + parser: ArgumentParser = ArgumentParser() parser.add_argument(dest="values", help="The values(s) to test", type=str, nargs="+") parser.add_argument("-v", "--verbose", dest="verbose", help="Print additional progress messages.", action='store_true') parser.add_argument( "--very-verbose", dest="very_verbose", help="Print additional progress messages.", action='store_true') - args = parser.parse_args() + KgtkValueOptions.add_arguments(parser) + args: Namespace = parser.parse_args() + + # Build the value parsing option structure. 
+ value_options: KgtkValueOptions = KgtkValueOptions.from_args(args) value: str for value in args.values: - print("%s: %s" % (value, KgtkValue(value).describe()), flush=True) + print("%s: %s" % (value, KgtkValue(value, options=value_options).describe()), flush=True) if __name__ == "__main__": main() diff --git a/kgtk/join/nodereader.py b/kgtk/join/nodereader.py index 668e8918a..9b50fea02 100644 --- a/kgtk/join/nodereader.py +++ b/kgtk/join/nodereader.py @@ -143,16 +143,13 @@ def main(): parser = ArgumentParser() KgtkReader.add_shared_arguments(parser) NodeReader.add_arguments(parser) + KgtkValueOptions.add_arguments(parser) args = parser.parse_args() error_file: typing.TextIO = sys.stdout if args.errors_to_stdout else sys.stderr # Build the value parsing option structure. - value_options: KgtkValueOptions = KgtkValueOptions(allow_month_or_day_zero=args.allow_month_or_day_zero, - allow_lax_strings=args.allow_lax_strings, - allow_lax_lq_strings=args.allow_lax_lq_strings, - allow_additional_language_codes=args.allow_additional_language_codes, - additional_language_codes=args.additional_language_codes) + value_options: KgtkValueOptions = KgtkValueOptions.from_args(args) er: NodeReader = NodeReader.open(args.kgtk_file, force_column_names=args.force_column_names, From 6d94e179861ec5c23d364f1096156b81d2d4f4fc Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Sun, 3 May 2020 18:21:51 -0700 Subject: [PATCH 059/278] Refactor the language validation code. Rely on the default list of additional language codes. --- kgtk/cli/validate.py | 6 +- kgtk/join/kgtkvalue.py | 81 +++------------------- kgtk/join/languagevalidator.py | 123 +++++++++++++++++++++++++++++++++ 3 files changed, 133 insertions(+), 77 deletions(-) create mode 100644 kgtk/join/languagevalidator.py diff --git a/kgtk/cli/validate.py b/kgtk/cli/validate.py index 470304e0f..db46d66ee 100644 --- a/kgtk/cli/validate.py +++ b/kgtk/cli/validate.py @@ -18,7 +18,7 @@ from kgtk.join.enumnameaction import EnumNameAction from kgtk.join.kgtkformat import KgtkFormat from kgtk.join.kgtkreader import KgtkReader -from kgtk.join.kgtkvalue import DEFAULT_ADDITIONAL_LANGUAGE_CODES, KgtkValueOptions +from kgtk.join.kgtkvalue import KgtkValueOptions from kgtk.join.validationaction import ValidationAction def parser(): @@ -148,8 +148,7 @@ def run(kgtk_files: typing.Optional[typing.List[typing.Optional[Path]]], invalid_value_action: ValidationAction = ValidationAction.REPORT, header_error_action: ValidationAction = ValidationAction.EXIT, unsafe_column_name_action: ValidationAction = ValidationAction.REPORT, - additional_language_codes: typing.List[str] = DEFAULT_ADDITIONAL_LANGUAGE_CODES, - allow_additional_language_codes: bool = False, + additional_language_codes: typing.Optional[typing.List[str]] = None, allow_lax_strings: bool = False, allow_lax_lq_strings: bool = False, allow_month_or_day_zero: bool = False, @@ -175,7 +174,6 @@ def run(kgtk_files: typing.Optional[typing.List[typing.Optional[Path]]], value_options: KgtkValueOptions = KgtkValueOptions(allow_month_or_day_zero=allow_month_or_day_zero, allow_lax_strings=allow_lax_strings, allow_lax_lq_strings=allow_lax_lq_strings, - allow_additional_language_codes=allow_additional_language_codes, additional_language_codes=additional_language_codes) try: diff --git a/kgtk/join/kgtkvalue.py b/kgtk/join/kgtkvalue.py index bbfa8c3b5..fd6aef315 100644 --- a/kgtk/join/kgtkvalue.py +++ b/kgtk/join/kgtkvalue.py @@ -1,24 +1,15 @@ """ Validate KGTK File data types. - -Dimensioned quantities are not supported. 
- """ from argparse import ArgumentParser, Namespace import attr -import iso639 # type: ignore -import pycountry # type: ignore import re import sys import typing from kgtk.join.kgtkformat import KgtkFormat - -DEFAULT_ADDITIONAL_LANGUAGE_CODES: typing.List[str] = [ - "mo", # Retired, replaced by the codes for Romanian, but still appearing in wikidata. -] - +from kgtk.join.languagevalidator import LanguageValidator @attr.s(slots=True, frozen=True) class KgtkValueOptions: @@ -41,23 +32,17 @@ class KgtkValueOptions: # check if internal single quotes are excaped by backslash. allow_lax_lq_strings: bool = attr.ib(validator=attr.validators.instance_of(bool), default=False) - # Shall we allow additional language codes? - allow_additional_language_codes: bool = attr.ib(validator=attr.validators.instance_of(bool), default=False) - # If this list gets long, we may want to turn it into a map to make lookup # more efficient. - additional_language_codes: typing.List[str] = attr.ib(validator=attr.validators.deep_iterable(member_validator=attr.validators.instance_of(str), - iterable_validator=attr.validators.instance_of(list)), - default=DEFAULT_ADDITIONAL_LANGUAGE_CODES) + additional_language_codes: typing.Optional[typing.List[str]] = attr.ib(validator=attr.validators.deep_iterable(member_validator=attr.validators.instance_of(str), + iterable_validator=attr.validators.instance_of(list)), + default=None) @classmethod def add_arguments(cls, parser: ArgumentParser): parser.add_argument( "--additional-language-codes", dest="additional_language_codes", - help="Additional language codes.", nargs="*", default=DEFAULT_ADDITIONAL_LANGUAGE_CODES) - - parser.add_argument( "--allow-additional-language-codes", dest="allow_additional_language_codes", - help="Allow certain language codes not found in the current version of ISO 639-3 or ISO 639-5.", action='store_true') + help="Additional language codes.", nargs="*", default=None) parser.add_argument( "--allow-lax-strings", dest="allow_lax_strings", help="Do not check if double quotes are backslashed inside strings.", action='store_true') @@ -74,7 +59,6 @@ def from_args(cls, args: Namespace)->'KgtkValueOptions': return cls(allow_month_or_day_zero=args.allow_month_or_day_zero, allow_lax_strings=args.allow_lax_strings, allow_lax_lq_strings=args.allow_lax_lq_strings, - allow_additional_language_codes=args.allow_additional_language_codes, additional_language_codes=args.additional_language_codes) DEFAULT_KGTK_VALUE_OPTIONS: KgtkValueOptions = KgtkValueOptions() @@ -466,57 +450,8 @@ def is_valid_language_qualified_string(self, idx: typing.Optional[int] = None)-> lang: str = m.group("lang").lower() # print("lang: %s" % lang) - if len(lang) == 2: - # Two-character language codes. - if pycountry.languages.get(alpha_2=lang) is not None: - return True - - elif len(lang) == 3: - # Three-character language codes. - if pycountry.languages.get(alpha_3=lang) is not None: - return True - - # Perhaps this is a collective (language family) code from ISO 639-5? - try: - iso639.languages.get(part5=lang) - return True - except KeyError: - pass - - # Wikidata contains entries such as: - # 'panamenha'@pt-br # language code followed by country code - # 'Ecuador'@es-formal # language code followed by dialect name - # - # If we see a dash, we'll check the language code by itself. - # save_lang: str = lang # for the debug print below. - country_or_dialect: str = "" - if "-" in lang: - (lang, country_or_dialect) = lang.split("-", 1) - - # TODO: refactor so this code isn't duplicated? 
- if len(lang) == 2: - # Two-character language codes. - if pycountry.languages.get(alpha_2=lang) is not None: - return True - - elif len(lang) == 3: - # Three-character language codes. - if pycountry.languages.get(alpha_3=lang) is not None: - return True - - # Perhaps this is a collective (language family) code from ISO 639-5? - try: - iso639.languages.get(part5=lang) - return True - except KeyError: - pass - - # If there's a table of additional language codes, check there: - if self.options.allow_additional_language_codes and lang in self.options.additional_language_codes: - return True - - # print("save_lang: %s lang: %s country_or_dialect: %s" % (save_lang, lang, country_or_dialect)) - return False + return LanguageValidator.validate(lang, + additional_language_codes=self.options.additional_language_codes) def is_location_coordinates(self, idx: typing.Optional[int] = None)->bool: """ @@ -762,7 +697,7 @@ def describe(self, idx: typing.Optional[int] = None)->str: def main(): """ - Test the KGTK value vparser. + Test the KGTK value parser. """ parser: ArgumentParser = ArgumentParser() parser.add_argument(dest="values", help="The values(s) to test", type=str, nargs="+") diff --git a/kgtk/join/languagevalidator.py b/kgtk/join/languagevalidator.py new file mode 100644 index 000000000..771859d71 --- /dev/null +++ b/kgtk/join/languagevalidator.py @@ -0,0 +1,123 @@ +""" +Validate language qualifiers. +""" + +from argparse import ArgumentParser, Namespace +import attr +import iso639 # type: ignore +import pycountry # type: ignore +import re +import sys +import typing + +# Problem: pycountry incorporates the Debian team's ISO 639-3 table, +# which as of 03-May-2020 has not been updated in four years! +# Meanwhile, iso639 (from pypi iso-639) has an ISO 639-3 table +# from 2015-05-05. +# +# https://salsa.debian.org/iso-codes-team/iso-codes/-/blob/master/iso_639-3/iso_639_3.tab +# https://pypi.org/project/iso-639/ +# +# Problem: Wikidata may contain obsolete language codes which have been +# removed from the standard indices. +# +# Example: "mo" +# +# Solution: We will keep a list of additional language codes. +@attr.s(slots=True, frozen=True) +class LanguageValidator: + + DEFAULT_ADDITIONAL_LANGUAGE_CODES: typing.List[str] = [ + # New codes: + "cnr", # Montenegrin. Added 21-Dec-2017. https://iso639-3.sil.org/code/cnr + "hyw", # Wester Armenian. Added 23-Jan-2018. https://iso639-3.sil.org/code/hyw + + # Obsolete codes: + "mo", # Retired, replaced by the codes for Romanian, but still appearing in wikidata. + "eml", # Emiliano-Romagnolo. Split and retired 16-Jan-2009. https://iso639-3.sil.org/code/eml + ] + + @classmethod + def validate(cls, + lang: str, + additional_language_codes: typing.Optional[typing.List[str]]=None, + verbose: bool = False, + )->bool: + # Wikidata contains entries such as: + # 'panamenha'@pt-br # language code followed by country code + # 'Ecuador'@es-formal # language code followed by dialect name + # + # If we see a dash, we'll check the language code by itself. + if verbose: + print("Validating '%s'" % lang) + + save_lang: str = lang # for the debug prints below. + country_or_dialect: str = "" + if "-" in lang: + (lang, country_or_dialect) = lang.split("-", 1) + if verbose: + print("'%s' split into '%s' and '%s'" % (save_lang, lang, country_or_dialect)) + + if len(lang) == 2: + # Two-character language codes. 
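+            # e.g. "en" or "fr"; pycountry resolves these as ISO 639-1 codes.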
+ if pycountry.languages.get(alpha_2=lang) is not None: + if verbose: + print("pycountry.languages.get(alpha_2=lang) succeeded") + return True + + elif len(lang) == 3: + # Three-character language codes. + if pycountry.languages.get(alpha_3=lang) is not None: + if verbose: + print("pycountry.languages.get(alpha_3=lang) succeeded") + return True + + # Perhaps this is a collective (language family) code from ISO 639-5? + try: + iso639.languages.get(part5=lang) + if verbose: + print("iso639.languages.get(part5=lang) succeeded") + return True + except KeyError: + pass + + # If there's a table of additional language codes, check there: + if additional_language_codes is None: + if verbose: + print("Using the default list of additional language codes.") + additional_language_codes = LanguageValidator.DEFAULT_ADDITIONAL_LANGUAGE_CODES + else: + if verbose: + print("Using a custom list of %d additional language codes." % len(additional_language_codes)) + if lang in additional_language_codes: + if verbose: + print("found in the table of additional languages.") + return True + + if verbose: + print("Not found.") + return False + +def main(): + """ + Test the language validator. + """ + parser: ArgumentParser = ArgumentParser() + parser.add_argument(dest="values", help="The values(s) to test", type=str, nargs="+") + + parser.add_argument( "--additional-language-codes", dest="additional_language_codes", + help="Additional language codes.", nargs="*", default=None) + + parser.add_argument("-v", "--verbose", dest="verbose", help="Print additional progress messages.", action='store_true') + args: Namespace = parser.parse_args() + + value: str + for value in args.values: + result: bool = LanguageValidator.validate(value, + additional_language_codes=args.additional_language_codes, + verbose=args.verbose) + + print("%s: %s" % (value, str(result)), flush=True) + +if __name__ == "__main__": + main() From 4423b714b26e67d828af8e9c193d459dbe6e0b24 Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Sun, 3 May 2020 18:24:58 -0700 Subject: [PATCH 060/278] The additional language codes list is not optional in the value options. --- kgtk/join/kgtkvalue.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kgtk/join/kgtkvalue.py b/kgtk/join/kgtkvalue.py index fd6aef315..9d95418b8 100644 --- a/kgtk/join/kgtkvalue.py +++ b/kgtk/join/kgtkvalue.py @@ -34,8 +34,8 @@ class KgtkValueOptions: # If this list gets long, we may want to turn it into a map to make lookup # more efficient. - additional_language_codes: typing.Optional[typing.List[str]] = attr.ib(validator=attr.validators.deep_iterable(member_validator=attr.validators.instance_of(str), - iterable_validator=attr.validators.instance_of(list)), + additional_language_codes: typing.Optional[typing.List[str]] = attr.ib(validator=attr.validators.optional(attr.validators.deep_iterable(member_validator=attr.validators.instance_of(str), + iterable_validator=attr.validators.instance_of(list))), default=None) From 4c04282e924b88a66c90487b3c49150769716964 Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Sun, 3 May 2020 19:25:19 -0700 Subject: [PATCH 061/278] Use a language validator. Control it by value options. 
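
As a rough sketch of how the refactored pieces are meant to compose (the
sample values below are illustrative, not taken from the test suite):

    from kgtk.join.kgtkvalue import KgtkValue
    from kgtk.join.kgtkvalueoptions import KgtkValueOptions

    # Wikidata-friendly settings: tolerate month/day 00 in dates and
    # unescaped double quotes inside strings.
    options = KgtkValueOptions(allow_month_or_day_zero=True,
                               allow_lax_strings=True)

    for v in ['"plain string"', "'oiseau'@fr", "^1960-00-00T00:00:00"]:
        print(v, KgtkValue(v, options=options).is_valid())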
--- kgtk/join/edgereader.py | 2 +- kgtk/join/kgtkreader.py | 3 +- kgtk/join/kgtkvalue.py | 69 +----------------------------- kgtk/join/kgtkvalueoptions.py | 77 ++++++++++++++++++++++++++++++++++ kgtk/join/languagevalidator.py | 33 +++++++++++---- kgtk/join/nodereader.py | 2 +- 6 files changed, 108 insertions(+), 78 deletions(-) create mode 100644 kgtk/join/kgtkvalueoptions.py diff --git a/kgtk/join/edgereader.py b/kgtk/join/edgereader.py index 4b8865640..0d687988e 100644 --- a/kgtk/join/edgereader.py +++ b/kgtk/join/edgereader.py @@ -13,7 +13,7 @@ from kgtk.join.closableiter import ClosableIter from kgtk.join.enumnameaction import EnumNameAction from kgtk.join.kgtkreader import KgtkReader -from kgtk.join.kgtkvalue import KgtkValueOptions, DEFAULT_KGTK_VALUE_OPTIONS +from kgtk.join.kgtkvalueoptions import KgtkValueOptions, DEFAULT_KGTK_VALUE_OPTIONS from kgtk.join.validationaction import ValidationAction @attr.s(slots=True, frozen=False) diff --git a/kgtk/join/kgtkreader.py b/kgtk/join/kgtkreader.py index c6518daa0..326b905bf 100644 --- a/kgtk/join/kgtkreader.py +++ b/kgtk/join/kgtkreader.py @@ -21,7 +21,8 @@ from kgtk.join.gzipprocess import GunzipProcess from kgtk.join.kgtkbase import KgtkBase from kgtk.join.kgtkformat import KgtkFormat -from kgtk.join.kgtkvalue import KgtkValue, KgtkValueOptions, DEFAULT_KGTK_VALUE_OPTIONS +from kgtk.join.kgtkvalue import KgtkValue +from kgtk.join.kgtkvalueoptions import KgtkValueOptions, DEFAULT_KGTK_VALUE_OPTIONS from kgtk.join.validationaction import ValidationAction @attr.s(slots=True, frozen=False) diff --git a/kgtk/join/kgtkvalue.py b/kgtk/join/kgtkvalue.py index 9d95418b8..677bcdd51 100644 --- a/kgtk/join/kgtkvalue.py +++ b/kgtk/join/kgtkvalue.py @@ -9,60 +9,9 @@ import typing from kgtk.join.kgtkformat import KgtkFormat +from kgtk.join.kgtkvalueoptions import KgtkValueOptions, DEFAULT_KGTK_VALUE_OPTIONS from kgtk.join.languagevalidator import LanguageValidator -@attr.s(slots=True, frozen=True) -class KgtkValueOptions: - """ - These options will affect some aspects of value processing. They are in a - seperate class for efficiency. - """ - - # Allow month 00 or day 00 in dates? This isn't really allowed by ISO - # 8601, but appears in wikidata. - allow_month_or_day_zero: bool = attr.ib(validator=attr.validators.instance_of(bool), default=False) - - # When allow_lax_strings is true, strings will be checked to see if they - # start and end with double quote ("), but we won't check if internal - # double quotes are excaped by backslash. - allow_lax_strings: bool = attr.ib(validator=attr.validators.instance_of(bool), default=False) - - # When allow_lax_lq_strings is true, language qualified strings will be - # checked to see if they start and end with single quote ('), but we won't - # check if internal single quotes are excaped by backslash. - allow_lax_lq_strings: bool = attr.ib(validator=attr.validators.instance_of(bool), default=False) - - # If this list gets long, we may want to turn it into a map to make lookup - # more efficient. 
- additional_language_codes: typing.Optional[typing.List[str]] = attr.ib(validator=attr.validators.optional(attr.validators.deep_iterable(member_validator=attr.validators.instance_of(str), - iterable_validator=attr.validators.instance_of(list))), - default=None) - - - @classmethod - def add_arguments(cls, parser: ArgumentParser): - parser.add_argument( "--additional-language-codes", dest="additional_language_codes", - help="Additional language codes.", nargs="*", default=None) - - parser.add_argument( "--allow-lax-strings", dest="allow_lax_strings", - help="Do not check if double quotes are backslashed inside strings.", action='store_true') - - parser.add_argument( "--allow-lax-lq-strings", dest="allow_lax_lq_strings", - help="Do not check if single quotes are backslashed inside language qualified strings.", action='store_true') - - parser.add_argument( "--allow-month-or-day-zero", dest="allow_month_or_day_zero", - help="Allow month or day zero in dates.", action='store_true') - - @classmethod - # Build the value parsing option structure. - def from_args(cls, args: Namespace)->'KgtkValueOptions': - return cls(allow_month_or_day_zero=args.allow_month_or_day_zero, - allow_lax_strings=args.allow_lax_strings, - allow_lax_lq_strings=args.allow_lax_lq_strings, - additional_language_codes=args.additional_language_codes) - -DEFAULT_KGTK_VALUE_OPTIONS: KgtkValueOptions = KgtkValueOptions() - @attr.s(slots=True, frozen=False) class KgtkValue(KgtkFormat): value: str = attr.ib(validator=attr.validators.instance_of(str)) @@ -418,19 +367,6 @@ def is_language_qualified_string(self, idx: typing.Optional[int] = None)->bool: def is_valid_language_qualified_string(self, idx: typing.Optional[int] = None)->bool: """Return False if this value is a list and idx is None. Otherwise, return True if the value looks like a language-qualified string. - - The language code may be a two- or three-character code from ISO - 639-3, which replaces ISO 639-1 and ISO 639-2. In addition, wikidata - may include language codes, such as 'mo', that have been retired. The - additional_language_codes table supports these codes, when allowed. - - Wikidata may also contain collective language codes, such as "nah", - referring the the Nahuatl languages. These codes from ISO 639-5 are - accepted as a fallback when ISO 639-3 lookup fails. - - https://meta.wikimedia.org/wiki/Special_language_codes - https://en.wikipedia.org/wiki/Template:ISO_639_name_be-tarask - """ if self.is_list() and idx is None: return False @@ -450,8 +386,7 @@ def is_valid_language_qualified_string(self, idx: typing.Optional[int] = None)-> lang: str = m.group("lang").lower() # print("lang: %s" % lang) - return LanguageValidator.validate(lang, - additional_language_codes=self.options.additional_language_codes) + return LanguageValidator.validate(lang, options=self.options) def is_location_coordinates(self, idx: typing.Optional[int] = None)->bool: """ diff --git a/kgtk/join/kgtkvalueoptions.py b/kgtk/join/kgtkvalueoptions.py new file mode 100644 index 000000000..e29ac6367 --- /dev/null +++ b/kgtk/join/kgtkvalueoptions.py @@ -0,0 +1,77 @@ +""" +KGTK value processing options. +""" + +from argparse import ArgumentParser, Namespace +import attr +import sys +import typing + +from kgtk.join.kgtkformat import KgtkFormat +from kgtk.join.languagevalidator import LanguageValidator + +@attr.s(slots=True, frozen=True) +class KgtkValueOptions: + """ + These options will affect some aspects of value processing. They are in a + seperate class for efficiency. 
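+    A single frozen instance can safely be shared by every KgtkValue that
+    a reader constructs.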
+ """ + + # Allow month 00 or day 00 in dates? This isn't really allowed by ISO + # 8601, but appears in wikidata. + allow_month_or_day_zero: bool = attr.ib(validator=attr.validators.instance_of(bool), default=False) + + # When allow_lax_strings is true, strings will be checked to see if they + # start and end with double quote ("), but we won't check if internal + # double quotes are excaped by backslash. + allow_lax_strings: bool = attr.ib(validator=attr.validators.instance_of(bool), default=False) + + # When allow_lax_lq_strings is true, language qualified strings will be + # checked to see if they start and end with single quote ('), but we won't + # check if internal single quotes are excaped by backslash. + allow_lax_lq_strings: bool = attr.ib(validator=attr.validators.instance_of(bool), default=False) + + # If this list gets long, we may want to turn it into a map to make lookup + # more efficient. + additional_language_codes: typing.Optional[typing.List[str]] = attr.ib(validator=attr.validators.optional(attr.validators.deep_iterable(member_validator=attr.validators.instance_of(str), + iterable_validator=attr.validators.instance_of(list))), + default=None) + + + @classmethod + def add_arguments(cls, parser: ArgumentParser): + parser.add_argument( "--additional-language-codes", dest="additional_language_codes", + help="Additional language codes.", nargs="*", default=None) + + parser.add_argument( "--allow-lax-strings", dest="allow_lax_strings", + help="Do not check if double quotes are backslashed inside strings.", action='store_true') + + parser.add_argument( "--allow-lax-lq-strings", dest="allow_lax_lq_strings", + help="Do not check if single quotes are backslashed inside language qualified strings.", action='store_true') + + parser.add_argument( "--allow-month-or-day-zero", dest="allow_month_or_day_zero", + help="Allow month or day zero in dates.", action='store_true') + + @classmethod + # Build the value parsing option structure. + def from_args(cls, args: Namespace)->'KgtkValueOptions': + return cls(allow_month_or_day_zero=args.allow_month_or_day_zero, + allow_lax_strings=args.allow_lax_strings, + allow_lax_lq_strings=args.allow_lax_lq_strings, + additional_language_codes=args.additional_language_codes) + +DEFAULT_KGTK_VALUE_OPTIONS: KgtkValueOptions = KgtkValueOptions() + +def main(): + """ + Test the KGTK value options. + """ + parser: ArgumentParser = ArgumentParser() + KgtkValueOptions.add_arguments(parser) + args: Namespace = parser.parse_args() + + # Build the value parsing option structure. + value_options: KgtkValueOptions = KgtkValueOptions.from_args(args) + +if __name__ == "__main__": + main() diff --git a/kgtk/join/languagevalidator.py b/kgtk/join/languagevalidator.py index 771859d71..b22366a2c 100644 --- a/kgtk/join/languagevalidator.py +++ b/kgtk/join/languagevalidator.py @@ -6,10 +6,10 @@ import attr import iso639 # type: ignore import pycountry # type: ignore -import re -import sys import typing +from kgtkvalueoptions import KgtkValueOptions, DEFAULT_KGTK_VALUE_OPTIONS + # Problem: pycountry incorporates the Debian team's ISO 639-3 table, # which as of 03-May-2020 has not been updated in four years! # Meanwhile, iso639 (from pypi iso-639) has an ISO 639-3 table @@ -26,6 +26,19 @@ # Solution: We will keep a list of additional language codes. @attr.s(slots=True, frozen=True) class LanguageValidator: + """ + The language code may be a two- or three-character code from ISO + 639-3, which replaces ISO 639-1 and ISO 639-2. 
In addition, wikidata + may include language codes, such as 'mo', that have been retired. The + additional_language_codes table supports these codes, when allowed. + + Wikidata may also contain collective language codes, such as "nah", + referring the the Nahuatl languages. These codes from ISO 639-5 are + accepted as a fallback when ISO 639-3 lookup fails. + + https://meta.wikimedia.org/wiki/Special_language_codes + https://en.wikipedia.org/wiki/Template:ISO_639_name_be-tarask + """ DEFAULT_ADDITIONAL_LANGUAGE_CODES: typing.List[str] = [ # New codes: @@ -33,14 +46,15 @@ class LanguageValidator: "hyw", # Wester Armenian. Added 23-Jan-2018. https://iso639-3.sil.org/code/hyw # Obsolete codes: - "mo", # Retired, replaced by the codes for Romanian, but still appearing in wikidata. + "mo", # Moldavian. Retired 3-Nov-2008. Replaced by the codes for Romanian. + # http://www.personal.psu.edu/ejp10/blogs/gotunicode/2008/11/language-tage-mo-for-moldovan.html "eml", # Emiliano-Romagnolo. Split and retired 16-Jan-2009. https://iso639-3.sil.org/code/eml ] @classmethod def validate(cls, lang: str, - additional_language_codes: typing.Optional[typing.List[str]]=None, + options: KgtkValueOptions=DEFAULT_KGTK_VALUE_OPTIONS, verbose: bool = False, )->bool: # Wikidata contains entries such as: @@ -82,13 +96,16 @@ def validate(cls, pass # If there's a table of additional language codes, check there: - if additional_language_codes is None: + additional_language_codes: typing.List[str] + if options.additional_language_codes is not None: + additional_language_codes = options.additional_language_codes if verbose: - print("Using the default list of additional language codes.") - additional_language_codes = LanguageValidator.DEFAULT_ADDITIONAL_LANGUAGE_CODES + print("Using a custom list of %d additional language codes." % len(additional_language_codes)) else: if verbose: - print("Using a custom list of %d additional language codes." % len(additional_language_codes)) + print("Using the default list of additional language codes.") + additional_language_codes = LanguageValidator.DEFAULT_ADDITIONAL_LANGUAGE_CODES + if lang in additional_language_codes: if verbose: print("found in the table of additional languages.") diff --git a/kgtk/join/nodereader.py b/kgtk/join/nodereader.py index 9b50fea02..0f83d8b8a 100644 --- a/kgtk/join/nodereader.py +++ b/kgtk/join/nodereader.py @@ -13,7 +13,7 @@ from kgtk.join.closableiter import ClosableIter from kgtk.join.enumnameaction import EnumNameAction from kgtk.join.kgtkreader import KgtkReader -from kgtk.join.kgtkvalue import KgtkValueOptions, DEFAULT_KGTK_VALUE_OPTIONS +from kgtk.join.kgtkvalueoptions import KgtkValueOptions, DEFAULT_KGTK_VALUE_OPTIONS from kgtk.join.validationaction import ValidationAction @attr.s(slots=True, frozen=False) From 0b76102e28ab1cd4b66421a7bb0590b8559ec279 Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Sun, 3 May 2020 19:33:14 -0700 Subject: [PATCH 062/278] Compete refactoring the KgtkValueOptions and LanguageValidator. 
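
After this change, KgtkValueOptions lives in kgtk/join/kgtkvalueoptions.py and
LanguageValidator.validate() takes the whole options object instead of a bare
additional_language_codes list. A minimal sketch of the intended usage (the
command-line values below are illustrative, not part of this patch):

    from argparse import ArgumentParser, Namespace

    from kgtk.join.kgtkvalueoptions import KgtkValueOptions, DEFAULT_KGTK_VALUE_OPTIONS
    from kgtk.join.languagevalidator import LanguageValidator

    parser: ArgumentParser = ArgumentParser()
    KgtkValueOptions.add_arguments(parser)
    args: Namespace = parser.parse_args(["--additional-language-codes", "mo", "eml"])

    # Build the value parsing option structure from the parsed arguments.
    value_options: KgtkValueOptions = KgtkValueOptions.from_args(args)

    # The validator consults the additional-codes table only after the
    # ISO 639-3 lookups fail.
    LanguageValidator.validate("mo", options=value_options)               # expected: True
    LanguageValidator.validate("en", options=DEFAULT_KGTK_VALUE_OPTIONS)  # expected: True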
--- kgtk/cli/validate.py | 2 +- kgtk/join/kgtkvalueoptions.py | 4 ---- kgtk/join/languagevalidator.py | 15 ++++++--------- 3 files changed, 7 insertions(+), 14 deletions(-) diff --git a/kgtk/cli/validate.py b/kgtk/cli/validate.py index db46d66ee..82fc6ab01 100644 --- a/kgtk/cli/validate.py +++ b/kgtk/cli/validate.py @@ -18,7 +18,7 @@ from kgtk.join.enumnameaction import EnumNameAction from kgtk.join.kgtkformat import KgtkFormat from kgtk.join.kgtkreader import KgtkReader -from kgtk.join.kgtkvalue import KgtkValueOptions +from kgtk.join.kgtkvalueoptions import KgtkValueOptions from kgtk.join.validationaction import ValidationAction def parser(): diff --git a/kgtk/join/kgtkvalueoptions.py b/kgtk/join/kgtkvalueoptions.py index e29ac6367..d7e0acb29 100644 --- a/kgtk/join/kgtkvalueoptions.py +++ b/kgtk/join/kgtkvalueoptions.py @@ -4,12 +4,8 @@ from argparse import ArgumentParser, Namespace import attr -import sys import typing -from kgtk.join.kgtkformat import KgtkFormat -from kgtk.join.languagevalidator import LanguageValidator - @attr.s(slots=True, frozen=True) class KgtkValueOptions: """ diff --git a/kgtk/join/languagevalidator.py b/kgtk/join/languagevalidator.py index b22366a2c..5e4eedb7a 100644 --- a/kgtk/join/languagevalidator.py +++ b/kgtk/join/languagevalidator.py @@ -8,7 +8,7 @@ import pycountry # type: ignore import typing -from kgtkvalueoptions import KgtkValueOptions, DEFAULT_KGTK_VALUE_OPTIONS +from kgtk.join.kgtkvalueoptions import KgtkValueOptions, DEFAULT_KGTK_VALUE_OPTIONS # Problem: pycountry incorporates the Debian team's ISO 639-3 table, # which as of 03-May-2020 has not been updated in four years! @@ -121,19 +121,16 @@ def main(): """ parser: ArgumentParser = ArgumentParser() parser.add_argument(dest="values", help="The values(s) to test", type=str, nargs="+") - - parser.add_argument( "--additional-language-codes", dest="additional_language_codes", - help="Additional language codes.", nargs="*", default=None) - parser.add_argument("-v", "--verbose", dest="verbose", help="Print additional progress messages.", action='store_true') + KgtkValueOptions.add_arguments(parser) args: Namespace = parser.parse_args() + # Build the value parsing option structure. + value_options: KgtkValueOptions = KgtkValueOptions.from_args(args) + value: str for value in args.values: - result: bool = LanguageValidator.validate(value, - additional_language_codes=args.additional_language_codes, - verbose=args.verbose) - + result: bool = LanguageValidator.validate(value, options=value_options, verbose=args.verbose) print("%s: %s" % (value, str(result)), flush=True) if __name__ == "__main__": From 7fbe7407bca49b36d2cccf08ce3663bd8a5b4d35 Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Sun, 3 May 2020 19:59:56 -0700 Subject: [PATCH 063/278] Provide a control over language suffix processing. Provide better arguments. 
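
Each lax-checking option now has an explicit allow/disallow flag pair in a
mutually exclusive argparse group, and a new allow_language_suffixes option
controls whether a suffix after a dash (as in "en-GB") is split off before the
ISO 639 lookup. A rough sketch of the expected behavior (the flag used below
is illustrative):

    from argparse import ArgumentParser, Namespace

    from kgtk.join.kgtkvalueoptions import KgtkValueOptions

    parser: ArgumentParser = ArgumentParser()
    KgtkValueOptions.add_arguments(parser)

    # Language suffix processing defaults to on; the new flag turns it off.
    args: Namespace = parser.parse_args(["--disallow-language-suffixes"])
    value_options: KgtkValueOptions = KgtkValueOptions.from_args(args)
    print(value_options.allow_language_suffixes)  # False

With suffixes allowed, LanguageValidator splits "en-GB" into the base code
"en" and the suffix "GB" before validating; with them disallowed, the full
string "en-GB" is looked up as-is and would be expected to fail validation.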
--- kgtk/cli/validate.py | 2 ++ kgtk/join/kgtkvalueoptions.py | 43 +++++++++++++++++++++++++++++----- kgtk/join/languagevalidator.py | 2 +- 3 files changed, 40 insertions(+), 7 deletions(-) diff --git a/kgtk/cli/validate.py b/kgtk/cli/validate.py index 82fc6ab01..a823bcb5a 100644 --- a/kgtk/cli/validate.py +++ b/kgtk/cli/validate.py @@ -149,6 +149,7 @@ def run(kgtk_files: typing.Optional[typing.List[typing.Optional[Path]]], header_error_action: ValidationAction = ValidationAction.EXIT, unsafe_column_name_action: ValidationAction = ValidationAction.REPORT, additional_language_codes: typing.Optional[typing.List[str]] = None, + allow_language_suffixes: bool = False, allow_lax_strings: bool = False, allow_lax_lq_strings: bool = False, allow_month_or_day_zero: bool = False, @@ -174,6 +175,7 @@ def run(kgtk_files: typing.Optional[typing.List[typing.Optional[Path]]], value_options: KgtkValueOptions = KgtkValueOptions(allow_month_or_day_zero=allow_month_or_day_zero, allow_lax_strings=allow_lax_strings, allow_lax_lq_strings=allow_lax_lq_strings, + allow_language_suffixes=allow_language_suffixes, additional_language_codes=additional_language_codes) try: diff --git a/kgtk/join/kgtkvalueoptions.py b/kgtk/join/kgtkvalueoptions.py index d7e0acb29..cd07e5aa0 100644 --- a/kgtk/join/kgtkvalueoptions.py +++ b/kgtk/join/kgtkvalueoptions.py @@ -27,6 +27,8 @@ class KgtkValueOptions: # check if internal single quotes are excaped by backslash. allow_lax_lq_strings: bool = attr.ib(validator=attr.validators.instance_of(bool), default=False) + allow_language_suffixes: bool = attr.ib(validator=attr.validators.instance_of(bool), default=True) + # If this list gets long, we may want to turn it into a map to make lookup # more efficient. additional_language_codes: typing.Optional[typing.List[str]] = attr.ib(validator=attr.validators.optional(attr.validators.deep_iterable(member_validator=attr.validators.instance_of(str), @@ -39,19 +41,39 @@ def add_arguments(cls, parser: ArgumentParser): parser.add_argument( "--additional-language-codes", dest="additional_language_codes", help="Additional language codes.", nargs="*", default=None) - parser.add_argument( "--allow-lax-strings", dest="allow_lax_strings", - help="Do not check if double quotes are backslashed inside strings.", action='store_true') + lsgroup= parser.add_mutually_exclusive_group() + lsgroup.add_argument( "--allow-language-suffixes", dest="allow_language_suffixes", + help="Allow language identifier suffixes starting with a dash.", action='store_true', default=True) + + lsgroup.add_argument( "--disallow-language-suffixes", dest="allow_language_suffixes", + help="Disallow language identifier suffixes starting with a dash.", action='store_false') + + laxgroup= parser.add_mutually_exclusive_group() + laxgroup.add_argument( "--allow-lax-strings", dest="allow_lax_strings", + help="Do not check if double quotes are backslashed inside strings.", action='store_true', default=False) + + laxgroup.add_argument( "--disallow-lax-strings", dest="allow_lax_strings", + help="Check if double quotes are backslashed inside strings.", action='store_false') - parser.add_argument( "--allow-lax-lq-strings", dest="allow_lax_lq_strings", - help="Do not check if single quotes are backslashed inside language qualified strings.", action='store_true') + lqgroup= parser.add_mutually_exclusive_group() + lqgroup.add_argument( "--allow-lax-lq-strings", dest="allow_lax_lq_strings", + help="Do not check if single quotes are backslashed inside language qualified strings.", action='store_true', 
                          default=False)
+
+    lqgroup.add_argument( "--disallow-lax-lq-strings", dest="allow_lax_lq_strings",
+                          help="Check if single quotes are backslashed inside language qualified strings.", action='store_false')
+
+    md0group= parser.add_mutually_exclusive_group()
+    md0group.add_argument( "--allow-month-or-day-zero", dest="allow_month_or_day_zero",
+                           help="Allow month or day zero in dates.", action='store_true', default=False)
+
+    md0group.add_argument( "--disallow-month-or-day-zero", dest="allow_month_or_day_zero",
+                           help="Disallow month or day zero in dates.", action='store_false')
 
     @classmethod
     # Build the value parsing option structure.
     def from_args(cls, args: Namespace)->'KgtkValueOptions':
         return cls(allow_month_or_day_zero=args.allow_month_or_day_zero,
+                   allow_language_suffixes=args.allow_language_suffixes,
                    allow_lax_strings=args.allow_lax_strings,
                    allow_lax_lq_strings=args.allow_lax_lq_strings,
                    additional_language_codes=args.additional_language_codes)
@@ -69,5 +91,14 @@ def main():
     # Build the value parsing option structure.
     value_options: KgtkValueOptions = KgtkValueOptions.from_args(args)
 
+    print("allow_month_or_day_zero: %s" % str(value_options.allow_month_or_day_zero))
+    print("allow_lax_strings: %s" % str(value_options.allow_lax_strings))
+    print("allow_lax_lq_strings: %s" % str(value_options.allow_lax_lq_strings))
+    print("allow_language_suffixes: %s" % str(value_options.allow_language_suffixes))
+    if value_options.additional_language_codes is None:
+        print("additional_language_codes: None")
+    else:
+        print("additional_language_codes: [ %s ]" % ", ".join(value_options.additional_language_codes))
+
 if __name__ == "__main__":
     main()
diff --git a/kgtk/join/languagevalidator.py b/kgtk/join/languagevalidator.py
index 5e4eedb7a..4c7c9ff2e 100644
--- a/kgtk/join/languagevalidator.py
+++ b/kgtk/join/languagevalidator.py
@@ -67,7 +67,7 @@ def validate(cls,
         save_lang: str = lang # for the debug prints below.
country_or_dialect: str = "" - if "-" in lang: + if options.allow_language_suffixes and "-" in lang: (lang, country_or_dialect) = lang.split("-", 1) if verbose: print("'%s' split into '%s' and '%s'" % (save_lang, lang, country_or_dialect)) From 40f6bb8045a87d8bb0941adf18c64d61e681e7c9 Mon Sep 17 00:00:00 2001 From: Naren Date: Sun, 3 May 2020 22:57:56 -0700 Subject: [PATCH 064/278] accept multiple languages --- kgtk/cli/import_wikidata.py | 60 ++++++++++++++++++++----------------- 1 file changed, 32 insertions(+), 28 deletions(-) diff --git a/kgtk/cli/import_wikidata.py b/kgtk/cli/import_wikidata.py index abf623746..0019fef32 100644 --- a/kgtk/cli/import_wikidata.py +++ b/kgtk/cli/import_wikidata.py @@ -129,7 +129,7 @@ def enter(self): self.cnt=0 self.write_mode='w' - def process(self,line,node_file,edge_file,qual_file,lang,doc_id): + def process(self,line,node_file,edge_file,qual_file,languages,doc_id): write_mode='a' if self.first==True: write_mode='w' @@ -140,7 +140,6 @@ def process(self,line,node_file,edge_file,qual_file,lang,doc_id): nrows=[] erows=[] qrows=[] - site_filter = '{}wiki'.format(lang) clean_line = line.strip() if clean_line.endswith(b","): clean_line = clean_line[:-1] @@ -157,13 +156,15 @@ def process(self,line,node_file,edge_file,qual_file,lang,doc_id): if self.parse_labels: labels = obj["labels"] if labels: - lang_label = labels.get(lang, None) - if lang_label: - lang_label['value']=lang_label['value'].replace('|','\\|') - row.append( - '\'' + lang_label['value'].replace("'","\\'") + '\'' + "@" + lang) - else: - row.append("") + label_list=[] + for lang in languages: + lang_label = labels.get(lang, None) + if lang_label: + lang_label['value']=lang_label['value'].replace('|','\\|') + label_list.append( + '\'' + lang_label['value'].replace("'","\\'") + '\'' + "@" + lang) + if len(label_list)>0: + row.append("|".join(label_list)) else: row.append("") row.append(entry_type) @@ -171,31 +172,33 @@ def process(self,line,node_file,edge_file,qual_file,lang,doc_id): if self.parse_descr: descriptions = obj["descriptions"] if descriptions: - lang_descr = descriptions.get(lang, None) - if lang_descr: - lang_descr['value']=lang_descr['value'].replace('|','\\|') - row.append( - '\'' + lang_descr['value'].replace("'","\\'") + '\'' + "@" + lang) - else: - row.append("") + descr_list=[] + for lang in languages: + lang_descr = descriptions.get(lang, None) + if lang_descr: + lang_descr['value']=lang_descr['value'].replace('|','\\|') + descr_list.append( + '\'' + lang_descr['value'].replace("'","\\'") + '\'' + "@" + lang) + if len(descr_list)>0: + row.append("|".join(descr_list)) else: row.append("") if self.parse_aliases: aliases = obj["aliases"] if aliases: - lang_aliases = aliases.get(lang, None) - if lang_aliases: - alias_list = [] - for item in lang_aliases: - item['value']=item['value'].replace('|','\\|') - alias_list.append( - '\'' + item['value'].replace("'","\\'") + '\'' + "@" + lang) - row.append("|".join(alias_list)) - else: - row.append('') + alias_list = [] + for lang in languages: + lang_aliases = aliases.get(lang, None) + if lang_aliases: + for item in lang_aliases: + item['value']=item['value'].replace('|','\\|') + alias_list.append( + '\'' + item['value'].replace("'","\\'") + '\'' + "@" + lang) + if len(alias_list)>0: + row.append("|".join(alias_list)) else: - row.append('') + row.append("") #row.append(doc_id) if node_file: @@ -428,6 +431,7 @@ def process(self,line,node_file,edge_file,qual_file,lang,doc_id): try: start=time.time() + languages=lang.split(',') if node_file: 
            header = ['id','label','type','description','alias']
            with open(node_file+'_header', 'w', newline='') as myfile:
@@ -466,7 +470,7 @@ def process(self,line,node_file,edge_file,qual_file,lang,doc_id):
             for cnt, line in enumerate(file):
                 if limit and cnt >= limit:
                     break
-                pp.add_task(line,node_file,edge_file,qual_file,lang,source)
+                pp.add_task(line,node_file,edge_file,qual_file,languages,source)
             pp.task_done()
             pp.join()
         if node_file:

From 664796e997724be38b70a2b1c4eac08a7da3729c Mon Sep 17 00:00:00 2001
From: saggu
Date: Mon, 4 May 2020 11:27:00 -0700
Subject: [PATCH 065/278] add ids, options to specify label for properties

---
 kgtk/cli/gt_loader.py | 54 ++++++++++++++++++++++++++++++++++++-------
 1 file changed, 46 insertions(+), 8 deletions(-)

diff --git a/kgtk/cli/gt_loader.py b/kgtk/cli/gt_loader.py
index c65514849..b78167725 100644
--- a/kgtk/cli/gt_loader.py
+++ b/kgtk/cli/gt_loader.py
@@ -29,9 +29,25 @@ def add_arguments(parser):
                         help='Graph tool file to dump the graph to - if empty, it will not be saved.')
     parser.add_argument('--output-stats', action='store_true', dest='output_stats',
                         help='do not output the graph but statistics only')
-
-
-def run(filename, directed, compute_degrees, compute_pagerank, compute_hits, log_file, output, output_stats):
+    parser.add_argument('--vertex-in-degree-property', action='store', dest='vertex_in_degree',
+                        default='vertex_in_degree',
+                        help='label for edge: vertex in degree property')
+    parser.add_argument('--vertex-out-degree-property', action='store', dest='vertex_out_degree',
+                        default='vertex_out_degree',
+                        help='label for edge: vertex out degree property')
+    parser.add_argument('--page-rank-property', action='store', dest='vertex_pagerank',
+                        default='vertex_pagerank',
+                        help='label for page rank property')
+    parser.add_argument('--vertex-hits-authority-property', action='store', dest='vertex_auth',
+                        default='vertex_auth',
+                        help='label for edge: vertex hits authority')
+    parser.add_argument('--vertex-hits-hubs-property', action='store', dest='vertex_hubs',
+                        default='vertex_hubs',
+                        help='label for edge: vertex hits hubs')
+
+
+def run(filename, directed, compute_degrees, compute_pagerank, compute_hits, log_file, output, output_stats,
+        vertex_in_degree, vertex_out_degree, vertex_pagerank, vertex_auth, vertex_hubs):
     from kgtk.exceptions import KGTKException
     def infer_index(h, options=[]):
         for o in options:
@@ -45,6 +61,11 @@ def infer_predicate(h, options=[]):
                 return o
         return ''
 
+    v_prop_dict = {
+        'vertex_pagerank': vertex_pagerank,
+        'vertex_hubs': vertex_hubs,
+        'vertex_auth': vertex_auth
+    }
     try:
         # import modules locally
         import socket
@@ -115,21 +136,38 @@ def infer_predicate(h, options=[]):
                 for n_id, n_label, authority in main_auth:
                     writer.write('%s\t%s\t%f\n' % (n_id, n_label, authority))
 
-        sys.stdout.write('node1\tproperty\tnode2\n')
+        sys.stdout.write('id\tnode1\tproperty\tnode2\n')
+        id_count = 0
         if not output_stats:
             for e in G2.edges():
                 sid, oid = e
                 lbl = G2.ep[predicate][e]
-                sys.stdout.write('%s\t%s\t%s\n' % (G2.vp[id_col][sid], lbl, G2.vp[id_col][oid]))
+                sys.stdout.write(
+                    '%s\t%s\t%s\t%s\n' % (
+                        '{}-{}-{}'.format(G2.vp[id_col][sid], lbl, id_count), G2.vp[id_col][sid], lbl,
+                        G2.vp[id_col][oid]))
+                id_count += 1
 
+        id_count = 0
         for v in G2.vertices():
             v_id = G2.vp[id_col][v]
-            sys.stdout.write('{}\t{}\t{}\n'.format(v_id, 'vertex_in_degree', v.in_degree()))
-            sys.stdout.write('{}\t{}\t{}\n'.format(v_id, 'vertex_out_degree', v.out_degree()))
+            sys.stdout.write(
+                '{}\t{}\t{}\t{}\n'.format('{}-{}-{}'.format(v_id, vertex_in_degree, id_count), v_id,
+                                          vertex_in_degree, v.in_degree()))
+            id_count += 1
+            sys.stdout.write(
+                '{}\t{}\t{}\t{}\n'.format('{}-{}-{}'.format(v_id, vertex_out_degree, id_count), v_id,
+                                          vertex_out_degree, v.out_degree()))
+            id_count += 1
+
             for vprop in G2.vertex_properties.keys():
                 if vprop == id_col:
                     continue
-                sys.stdout.write('%s\t%s\t%s\n' % (v_id, vprop, G2.vp[vprop][v]))
+                sys.stdout.write(
+                    '%s\t%s\t%s\t%s\n' % (
+                        '{}-{}-{}'.format(v_id, v_prop_dict[vprop], id_count), v_id, v_prop_dict[vprop],
+                        G2.vp[vprop][v]))
+                id_count += 1
 
         if output:
             writer.write('now saving the graph to %s\n' % output)

From c46b1d2cbf23e7b7e514a92a33d9ffcd00195a67 Mon Sep 17 00:00:00 2001
From: Craig Milo Rogers
Date: Mon, 4 May 2020 12:41:20 -0700
Subject: [PATCH 066/278] Documentation cleanup. Replace erroneous backslash
 with vertical bar in si_unit_pat.

---
 kgtk/join/kgtkvalue.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/kgtk/join/kgtkvalue.py b/kgtk/join/kgtkvalue.py
index 677bcdd51..2d95bd4aa 100644
--- a/kgtk/join/kgtkvalue.py
+++ b/kgtk/join/kgtkvalue.py
@@ -178,7 +178,7 @@ def is_number_or_quantity(self, idx: typing.Optional[int] = None)->bool:
         # http://www.csun.edu/~vceed002/ref/measurement/units/units.pdf
         #
         # Note: if Q were in this list, it would conflict with Wikidata nodes (below).
-        si_unit_pat: str = r'(?:m|kg|s|C|K|mol|cd|F|M|A|N|ohms|V|J|Hz|lx|H|Wb|V\W|Pa)'
+        si_unit_pat: str = r'(?:m|kg|s|C|K|mol|cd|F|M|A|N|ohms|V|J|Hz|lx|H|Wb|V|W|Pa)'
         si_power_pat: str = r'(?:-1|2|3)' # Might need more.
         si_combiner_pat: str = r'[./]'
         si_pat: str = r'(?:{si_unit}{si_power}?(?:{si_combiner}{si_unit}{si_power}?)*)'.format(si_unit=si_unit_pat,
@@ -407,9 +407,6 @@ def is_valid_location_coordinates(self, idx: typing.Optional[int] = None)->bool:
         Return False if this value is a list and idx is None.
         Otherwise, return True if the value looks like valid location coordinates.
-
-        Note: The coordinates must look exactly like the examples in KGTK
-        File Format v2, excelt for optional +/- characters.
- @043.26193/010.92708 """ if self.is_list() and idx is None: From 2ee03e46ad80ff6cbb64c8011d7a9296fa09f668 Mon Sep 17 00:00:00 2001 From: ckxz105 Date: Mon, 4 May 2020 17:37:31 -0700 Subject: [PATCH 067/278] move main embedding codes to gt/embedding_utils.py, add support for kgtk format on multiprocessing --- kgtk/cli/text_embedding.py | 682 +++---------------------------------- kgtk/gt/embedding_utils.py | 676 ++++++++++++++++++++++++++++++++++++ 2 files changed, 716 insertions(+), 642 deletions(-) create mode 100644 kgtk/gt/embedding_utils.py diff --git a/kgtk/cli/text_embedding.py b/kgtk/cli/text_embedding.py index 35f8e8c35..6083da388 100644 --- a/kgtk/cli/text_embedding.py +++ b/kgtk/cli/text_embedding.py @@ -21,621 +21,6 @@ ] -class EmbeddingVector: - def __init__(self, model_name=None, query_server=None, cache_config: dict = {}): - from sentence_transformers import SentenceTransformer, SentencesDataset, LoggingHandler, losses, models # type: ignore - import logging - import re - self._logger = logging.getLogger(__name__) - from collections import defaultdict - if model_name is None: - self.model_name = 'bert-base-nli-mean-tokens' - # xlnet need to be trained before using, we can't use this for now - # elif model_name == "xlnet-base-cased": - # word_embedding_model = models.XLNet('xlnet-base-cased') - # # Apply mean pooling to get one fixed sized sentence vector - # pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), - # pooling_mode_mean_tokens=True, - # pooling_mode_cls_token=False, - # pooling_mode_max_tokens=False) - # self.model = SentenceTransformer(modules=[word_embedding_model, pooling_model]) - else: - self.model_name = model_name - self._logger.info("Using model {}".format(self.model_name)) - self.model = SentenceTransformer(self.model_name) - # setup redis cache server - if query_server is None or query_server == "": - self.wikidata_server = "https://query.wikidata.org/sparql" - else: - self.wikidata_server = query_server - use_cache = cache_config.get("use_cache", False) - if use_cache: - import redis - host = cache_config.get("host", "dsbox01.isi.edu") - port = cache_config.get("port", 6379) - self.redis_server = redis.Redis(host=host, port=port, db=0) - try: - _ = self.redis_server.get("foo") - self._logger.debug("Cache server {}:{} connected!".format(host, port)) - except: - self._logger.error("Cache server {}:{} is not able to be connected! 
Will not use cache!".format(host, port)) - self.redis_server = None - else: - self.redis_server = None - self.qnodes_descriptions = dict() - self.vectors_map = dict() - self.property_labels_dict = dict() - self.q_node_to_label = dict() - self.node_labels = dict() - self.vectors_2D = None - self.vector_dump_file = None - self.gt_nodes = set() - self.candidates = defaultdict(dict) - self.metadata = [] - self.gt_indexes = set() - self.input_format = "" - self.token_pattern = re.compile(r"(?u)\b\w\w+\b") - - def get_sentences_embedding(self, sentences: typing.List[str], qnodes: typing.List[str]): - """ - transform a list of sentences to embedding vectors - """ - from ast import literal_eval - if self.redis_server is not None: - sentence_embeddings = [] - for each_node, each_sentence in zip(qnodes, sentences): - query_cache_key = each_node + each_sentence - if self.model_name != "bert-base-wikipedia-sections-mean-tokens": - query_cache_key += self.model_name - cache_res = self.redis_server.get(query_cache_key) - if cache_res is not None: - sentence_embeddings.append(literal_eval(cache_res.decode("utf-8"))) - # self._logger.error("{} hit!".format(each_node+each_sentence)) - else: - each_embedding = self.model.encode([each_sentence], show_progress_bar=False) - sentence_embeddings.extend(each_embedding) - self.redis_server.set(query_cache_key, str(each_embedding[0].tolist())) - else: - sentence_embeddings = self.model.encode(sentences, show_progress_bar=False) - return sentence_embeddings - - def send_sparql_query(self, query_body: str): - """ - a simple wrap to send the query and return the returned results - """ - from SPARQLWrapper import SPARQLWrapper, JSON, POST, URLENCODED # type: ignore - qm = SPARQLWrapper(self.wikidata_server) - qm.setReturnFormat(JSON) - qm.setMethod(POST) - qm.setRequestMethod(URLENCODED) - self._logger.debug("Sent query is:") - self._logger.debug(str(query_body)) - qm.setQuery(query_body) - try: - results = qm.query().convert()['results']['bindings'] - return results - except: - raise KGTKException("Sending Sparql query to {} failed!".format(self.wikidata_server)) - - def _get_labels(self, nodes: typing.List[str]): - query_nodes = " ".join(["wd:{}".format(each) for each in nodes]) - query = """ - select ?item ?nodeLabel - where { - values ?item {""" + query_nodes + """} - ?item rdfs:label ?nodeLabel. - FILTER(LANG(?nodeLabel) = "en"). - } - """ - results2 = self.send_sparql_query(query) - for each_res in results2: - node_id = each_res['item']['value'].split("/")[-1] - value = each_res['nodeLabel']['value'] - self.node_labels[node_id] = value - - def _get_labels_and_descriptions(self, query_qnodes: str, need_find_label: bool, need_find_description: bool): - query_body = """ - select ?item ?itemDescription ?itemLabel - where { - values ?item {""" + query_qnodes + """ } - SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". 
} - } - """ - results = self.send_sparql_query(query_body) - for each in results: - each_node = each['item']['value'].split("/")[-1] - if 'itemDescription' in each: - description = each['itemDescription']['value'] - else: - description = "" - if "itemLabel" in each: - label = each['itemLabel']['value'] - else: - label = "" - if need_find_label: - self.candidates[each_node]["label_properties"] = [label] - if need_find_description: - self.candidates[each_node]["description_properties"] = [description] - - def _get_property_values(self, query_qnodes, query_part_names, query_part_properties): - used_p_node_ids = set() - for part_name, part in zip(query_part_names, query_part_properties): - if part_name == "isa_properties": - self._get_labels(part) - for i, each in enumerate(part): - if each not in {"label", "description", "all"}: - query_body2 = """ - select ?item ?eachPropertyLabel - where {{ - values ?item {{{all_nodes}}} - ?item wdt:{qnode} ?eachProperty. - SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }} - }} - """.format(all_nodes=query_qnodes, qnode=each) - results2 = self.send_sparql_query(query_body2) - - for each_res in results2: - node_id = each_res['item']['value'].split("/")[-1] - value = each_res['eachPropertyLabel']['value'] - if part_name == "isa_properties" and self.node_labels[each].endswith("of"): - value = self.node_labels[each] + "||" + value - used_p_node_ids.add(node_id) - if part_name in self.candidates[node_id]: - self.candidates[node_id][part_name] = value - else: - self.candidates[node_id][part_name] = {value} - return used_p_node_ids - - def _get_all_properties(self, query_qnodes, used_p_node_ids, properties_list): - has_properties_set = set(properties_list[3]) - query_body3 = """ - select DISTINCT ?item ?p_entity ?p_entityLabel - where { - values ?item {""" + query_qnodes + """} - ?item ?p ?o. - FILTER regex(str(?p), "^http://www.wikidata.org/prop/P", "i") - BIND (IRI(REPLACE(STR(?p), "http://www.wikidata.org/prop", "http://www.wikidata.org/entity")) AS ?p_entity) . - SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". 
} - } - """ - results3 = self.send_sparql_query(query_body3) - for each in results3: - node_name = each['item']['value'].split("/")[-1] - p_node_id = each['p_entity']['value'].split("/")[-1] - p_node_label = each['p_entityLabel']['value'] - if p_node_id not in used_p_node_ids: - if properties_list[3] == ["all"] or p_node_id in has_properties_set: - if "has_properties" in self.candidates[node_name]: - self.candidates[node_name]["has_properties"].add(p_node_label) - else: - self.candidates[node_name]["has_properties"] = {p_node_label} - - def get_item_description(self, qnodes: typing.List[str] = None, target_properties: dict = {}): - """ - use sparql query to get the descriptions of given Q nodes - """ - if qnodes is None: - qnodes = self.candidates - if "all" in target_properties: - find_all_properties = True - else: - find_all_properties = False - properties_list = [[] for _ in range(4)] - names = ["labels", "descriptions", "isa_properties", "has_properties"] - for k, v in target_properties.items(): - if v == "label_properties": - properties_list[0].append(k) - elif v == "description_properties": - properties_list[1].append(k) - elif v == "isa_properties": - properties_list[2].append(k) - elif v == "has_properties": - properties_list[3].append(k) - - sentences_cache_dict = {} - if self.redis_server is not None: - for each_node in qnodes: - cache_res = self.redis_server.get(each_node + str(properties_list)) - if cache_res is not None: - sentences_cache_dict[each_node] = cache_res.decode("utf-8") - - if len(sentences_cache_dict) > 0: - qnodes = set(qnodes) - set(sentences_cache_dict.keys()) - - # only need to do query when we still have remained nodes - if len(qnodes) > 0: - need_find_label = "label" in properties_list[0] - need_find_description = "description" in properties_list[1] - query_qnodes = "" - for each in qnodes: - query_qnodes += "wd:{} ".format(each) - - # this is used to get corresponding labels / descriptions - if need_find_label or need_find_description: - self._get_labels_and_descriptions(query_qnodes, need_find_label, need_find_description) - - if len(properties_list[3]) > len(qnodes): - # in this condition, we have too many properties need to be queried, it will waste time - # query to get all properties then filtering would save more times - find_all_properties = True - query_part2_names = names[:3] - query_part2_properties = properties_list[:3] - else: - query_part2_names = names - query_part2_properties = properties_list - # this is used to get corresponding labels of properties values - used_p_node_ids = self._get_property_values(query_qnodes, query_part2_names, query_part2_properties) - - # if need get all properties, we need to run extra query - if find_all_properties: - self._get_all_properties(query_qnodes, used_p_node_ids, properties_list) - - for each_node_id in qnodes: - each_sentence = self.attribute_to_sentence(self.candidates[each_node_id], each_node_id) - self.candidates[each_node_id]["sentence"] = each_sentence - if self.redis_server is not None: - self.redis_server.set(each_node_id + str(properties_list), each_sentence) - - for each_node_id, sentence in sentences_cache_dict.items(): - self.candidates[each_node_id]["sentence"] = sentence - - def read_input(self, file_path: str, skip_nodes_set: set = None, - input_format: str = "kgtk_format", target_properties: dict = {}, - property_labels_dict: dict = {}, black_list_set: set = set() - ): - """ - load the input candidates files - """ - from collections import defaultdict - import pandas as pd # type: ignore 
- import numpy as np - import math - - self.property_labels_dict = property_labels_dict - - if input_format == "test_format": - self.input_format = input_format - input_df = pd.read_csv(file_path) - gt = {} - count = 0 - if "GT_kg_id" in input_df.columns: - gt_column_id = "GT_kg_id" - elif "kg_id" in input_df.columns: - gt_column_id = "kg_id" - else: - raise KGTKException("Can't find ground truth id column! It should either named as `GT_kg_id` or `kg_id`") - - for _, each in input_df.iterrows(): - if isinstance(each["candidates"], str): - temp = str(each['candidates']).split("|") - elif each['candidates'] is np.nan or math.isnan(each['candidates']): - temp = [] - - to_remove_q = set() - if each[gt_column_id] is np.nan: - self._logger.warning("Ignore NaN gt value form {}".format(str(each))) - each[gt_column_id] = "" - gt_nodes = each[gt_column_id].split(" ") - label = str(each["label"]) - if len(gt_nodes) == 0: - self._logger.error("Skip a row with no ground truth node given: as {}".format(str(each))) - continue - if label == "": - self._logger.error("Skip a row with no label given: as {}".format(str(each))) - continue - temp.extend(gt_nodes) - - for each_q in temp: - self.q_node_to_label[each_q] = label - if skip_nodes_set is not None and each_q in skip_nodes_set: - to_remove_q.add(each_q) - temp = set(temp) - to_remove_q - count += len(temp) - self.gt_nodes.add(each[gt_column_id]) - self.get_item_description(temp, target_properties) - - self._logger.info("Totally {} rows with {} candidates loaded.".format(str(len(gt)), str(count))) - - elif input_format == "kgtk_format": - # assume the input edge file is sorted - if "all" in target_properties: - _ = target_properties.pop("all") - add_all_properties = True - else: - add_all_properties = False - - self.input_format = input_format - with open(file_path, "r") as f: - # get header - headers = f.readline().replace("\n", "").split("\t") - if len(headers) < 3: - raise KGTKException( - "No enough columns found on given input file. 
Only {} columns given but at least 3 needed.".format( - len(headers))) - elif "node" in headers and "property" in headers and "value" in headers: - column_references = {"node": headers.index("node"), - "property": headers.index("property"), - "value": headers.index("value")} - elif len(headers) == 3: - column_references = {"node": 0, - "property": 1, - "value": 2} - else: - missing_column = {"node", "property", "value"} - set(headers) - raise KGTKException("Missing column {}".format(missing_column)) - self._logger.debug("column index information: ") - self._logger.debug(str(column_references)) - # read contents - each_node_attributes = {"has_properties": [], "isa_properties": [], "label_properties": [], - "description_properties": []} - current_process_node_id = None - for each_line in f: - each_line = each_line.replace("\n", "").split("\t") - node_id = each_line[column_references["node"]] - node_property = each_line[column_references["property"]] - node_value = each_line[column_references["value"]] - # remove @ mark - if "@" in node_value and node_value[0] != "@": - node_value_org = node_value - node_value = node_value[:node_value.index("@")] - - # remove extra double quote " and single quote ' - if node_value[0] == '"' and node_value[-1] == '"': - node_value = node_value[1:-1] - if node_value[0] == "'" and node_value[-1] == "'": - node_value = node_value[1:-1] - - if current_process_node_id != node_id: - if current_process_node_id is None: - current_process_node_id = node_id - else: - # if we get to next id - # concate all properties into one sentence to represent the Q node - concated_sentence = self.attribute_to_sentence(each_node_attributes, current_process_node_id) - each_node_attributes["sentence"] = concated_sentence - self.candidates[current_process_node_id] = each_node_attributes - # after write down finish, we can cleaer and start parsing next one - each_node_attributes = {"has_properties": [], "isa_properties": [], "label_properties": [], - "description_properties": []} - # update to new id - current_process_node_id = node_id - - if node_property in target_properties: - each_node_attributes[target_properties[node_property]].append(node_value) - if add_all_properties and each_line[column_references["value"]][0] == "P": - each_node_attributes["has_properties"].append(node_value) - - else: - raise KGTKException("Unkonwn input format {}".format(input_format)) - - self._logger.info("Totally {} Q nodes loaded.".format(len(self.candidates))) - self.vector_dump_file = "dump_vectors_{}_{}.pkl".format(file_path[:file_path.rfind(".")], self.model_name) - # self._logger.debug("The cache file name will be {}".format(self.vector_dump_file)) - - def get_real_label_name(self, node): - if node in self.property_labels_dict: - return self.property_labels_dict[node] - else: - return node - - def attribute_to_sentence(self, v, node_id=None): - concated_sentence = "" - have_isa_properties = False - # sort the properties to ensure the sentence always same - v = {key: sorted(list(value)) for key, value in v.items() if len(value) > 0} - if "label_properties" in v and len(v["label_properties"]) > 0: - concated_sentence += self.get_real_label_name(v["label_properties"][0]) - if "description_properties" in v and len(v["description_properties"]) > 0: - if concated_sentence != "" and v["description_properties"][0] != "": - concated_sentence += ", " - concated_sentence += self.get_real_label_name(v["description_properties"][0]) - if "isa_properties" in v and len(v["isa_properties"]) > 0: - 
have_isa_properties = True - temp = "" - for each in v["isa_properties"]: - each = self.get_real_label_name(each) - if "||" in each: - if "instance of" in each: - each = each.split("||")[1] - else: - each = each.replace("||", " ") - temp += each + ", " - if concated_sentence != "" and temp != "": - concated_sentence += " is a " - elif concated_sentence == "": - concated_sentence += "It is a " - concated_sentence += temp[:-2] - if "has_properties" in v and len(v["has_properties"]) > 0: - temp = [self.get_real_label_name(each) for each in v["has_properties"]] - if concated_sentence != "" and temp[0] != "": - if have_isa_properties: - concated_sentence += ", and has " - else: - concated_sentence += " has " - elif temp[0] != "": - concated_sentence += "It has " - concated_sentence += " and ".join(temp) - self._logger.debug("Transform node {} --> {}".format(node_id, concated_sentence)) - return concated_sentence - - def get_vetors(self): - """ - main function to get the vector representations of the descriptions - """ - import os - import time - from tqdm import tqdm # type: ignore - - start_all = time.time() - self._logger.info("Now generating embedding vector.") - for q_node, each_item in tqdm(self.candidates.items()): - # do process for each row(one target) - sentence = each_item["sentence"] - if isinstance(sentence, bytes): - sentence = sentence.decode("utf-8") - vectors = self.get_sentences_embedding([sentence], [q_node]) - self.vectors_map[q_node] = vectors[0] - self._logger.info("Totally used {} seconds.".format(str(time.time() - start_all))) - - def dump_vectors(self, file_name, type_=None): - import pickle - if file_name.endswith(".pkl"): - file_name = file_name.replace(".pkl", "") - if type_ == "2D": - with open(file_name + ".pkl", "wb") as f: - pickle.dump(self.vectors_2D, f) - dimension = len(self.vectors_2D[0]) - with open(file_name + ".tsv", "w") as f: - for each in self.vectors_2D: - for i, each_val in enumerate(each): - _ = f.write(str(each_val)) - if i != dimension - 1: - _ = f.write("\t") - _ = f.write("\n") - elif type_ == "metadata": - with open(file_name + "_metadata.tsv", "w") as f: - for each in self.metadata: - _ = f.write(each + "\n") - else: - with open(file_name + ".pkl", "wb") as f: - pickle.dump(self.vectors_map, f) - with open(file_name + ".tsv", "w") as f: - for each in self.vectors_map.values(): - for i in each: - _ = f.write(str(i) + "\t") - _ = f.write("\n") - - def print_vector(self, vectors, output_properties: str = "text_embedding", output_format="kgtk_format"): - if output_format == "kgtk_format": - print("node\tproperty\tvalue\n", end="") - if self.input_format == "kgtk_format": - for i, each_vector in enumerate(vectors): - print(str(list(self.candidates.keys())[i]) + "\t", end="") - print(output_properties + "\t", end="") - for j, each_dimension in enumerate(each_vector): - if j != len(each_vector) - 1: - print(str(each_dimension) + ",", end="") - else: - print(str(each_dimension) + "\n", end="") - elif self.input_format == "test_format": - all_nodes = list(self.vectors_map.keys()) - for i, each_vector in enumerate(vectors): - print(all_nodes[i] + "\t", end="") - print(output_properties + "\t", end="") - for j, each_dimension in enumerate(each_vector): - if j != len(each_vector) - 1: - print(str(each_dimension) + ",", end="") - else: - print(str(each_dimension) + "\n", end="") - - elif output_format == "tsv_format": - for each_vector in vectors: - for i, each_dimension in enumerate(each_vector): - if i != len(each_vector) - 1: - print(str(each_dimension) 
+ "\t", end="") - else: - print(str(each_dimension) + "\n", end="") - - def plot_result(self, output_properties={}, input_format="kgtk_format", - output_uri: str = "", output_format="kgtk_format", - run_TSNE=True - ): - """ - transfer the vectors to lower dimension so that we can plot - Then save the 2D vector file for further purpose - """ - import os - import time - from sklearn.manifold import TSNE # type: ignore - - self.vectors_map = {k: v for k, v in sorted(self.vectors_map.items(), key=lambda item: item[0], reverse=True)} - vectors = list(self.vectors_map.values()) - # use tsne to reduce dimension - if run_TSNE: - self._logger.warning("Start running TSNE to reduce dimension. It will take a long time.") - start = time.time() - self.vectors_2D = TSNE(n_components=2, random_state=0).fit_transform(vectors) - self._logger.info("Totally used {} seconds.".format(time.time() - start)) - - if input_format == "test_format": - gt_indexes = set() - vector_map_keys = list(self.vectors_map.keys()) - for each_node in self.gt_nodes: - gt_indexes.add(vector_map_keys.index(each_node)) - - self.metadata.append("Q_nodes\tType\tLabel\tDescription") - for i, each in enumerate(self.vectors_map.keys()): - label = self.q_node_to_label[each] - description = self.candidates[each]["sentence"] - if i in gt_indexes: - self.metadata.append("{}\tground_truth_node\t{}\t{}".format(each, label, description)) - else: - self.metadata.append("{}\tcandidates\t{}\t{}".format(each, label, description)) - self.gt_indexes = gt_indexes - - elif input_format == "kgtk_format": - if len(output_properties.get("metatada_properties", [])) == 0: - for k, v in self.candidates.items(): - label = v.get("label_properties", "") - if len(label) > 0 and isinstance(label, list): - label = label[0] - description = v.get("description_properties", "") - if len(description) > 0 and isinstance(description, list): - description = description[0] - self.metadata.append("{}\t\t{}\t{}".format(k, label, description)) - else: - required_properties = output_properties["metatada_properties"] - self.metadata.append("node\t" + "\t".join(required_properties)) - for k, v in self.candidates.items(): - each_metadata = k + "\t" - for each in required_properties: - each_metadata += v.get(each, " ") + "\t" - self.metadata.append(each_metadata) - - metadata_output_path = os.path.join(output_uri, self.vector_dump_file.split("/")[-1]) - if run_TSNE: - self.print_vector(self.vectors_2D, output_properties.get("output_properties"), output_format) - else: - self.print_vector(vectors, output_properties.get("output_properties"), output_format) - if output_uri != "none": - self.dump_vectors(metadata_output_path, "metadata") - - def evaluate_result(self): - """ - for the ground truth nodes, evaluate the average distance to the centroid, the lower the average distance, the better clustering results should be - """ - import numpy as np - centroid = None - gt_nodes_vectors = [] - if len(self.gt_indexes) == 0: - points = set(range(len(self.vectors_map))) - else: - points = self.gt_indexes - for i, each in enumerate(self.vectors_map.keys()): - if i in points: - if centroid is None: - centroid = np.array(self.vectors_map[each]) - else: - centroid += np.array(self.vectors_map[each]) - gt_nodes_vectors.append(self.vectors_map[each]) - centroid = centroid / len(points) - - distance_sum = 0 - for each in gt_nodes_vectors: - distance_sum += self.calculate_distance(each, centroid) - self._logger.info("The average distance for the ground truth nodes to centroid is 
{}".format(distance_sum / len(points))) - - @staticmethod - def calculate_distance(a, b): - if len(a) != len(b): - raise KGTKException("Vector dimension are different!") - dist = 0 - for v1, v2 in zip(a, b): - dist += (v1 - v2) ** 2 - dist = dist ** 0.5 - return dist - - def load_property_labels_file(input_files: typing.List[str]): labels_dict = {} headers = None @@ -718,11 +103,25 @@ def load_black_list_files(file_path): def main(**kwargs): from kgtk.exceptions import KGTKException + import logging + import os + from time import strftime + do_logging = kwargs.get("_debug", False) + if do_logging: + logging_level_class = logging.DEBUG + logger_path = os.path.join(os.environ.get("HOME"), + "kgtk_text_embedding_log_{}.log".format(strftime("%Y-%m-%d-%H-%M"))) + logging.basicConfig(level=logging_level_class, + format="%(asctime)s [%(levelname)s] %(name)s %(lineno)d -- %(message)s", + datefmt='%m-%d %H:%M:%S', + filename=logger_path, + filemode='w') + + _logger = logging.getLogger(__name__) + _logger.warning("Running with logging level {}".format(_logger.getEffectiveLevel())) + try: - import logging - import os import time - from time import strftime import torch import typing import pandas as pd @@ -731,29 +130,17 @@ def main(**kwargs): import re import argparse import pickle - - do_logging = kwargs.get("_debug", False) - if do_logging: - logging_level_class = logging.DEBUG - logger_path = os.path.join(os.environ.get("HOME"), - "kgtk_text_embedding_log_{}.log".format(strftime("%Y-%m-%d-%H-%M"))) - logging.basicConfig(level=logging_level_class, - format="%(asctime)s [%(levelname)s] %(name)s %(lineno)d -- %(message)s", - datefmt='%m-%d %H:%M:%S', - filename=logger_path, - filemode='w') - - _logger = logging.getLogger(__name__) - _logger.warning("Running with logging level {}".format(_logger.getEffectiveLevel())) + from kgtk.gt.embedding_utils import EmbeddingVector # get input parameters from kwargs output_uri = kwargs.get("output_uri", "") + parallel_count = kwargs.get("parallel_count", "1") black_list_files = kwargs.get("black_list_files", "") all_models_names = kwargs.get("all_models_names", ['bert-base-wikipedia-sections-mean-tokens']) input_format = kwargs.get("input_format", "kgtk_format") input_uris = kwargs.get("input_uris", []) output_format = kwargs.get("output_format", "kgtk_format") - property_labels_files = kwargs.get("property_labels_file_uri", "") + property_labels_files = kwargs.get("property_labels_file_uri", []) query_server = kwargs.get("query_server") properties = dict() all_property_relate_inputs = [kwargs.get("label_properties", ["label"]), @@ -802,11 +189,12 @@ def main(**kwargs): for each_model_name in all_models_names: for each_input_file in input_uris: _logger.info("Running {} model on {}".format(each_model_name, each_input_file)) - process = EmbeddingVector(each_model_name, query_server=query_server, cache_config=cache_config) + process = EmbeddingVector(each_model_name, query_server=query_server, cache_config=cache_config, + parallel_count=parallel_count) process.read_input(file_path=each_input_file, skip_nodes_set=black_list_set, input_format=input_format, target_properties=properties, property_labels_dict=property_labels_dict) - process.get_vetors() + process.get_vectors() process.plot_result(output_properties=output_properties, input_format=input_format, output_uri=output_uri, run_TSNE=run_TSNE, output_format=output_format) @@ -866,23 +254,28 @@ def str2bool(v): This argument is only valid for input in kgtk format.""") parser.add_argument('--isa-properties', 
                        action='store', nargs='+', dest='isa_properties', default=["P31"],
-                        help="""The names of the eges for `isa` properties, Default is ["P31"] (the `instance of` node in wikidata).\n
-                        This argument is only valid for input in kgtk format.""")
+                        help="""The names of the edges for `isa` properties, Default is ["P31"] (the `instance of` node in
+                        wikidata).\n This argument is only valid for input in kgtk format.""")
     parser.add_argument('--has-properties', action='store', nargs='+', dest='has_properties', default=["all"],
-                        help="""The names of the eges for `has` properties, Default is ["all"] (will automatically append all properties found for each node).\n This argument is only valid for input in kgtk format.""")
+                        help="""The names of the edges for `has` properties, Default is ["all"] (will automatically append all
+                        properties found for each node).\n This argument is only valid for input in kgtk format.""")
     parser.add_argument('--output-property', action='store', dest='output_properties', default="text_embedding",
-                        help="""The output property name used to record the embedding. Default is `output_properties`. \nThis argument is only valid for output in kgtk format.""")
+                        help="""The output property name used to record the embedding. Default is `text_embedding`. \nThis
+                        argument is only valid for output in kgtk format.""")
     # output
     parser.add_argument('-o', '--embedding-projector-metadata-path', action='store', dest='output_uri', default="",
                         help="output path for the metadata file, default will be current user's home directory")
     parser.add_argument('--output-format', action='store', dest='output_format', default="kgtk",
                         choices=("tsv_format", "kgtk_format"),
-                        help="output format, can either be `tsv_format` or `kgtk_format`. \nIf choose `tsv_format`, the output will be a tsv file, with each row contains only the vector representation of a node. Each dimension is separated by a tab")
+                        help="output format, can either be `tsv_format` or `kgtk_format`. \nIf choose `tsv_format`, the output "
+                             "will be a tsv file, with each row contains only the vector representation of a node. Each "
+                             "dimension is separated by a tab")
     parser.add_argument('--embedding-projector-metatada', action='store', nargs='+', dest='metatada_properties', default=[],
-                        help="""list of properties used to construct a metadata file for use in the Google Embedding Projector: http://projector.tensorflow.org. \n Default: the label and description of each node.""")
+                        help="""list of properties used to construct a metadata file for use in the Google Embedding Projector:
+                        http://projector.tensorflow.org.
\n Default: the label and description of each node.""") # black list file parser.add_argument('-b', '--black-list', nargs='+', action='store', dest='black_list_files', default="", @@ -891,6 +284,10 @@ def str2bool(v): parser.add_argument("--run-TSNE", type=str2bool, nargs='?', action='store', default=True, dest="run_TSNE", help="whether to run TSNE or not after the embedding, default is true.") + + parser.add_argument("--parallel", nargs='?', action='store', + default="1", dest="parallel_count", + help="How many processes to be run in same time, default is 1.") # cache config parser.add_argument("--use-cache", type=str2bool, nargs='?', action='store', default=False, dest="use_cache", @@ -906,7 +303,8 @@ def str2bool(v): # query server parser.add_argument("--query-server", nargs='?', action='store', default="", dest="query_server", - help="sparql query endpoint used for test_format input files, default is https://query.wikidata.org/sparql" + help="sparql query endpoint used for test_format input files, default is " + "https://query.wikidata.org/sparql " ) diff --git a/kgtk/gt/embedding_utils.py b/kgtk/gt/embedding_utils.py new file mode 100644 index 000000000..f4fb45f5f --- /dev/null +++ b/kgtk/gt/embedding_utils.py @@ -0,0 +1,676 @@ +import logging +import re +import redis +import typing +import hashlib +import pandas as pd # type: ignore +import numpy as np +import math +import pickle +import os +import time + +from pyrallel import ParallelProcessor +from sklearn.manifold import TSNE # type: ignore +from tqdm import tqdm # type: ignore +from ast import literal_eval +from sentence_transformers import SentenceTransformer, SentencesDataset, LoggingHandler, losses, models # type: ignore +from collections import defaultdict +from SPARQLWrapper import SPARQLWrapper, JSON, POST, URLENCODED # type: ignore +from kgtk.exceptions import KGTKException + + +class EmbeddingVector: + def __init__(self, model_name=None, query_server=None, cache_config: dict = None, parallel_count=1): + self._logger = logging.getLogger(__name__) + if model_name is None: + self.model_name = 'bert-base-nli-mean-tokens' + # xlnet need to be trained before using, we can't use this for now + # elif model_name == "xlnet-base-cased": + # word_embedding_model = models.XLNet('xlnet-base-cased') + # # Apply mean pooling to get one fixed sized sentence vector + # pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), + # pooling_mode_mean_tokens=True, + # pooling_mode_cls_token=False, + # pooling_mode_max_tokens=False) + # self.model = SentenceTransformer(modules=[word_embedding_model, pooling_model]) + else: + self.model_name = model_name + self._logger.info("Using model {}".format(self.model_name)) + self.model = SentenceTransformer(self.model_name) + # setup redis cache server + if query_server is None or query_server == "": + self.wikidata_server = "https://query.wikidata.org/sparql" + else: + self.wikidata_server = query_server + if cache_config and cache_config.get("use_cache", False): + host = cache_config.get("host", "dsbox01.isi.edu") + port = cache_config.get("port", 6379) + self.redis_server = redis.Redis(host=host, port=port, db=0) + try: + _ = self.redis_server.get("foo") + self._logger.debug("Cache server {}:{} connected!".format(host, port)) + except: + self._logger.error("Cache server {}:{} is not able to be connected! 
Will not use cache!".format(host, port)) + self.redis_server = None + else: + self.redis_server = None + self._parallel_count = int(parallel_count) + self._logger.debug("Running with {} processes.".format(parallel_count)) + self.qnodes_descriptions = dict() + self.vectors_map = dict() + self.property_labels_dict = dict() + self.q_node_to_label = dict() + self.node_labels = dict() + self.vectors_2D = None + self.vector_dump_file = None + self.gt_nodes = set() + self.candidates = defaultdict(dict) + self.metadata = [] + self.gt_indexes = set() + self.input_format = "" + self.token_pattern = re.compile(r"(?u)\b\w\w+\b") + + def get_sentences_embedding(self, sentences: typing.List[str], qnodes: typing.List[str]): + """ + transform a list of sentences to embedding vectors + """ + + if self.redis_server is not None: + sentence_embeddings = [] + for each_node, each_sentence in zip(qnodes, sentences): + query_cache_key = each_node + each_sentence + if self.model_name != "bert-base-wikipedia-sections-mean-tokens": + query_cache_key += self.model_name + cache_res = self.redis_server.get(query_cache_key) + if cache_res is not None: + sentence_embeddings.append(literal_eval(cache_res.decode("utf-8"))) + # self._logger.error("{} hit!".format(each_node+each_sentence)) + else: + each_embedding = self.model.encode([each_sentence], show_progress_bar=False) + sentence_embeddings.extend(each_embedding) + self.redis_server.set(query_cache_key, str(each_embedding[0].tolist())) + else: + sentence_embeddings = self.model.encode(sentences, show_progress_bar=False) + return sentence_embeddings + + def send_sparql_query(self, query_body: str): + """ + a simple wrap to send the query and return the returned results + """ + qm = SPARQLWrapper(self.wikidata_server) + qm.setReturnFormat(JSON) + qm.setMethod(POST) + qm.setRequestMethod(URLENCODED) + self._logger.debug("Sent query is:") + self._logger.debug(str(query_body)) + qm.setQuery(query_body) + try: + results = qm.query().convert()['results']['bindings'] + return results + except Exception as e: + error_message = ("Sending Sparql query to {} failed!".format(self.wikidata_server)) + self._logger.error(error_message) + self._logger.debug(e, exc_info=True) + raise KGTKException(error_message) + + def _get_labels(self, nodes: typing.List[str]): + query_nodes = " ".join(["wd:{}".format(each) for each in nodes]) + query = """ + select ?item ?nodeLabel + where { + values ?item {""" + query_nodes + """} + ?item rdfs:label ?nodeLabel. + FILTER(LANG(?nodeLabel) = "en"). + } + """ + results2 = self.send_sparql_query(query) + for each_res in results2: + node_id = each_res['item']['value'].split("/")[-1] + value = each_res['nodeLabel']['value'] + self.node_labels[node_id] = value + + def _get_labels_and_descriptions(self, query_qnodes: str, need_find_label: bool, need_find_description: bool): + query_body = """ + select ?item ?itemDescription ?itemLabel + where { + values ?item {""" + query_qnodes + """ } + SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". 
} + } + """ + results = self.send_sparql_query(query_body) + for each in results: + each_node = each['item']['value'].split("/")[-1] + if 'itemDescription' in each: + description = each['itemDescription']['value'] + else: + description = "" + if "itemLabel" in each: + label = each['itemLabel']['value'] + else: + label = "" + if need_find_label: + self.candidates[each_node]["label_properties"] = [label] + if need_find_description: + self.candidates[each_node]["description_properties"] = [description] + + def _get_property_values(self, query_qnodes, query_part_names, query_part_properties): + used_p_node_ids = set() + for part_name, part in zip(query_part_names, query_part_properties): + if part_name == "isa_properties": + self._get_labels(part) + for i, each in enumerate(part): + if each not in {"label", "description", "all"}: + query_body2 = """ + select ?item ?eachPropertyLabel + where {{ + values ?item {{{all_nodes}}} + ?item wdt:{qnode} ?eachProperty. + SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }} + }} + """.format(all_nodes=query_qnodes, qnode=each) + results2 = self.send_sparql_query(query_body2) + + for each_res in results2: + node_id = each_res['item']['value'].split("/")[-1] + value = each_res['eachPropertyLabel']['value'] + if part_name == "isa_properties" and self.node_labels[each].endswith("of"): + value = self.node_labels[each] + "||" + value + used_p_node_ids.add(node_id) + if part_name in self.candidates[node_id]: + self.candidates[node_id][part_name].add(value) + else: + self.candidates[node_id][part_name] = {value} + return used_p_node_ids + + def _get_all_properties(self, query_qnodes, used_p_node_ids, properties_list): + has_properties_set = set(properties_list[3]) + query_body3 = """ + select DISTINCT ?item ?p_entity ?p_entityLabel + where { + values ?item {""" + query_qnodes + """} + ?item ?p ?o. + FILTER regex(str(?p), "^http://www.wikidata.org/prop/P", "i") + BIND (IRI(REPLACE(STR(?p), "http://www.wikidata.org/prop", "http://www.wikidata.org/entity")) AS ?p_entity) . + SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". 
} + } + """ + results3 = self.send_sparql_query(query_body3) + for each in results3: + node_name = each['item']['value'].split("/")[-1] + p_node_id = each['p_entity']['value'].split("/")[-1] + p_node_label = each['p_entityLabel']['value'] + if p_node_id not in used_p_node_ids: + if properties_list[3] == ["all"] or p_node_id in has_properties_set: + if "has_properties" in self.candidates[node_name]: + self.candidates[node_name]["has_properties"].add(p_node_label) + else: + self.candidates[node_name]["has_properties"] = {p_node_label} + + def get_item_description(self, qnodes: typing.List[str] = None, target_properties: dict = {}): + """ + use sparql query to get the descriptions of given Q nodes + """ + if qnodes is None: + qnodes = self.candidates + if "all" in target_properties: + find_all_properties = True + else: + find_all_properties = False + properties_list = [[] for _ in range(4)] + names = ["labels", "descriptions", "isa_properties", "has_properties"] + for k, v in target_properties.items(): + if v == "label_properties": + properties_list[0].append(k) + elif v == "description_properties": + properties_list[1].append(k) + elif v == "isa_properties": + properties_list[2].append(k) + elif v == "has_properties": + properties_list[3].append(k) + + hash_generator = hashlib.md5() + hash_generator.update(str(properties_list).encode('utf-8')) + properties_list_hash = "||" + str(hash_generator.hexdigest()) + + sentences_cache_dict = {} + if self.redis_server is not None: + for each_node in qnodes: + cache_key = each_node + properties_list_hash + cache_res = self.redis_server.get(cache_key) + self._logger.debug("Cached key is: {}".format(cache_key)) + if cache_res is not None: + self._logger.debug("Cache hitted {}".format(cache_key)) + sentences_cache_dict[each_node] = cache_res.decode("utf-8") + + self._logger.debug("Cached for those nodes {} / {}".format(len(sentences_cache_dict), len(qnodes))) + self._logger.debug(str(set(sentences_cache_dict.keys()))) + self._logger.debug("Need run query for those nodes {} / {}:".format(len(qnodes) - len(sentences_cache_dict), len(qnodes))) + + # we do not need to get those node again + if len(sentences_cache_dict) > 0: + qnodes = set(qnodes) - set(sentences_cache_dict.keys()) + self._logger.debug(str(qnodes)) + + # only need to do query when we still have remained nodes + if len(qnodes) > 0: + need_find_label = "label" in properties_list[0] + need_find_description = "description" in properties_list[1] + query_qnodes = "" + for each in qnodes: + query_qnodes += "wd:{} ".format(each) + + # this is used to get corresponding labels / descriptions + if need_find_label or need_find_description: + self._get_labels_and_descriptions(query_qnodes, need_find_label, need_find_description) + + if len(properties_list[3]) > len(qnodes): + # in this condition, we have too many properties need to be queried, it will waste time + # query to get all properties then filtering would save more times + find_all_properties = True + query_part2_names = names[:3] + query_part2_properties = properties_list[:3] + else: + query_part2_names = names + query_part2_properties = properties_list + # this is used to get corresponding labels of properties values + used_p_node_ids = self._get_property_values(query_qnodes, query_part2_names, query_part2_properties) + + # if need get all properties, we need to run extra query + if find_all_properties: + self._get_all_properties(query_qnodes, used_p_node_ids, properties_list) + + for each_node_id in qnodes: + each_sentence = 
self.attribute_to_sentence(self.candidates[each_node_id], each_node_id) + self.candidates[each_node_id]["sentence"] = each_sentence + # add to cache + if self.redis_server is not None: + response = self.redis_server.set(each_node_id + properties_list_hash, each_sentence) + if response: + self._logger.debug("Pushed cache for {} success.".format(each_node_id + properties_list_hash)) + + for each_node_id, sentence in sentences_cache_dict.items(): + self.candidates[each_node_id]["sentence"] = sentence + + def _process_one(self, args): + """ + one process for multiprocess calling + :param args: + :return: + """ + node_id = args["node_id"] + each_node_attributes = args["attribute"] + concated_sentence = self.attribute_to_sentence(each_node_attributes, node_id) + vectors = self.get_sentences_embedding([concated_sentence], [node_id])[0] + return {"v_" + node_id: vectors, "c_" + node_id: each_node_attributes} + + def _multiprocess_collector(self, data): + for k, v in data.items(): + if k.startswith("v_"): + k = k.replace("v_", "") + self.vectors_map[k] = v + else: + k = k.replace("c_", "") + self.candidates[k] = v + + def read_input(self, file_path: str, skip_nodes_set: set = None, + input_format: str = "kgtk_format", target_properties: dict = {}, + property_labels_dict: dict = {}, black_list_set: set = set() + ): + """ + load the input candidates files + """ + self.property_labels_dict = property_labels_dict + + if input_format == "test_format": + self.input_format = input_format + input_df = pd.read_csv(file_path) + gt = {} + count = 0 + if "GT_kg_id" in input_df.columns: + gt_column_id = "GT_kg_id" + elif "kg_id" in input_df.columns: + gt_column_id = "kg_id" + else: + raise KGTKException("Can't find ground truth id column! It should either named as `GT_kg_id` or `kg_id`") + + for _, each in input_df.iterrows(): + if isinstance(each["candidates"], str): + temp = str(each['candidates']).split("|") + elif each['candidates'] is np.nan or math.isnan(each['candidates']): + temp = [] + + to_remove_q = set() + if each[gt_column_id] is np.nan: + self._logger.warning("Ignore NaN gt value form {}".format(str(each))) + each[gt_column_id] = "" + gt_nodes = each[gt_column_id].split(" ") + label = str(each["label"]) + if len(gt_nodes) == 0: + self._logger.error("Skip a row with no ground truth node given: as {}".format(str(each))) + continue + if label == "": + self._logger.error("Skip a row with no label given: as {}".format(str(each))) + continue + temp.extend(gt_nodes) + + for each_q in temp: + self.q_node_to_label[each_q] = label + if skip_nodes_set is not None and each_q in skip_nodes_set: + to_remove_q.add(each_q) + temp = set(temp) - to_remove_q + count += len(temp) + self.gt_nodes.add(each[gt_column_id]) + self.get_item_description(temp, target_properties) + + self._logger.info("Totally {} rows with {} candidates loaded.".format(str(len(gt)), str(count))) + + elif input_format == "kgtk_format": + # assume the input edge file is sorted + if "all" in target_properties: + _ = target_properties.pop("all") + add_all_properties = True + else: + add_all_properties = False + + self.input_format = input_format + with open(file_path, "r") as f: + # get header + headers = f.readline().replace("\n", "").split("\t") + if len(headers) < 3: + raise KGTKException( + "No enough columns found on given input file. 
Only {} columns given but at least 3 needed.".format(
+                        len(headers)))
+                elif "node" in headers and "property" in headers and "value" in headers:
+                    column_references = {"node": headers.index("node"),
+                                         "property": headers.index("property"),
+                                         "value": headers.index("value")}
+                elif len(headers) == 3:
+                    column_references = {"node": 0,
+                                         "property": 1,
+                                         "value": 2}
+                else:
+                    missing_column = {"node", "property", "value"} - set(headers)
+                    raise KGTKException("Missing column {}".format(missing_column))
+                self._logger.debug("column index information: ")
+                self._logger.debug(str(column_references))
+                # read contents
+                each_node_attributes = {"has_properties": [], "isa_properties": [], "label_properties": [],
+                                        "description_properties": []}
+                current_process_node_id = None
+
+                if self._parallel_count > 1:
+                    pp = ParallelProcessor(self._parallel_count, self._process_one, collector=self._multiprocess_collector)
+                    pp.start()
+
+                for each_line in f:
+                    each_line = each_line.replace("\n", "").split("\t")
+                    node_id = each_line[column_references["node"]]
+                    node_property = each_line[column_references["property"]]
+                    node_value = each_line[column_references["value"]]
+                    # remove @ mark
+                    if "@" in node_value and node_value[0] != "@":
+                        node_value_org = node_value
+                        node_value = node_value[:node_value.index("@")]
+
+                    # remove extra double quote " and single quote '
+                    if node_value[0] == '"' and node_value[-1] == '"':
+                        node_value = node_value[1:-1]
+                    if node_value[0] == "'" and node_value[-1] == "'":
+                        node_value = node_value[1:-1]
+
+                    if current_process_node_id != node_id:
+                        if current_process_node_id is None:
+                            current_process_node_id = node_id
+                        else:
+                            # when we reach the next id, concatenate all properties into one sentence to represent the Q node
+
+                            # for multi process
+                            if self._parallel_count > 1:
+                                each_arg = {"node_id": current_process_node_id, "attribute": each_node_attributes}
+                                pp.add_task(each_arg)
+                            # for single process
+                            else:
+                                concated_sentence = self.attribute_to_sentence(each_node_attributes, current_process_node_id)
+                                each_node_attributes["sentence"] = concated_sentence
+                                self.candidates[current_process_node_id] = each_node_attributes
+
+                            # after writing finishes, clear the buffer and start parsing the next node
+                            each_node_attributes = {"has_properties": [], "isa_properties": [], "label_properties": [],
+                                                    "description_properties": []}
+                            # update to new id
+                            current_process_node_id = node_id
+
+                    if node_property in target_properties:
+                        each_node_attributes[target_properties[node_property]].append(node_value)
+                    if add_all_properties and each_line[column_references["value"]][0] == "P":
+                        each_node_attributes["has_properties"].append(node_value)
+
+                # close multiprocess pool
+                if self._parallel_count > 1:
+                    pp.task_done()
+                    pp.join()
+        else:
+            raise KGTKException("Unknown input format {}".format(input_format))
+
+        self._logger.info("Totally {} Q nodes loaded.".format(len(self.candidates)))
+        self.vector_dump_file = "dump_vectors_{}_{}.pkl".format(file_path[:file_path.rfind(".")], self.model_name)
+        # self._logger.debug("The cache file name will be {}".format(self.vector_dump_file))
+
+    def get_real_label_name(self, node):
+        if node in self.property_labels_dict:
+            return self.property_labels_dict[node]
+        else:
+            return node
+
+    def attribute_to_sentence(self, attribute_dict: dict, node_id=None):
+        concated_sentence = ""
+        have_isa_properties = False
+        # sort the properties to ensure the sentence is always the same
+        attribute_dict = {key: sorted(list(value)) for key, value in attribute_dict.items() if len(value)
> 0} + if "label_properties" in attribute_dict and len(attribute_dict["label_properties"]) > 0: + concated_sentence += self.get_real_label_name(attribute_dict["label_properties"][0]) + if "description_properties" in attribute_dict and len(attribute_dict["description_properties"]) > 0: + if concated_sentence != "" and attribute_dict["description_properties"][0] != "": + concated_sentence += ", " + concated_sentence += self.get_real_label_name(attribute_dict["description_properties"][0]) + if "isa_properties" in attribute_dict and len(attribute_dict["isa_properties"]) > 0: + have_isa_properties = True + temp = "" + for each in attribute_dict["isa_properties"]: + each = self.get_real_label_name(each) + if "||" in each: + if "instance of" in each: + each = each.split("||")[1] + else: + each = each.replace("||", " ") + temp += each + ", " + if concated_sentence != "" and temp != "": + concated_sentence += " is a " + elif concated_sentence == "": + concated_sentence += "It is a " + concated_sentence += temp[:-2] + if "has_properties" in attribute_dict and len(attribute_dict["has_properties"]) > 0: + temp = [self.get_real_label_name(each) for each in attribute_dict["has_properties"]] + if concated_sentence != "" and temp[0] != "": + if have_isa_properties: + concated_sentence += ", and has " + else: + concated_sentence += " has " + elif temp[0] != "": + concated_sentence += "It has " + concated_sentence += " and ".join(temp) + self._logger.debug("Transform node {} --> {}".format(node_id, concated_sentence)) + return concated_sentence + + def get_vectors(self): + """ + main function to get the vector representations of the descriptions + """ + if self._parallel_count == 1: + start_all = time.time() + self._logger.info("Now generating embedding vector.") + for q_node, each_item in tqdm(self.candidates.items()): + # do process for each row(one target) + sentence = each_item["sentence"] + if isinstance(sentence, bytes): + sentence = sentence.decode("utf-8") + vectors = self.get_sentences_embedding([sentence], [q_node]) + self.vectors_map[q_node] = vectors[0] + self._logger.info("Totally used {} seconds.".format(str(time.time() - start_all))) + else: + # Skip get vector function because we already get them + pass + + def dump_vectors(self, file_name, type_=None): + if file_name.endswith(".pkl"): + file_name = file_name.replace(".pkl", "") + if type_ == "2D": + with open(file_name + ".pkl", "wb") as f: + pickle.dump(self.vectors_2D, f) + dimension = len(self.vectors_2D[0]) + with open(file_name + ".tsv", "w") as f: + for each in self.vectors_2D: + for i, each_val in enumerate(each): + _ = f.write(str(each_val)) + if i != dimension - 1: + _ = f.write("\t") + _ = f.write("\n") + elif type_ == "metadata": + with open(file_name + "_metadata.tsv", "w") as f: + for each in self.metadata: + _ = f.write(each + "\n") + else: + with open(file_name + ".pkl", "wb") as f: + pickle.dump(self.vectors_map, f) + with open(file_name + ".tsv", "w") as f: + for each in self.vectors_map.values(): + for i in each: + _ = f.write(str(i) + "\t") + _ = f.write("\n") + + def print_vector(self, vectors, output_properties: str = "text_embedding", output_format="kgtk_format"): + if output_format == "kgtk_format": + print("node\tproperty\tvalue\n", end="") + if self.input_format == "kgtk_format": + for i, each_vector in enumerate(vectors): + print(str(list(self.candidates.keys())[i]) + "\t", end="") + print(output_properties + "\t", end="") + for j, each_dimension in enumerate(each_vector): + if j != len(each_vector) - 1: + 
print(str(each_dimension) + ",", end="") + else: + print(str(each_dimension) + "\n", end="") + elif self.input_format == "test_format": + all_nodes = list(self.vectors_map.keys()) + for i, each_vector in enumerate(vectors): + print(all_nodes[i] + "\t", end="") + print(output_properties + "\t", end="") + for j, each_dimension in enumerate(each_vector): + if j != len(each_vector) - 1: + print(str(each_dimension) + ",", end="") + else: + print(str(each_dimension) + "\n", end="") + + elif output_format == "tsv_format": + for each_vector in vectors: + for i, each_dimension in enumerate(each_vector): + if i != len(each_vector) - 1: + print(str(each_dimension) + "\t", end="") + else: + print(str(each_dimension) + "\n", end="") + + def plot_result(self, output_properties={}, input_format="kgtk_format", + output_uri: str = "", output_format="kgtk_format", + run_TSNE=True + ): + """ + transfer the vectors to lower dimension so that we can plot + Then save the 2D vector file for further purpose + """ + self.vectors_map = {k: v for k, v in sorted(self.vectors_map.items(), key=lambda item: item[0], reverse=True)} + vectors = list(self.vectors_map.values()) + # use TSNE to reduce dimension + if run_TSNE: + self._logger.warning("Start running TSNE to reduce dimension. It will take a long time.") + start = time.time() + self.vectors_2D = TSNE(n_components=2, random_state=0).fit_transform(vectors) + self._logger.info("Totally used {} seconds.".format(time.time() - start)) + + if input_format == "test_format": + gt_indexes = set() + vector_map_keys = list(self.vectors_map.keys()) + for each_node in self.gt_nodes: + gt_indexes.add(vector_map_keys.index(each_node)) + + self.metadata.append("Q_nodes\tType\tLabel\tDescription") + for i, each in enumerate(self.vectors_map.keys()): + label = self.q_node_to_label[each] + description = self.candidates[each]["sentence"] + if i in gt_indexes: + self.metadata.append("{}\tground_truth_node\t{}\t{}".format(each, label, description)) + else: + self.metadata.append("{}\tcandidates\t{}\t{}".format(each, label, description)) + self.gt_indexes = gt_indexes + + elif input_format == "kgtk_format": + if len(output_properties.get("metatada_properties", [])) == 0: + for k, v in self.candidates.items(): + label = v.get("label_properties", "") + if len(label) > 0 and isinstance(label, list): + label = label[0] + description = v.get("description_properties", "") + if len(description) > 0 and isinstance(description, list): + description = description[0] + self.metadata.append("{}\t\t{}\t{}".format(k, label, description)) + else: + required_properties = output_properties["metatada_properties"] + self.metadata.append("node\t" + "\t".join(required_properties)) + for k, v in self.candidates.items(): + each_metadata = k + "\t" + for each in required_properties: + each_metadata += v.get(each, " ") + "\t" + self.metadata.append(each_metadata) + + metadata_output_path = os.path.join(output_uri, self.vector_dump_file.split("/")[-1]) + if run_TSNE: + self.print_vector(self.vectors_2D, output_properties.get("output_properties"), output_format) + else: + self.print_vector(vectors, output_properties.get("output_properties"), output_format) + if output_uri != "none": + self.dump_vectors(metadata_output_path, "metadata") + + def evaluate_result(self): + """ + for the ground truth nodes, evaluate the average distance to the centroid, the lower the average distance, + the better clustering results should be + """ + centroid = None + gt_nodes_vectors = [] + if len(self.gt_indexes) == 0: + points = 
set(range(len(self.vectors_map)))
+        else:
+            points = self.gt_indexes
+        for i, each in enumerate(self.vectors_map.keys()):
+            if i in points:
+                if centroid is None:
+                    centroid = np.array(self.vectors_map[each])
+                else:
+                    centroid += np.array(self.vectors_map[each])
+                gt_nodes_vectors.append(self.vectors_map[each])
+        centroid = centroid / len(points)
+
+        distance_sum = 0
+        for each in gt_nodes_vectors:
+            distance_sum += self.calculate_distance(each, centroid)
+        self._logger.info("The average distance for the ground truth nodes to the centroid is {}".format(distance_sum / len(points)))
+
+    @staticmethod
+    def calculate_distance(a, b):
+        if len(a) != len(b):
+            raise KGTKException("Vector dimensions are different!")
+        dist = 0
+        for v1, v2 in zip(a, b):
+            dist += (v1 - v2) ** 2
+        dist = dist ** 0.5
+        return dist

From 91f7b03a716a0eccc7f41bac808539e4be173c80 Mon Sep 17 00:00:00 2001
From: ckxz105
Date: Mon, 4 May 2020 17:41:07 -0700
Subject: [PATCH 068/278] update embedding readme with parallel parameter

---
 kgtk/cli/text_embedding_README.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/kgtk/cli/text_embedding_README.md b/kgtk/cli/text_embedding_README.md
index d9e694a17..a05bf3ba7 100644
--- a/kgtk/cli/text_embedding_README.md
+++ b/kgtk/cli/text_embedding_README.md
@@ -23,6 +23,7 @@ kgtk text_embedding \
     --black-list/ -b # optional,default is None
     --logging-level/ -l \ # optional, default is `info`
     --run-TSNE False # optional, default is True
+    --parallel 4 # optional, default is 1
 ```
 ##### Example 1:
 For easiest running, just give the input file as
@@ -147,6 +148,9 @@ First column is the node name.
 Second column is the property name as required, default is `text_embedding`.
 Third column is the embeded vecotrs.
 
+##### parallel
+You can also set the parallel count to a number larger than 1 to run in multiprocess mode. Currently this is only supported for kgtk format input data. For example: `--parallel 4`
+
 ##### Reduced Embedding Vectors
 This will have embedded vectors values after running TSNE and reduced dimension to 2-dimensions for each Q nodes. This is used for visulization. (for example, you can view it at Google's online tools here: http://projector.tensorflow.org/)
 3. Metadata for the generated vectors: This will contains the metadata information for the Q nodes generated from 2 files mentioned above. It will contains the Q node value of each vector, the type (it is a `candidate` or a `ground truth` node), the given label of the Q node and corresponding fetched description information from wikidata.

From 94a4c100e7ee58befad1ecc27659d5d2d51a34b3 Mon Sep 17 00:00:00 2001
From: Craig Milo Rogers
Date: Mon, 4 May 2020 17:41:14 -0700
Subject: [PATCH 069/278] Process minimum and maximum year limits.
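The new --minimum-valid-year/--maximum-valid-year options flow through KgtkValueOptions into the
date/time validator. A minimal sketch of the resulting year-range check, assuming only the default
limits defined in KgtkValueOptions; the regular expression below is a simplified stand-in for the
full date-and-times pattern, not the code in kgtkvalue.py:

    import re

    MINIMUM_VALID_YEAR = 1583  # Per ISO 8601, years before this one require special agreement.
    MAXIMUM_VALID_YEAR = 2100  # Arbitrarily chosen.

    # Simplified stand-in: examine only the leading year of a KGTK
    # date-and-times value such as "^1960-11-05T00:00".
    year_re = re.compile(r"^\^(?P<year>[-+]?[0-9]{4})")

    def year_in_range(value: str,
                      minimum: int = MINIMUM_VALID_YEAR,
                      maximum: int = MAXIMUM_VALID_YEAR) -> bool:
        m = year_re.match(value)
        if m is None:
            return False  # Years are mandatory in date-and-times values.
        return minimum <= int(m.group("year")) <= maximum

    assert year_in_range("^1960-11-05T00:00")
    assert not year_in_range("^1492-10-12T00:00")  # Below the default floor.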
--- kgtk/cli/validate.py | 6 +++- kgtk/join/kgtkvalue.py | 52 +++++++++++++++++++++++++++++++---- kgtk/join/kgtkvalueoptions.py | 27 ++++++++++++++---- 3 files changed, 74 insertions(+), 11 deletions(-) diff --git a/kgtk/cli/validate.py b/kgtk/cli/validate.py index a823bcb5a..074f56aed 100644 --- a/kgtk/cli/validate.py +++ b/kgtk/cli/validate.py @@ -153,6 +153,8 @@ def run(kgtk_files: typing.Optional[typing.List[typing.Optional[Path]]], allow_lax_strings: bool = False, allow_lax_lq_strings: bool = False, allow_month_or_day_zero: bool = False, + minimum_valid_year: int = KgtkValueOptions.MINIMUM_VALID_YEAR, + maximum_valid_year: int = KgtkValueOptions.MAXIMUM_VALID_YEAR, compression_type: typing.Optional[str] = None, gzip_in_parallel: bool = False, gzip_queue_size: int = KgtkReader.GZIP_QUEUE_SIZE_DEFAULT, @@ -176,7 +178,9 @@ def run(kgtk_files: typing.Optional[typing.List[typing.Optional[Path]]], allow_lax_strings=allow_lax_strings, allow_lax_lq_strings=allow_lax_lq_strings, allow_language_suffixes=allow_language_suffixes, - additional_language_codes=additional_language_codes) + additional_language_codes=additional_language_codes, + minimum_valid_year=minimum_valid_year, + maximum_valid_year=maximum_valid_year) try: kgtk_file: typing.Optional[Path] diff --git a/kgtk/join/kgtkvalue.py b/kgtk/join/kgtkvalue.py index 2d95bd4aa..7f86a79f7 100644 --- a/kgtk/join/kgtkvalue.py +++ b/kgtk/join/kgtkvalue.py @@ -448,11 +448,40 @@ def is_date_and_times(self, idx: typing.Optional[int] = None)->bool: v: str = self.get_item(idx) return v.startswith("^") - # This pattern allows month 00 and day 00, which are excluded by ISO 8601. - lax_date_and_times_re: typing.Pattern = re.compile(r"^\^(?P[0-9]{4})(?:(?P-)?(?P1[0-2]|0[0-9])(?:(?(hyphen)-)(?P3[01]|0[0-9]|[12][0-9])))T(?P2[0-3]|[01][0-9])(?:(?(hyphen):)(?P[0-5][0-9])(?:(?(hyphen):)(?P[0-5][0-9])))(?PZ|\[-+][0-9][0-9](?::[0-9][0-9])?)?(?P/[0-1]?[0-9])?$") + year_pat: str = r'(?P[-+]?[0-9]{4})' + + hour_pat: str = r'(?P2[0-3]|[01][0-9])' + minutes_pat: str = r'(?P[0-5][0-9])' + seconds_pat: str = r'(?P[0-5][0-9])' + zone_pat: str = r'(?PZ|\[-+][0-9][0-9](?::[0-9][0-9]))' + time_pat: str = r'(?:{hour}(?:(?(hyphen):){minutes}(?:(?(hyphen):){seconds})?)?{zone}?)'.format(hour=hour_pat, + minutes=minutes_pat, + seconds=seconds_pat, + zone=zone_pat) - strict_date_and_times_re: typing.Pattern = re.compile(r"^\^(?P[0-9]{4})(?:(?P-)?(?P1[0-2]|0[1-9])(?:(?(hyphen)-)(?P3[01]|0[1-9]|[12][0-9])))T(?P2[0-3]|[01][0-9])(?:(?(hyphen):)(?P[0-5][0-9])(?:(?(hyphen):)(?P[0-5][0-9])))(?PZ|\[-+][0-9][0-9](?::[0-9][0-9])?)?(?P/[0-1]?[0-9])?$") + precision_pat: str = r'(?P/[0-1]?[0-9])' + # This pattern allows month 00 and day 00, which are excluded by ISO 8601. 
+ lax_month_pat: str = r'(?P1[0-2]|0[0-9])' + lax_day_pat: str = r'(?P3[01]|0[0-9]|[12][0-9])' + lax_date_pat: str = r'(?:{year}(?:(?P-)?{month}?(?:(?(hyphen)-){day})?)?)'.format(year=year_pat, + month=lax_month_pat, + day=lax_day_pat) + lax_date_and_times_pat: str = r'(?:\^{date}(?:T{time}{precision}?)?)'.format(date=lax_date_pat, + time=time_pat, + precision=precision_pat) + lax_date_and_times_re: typing.Pattern = re.compile(r'^{date_and_times}$'.format(date_and_times=lax_date_and_times_pat)) + + strict_month_pat: str = r'(?P1[0-2]|0[1-9])' + strict_day_pat: str = r'(?P3[01]|0[1-9]|[12][0-9])' + strict_date_pat: str = r'(?:{year}(?:(?P-)?{month}?(?:(?(hyphen)-){day})?)?)'.format(year=year_pat, + month=strict_month_pat, + day=strict_day_pat) + strict_date_and_times_pat: str = r'(?:\^{date}(?:T{time}{precision}?)?)'.format(date=strict_date_pat, + time=time_pat, + precision=precision_pat) + strict_date_and_times_re: typing.Pattern = re.compile(r'^{date_and_times}$'.format(date_and_times=strict_date_and_times_pat)) + def is_valid_date_and_times(self, idx: typing.Optional[int] = None)->bool: """ Return False if this value is a list and idx is None. @@ -466,7 +495,7 @@ def is_valid_date_and_times(self, idx: typing.Optional[int] = None)->bool: YYYY-MM-DD Valid date and time formats - YYMMDDTHH + YYYYMMDDTHH YYYY-MM-DDTHH YYMMDDTHHMM YYYY-MM-DDTHH:MM @@ -511,7 +540,20 @@ def is_valid_date_and_times(self, idx: typing.Optional[int] = None)->bool: m = KgtkValue.lax_date_and_times_re.match(v) else: m = KgtkValue.strict_date_and_times_re.match(v) - return m is not None + if m is None: + return False + year_str: str = m.group("year") + if year_str is None or len(year_str) == 0: + return False + try: + year: int = int(year_str) + except ValueError: + return False + if year < self.options.minimum_valid_year: + return False + if year > self.options.maximum_valid_year: + return False + return True def is_extension(self, idx: typing.Optional[int] = None)->bool: """ diff --git a/kgtk/join/kgtkvalueoptions.py b/kgtk/join/kgtkvalueoptions.py index cd07e5aa0..c6383e793 100644 --- a/kgtk/join/kgtkvalueoptions.py +++ b/kgtk/join/kgtkvalueoptions.py @@ -13,6 +13,10 @@ class KgtkValueOptions: seperate class for efficiency. """ + # The default minimum and maximum valid year values. + MINIMUM_VALID_YEAR: int = 1583 # Per ISO 8601, years before this one require special agreement. + MAXIMUM_VALID_YEAR: int = 2100 # Arbitrarily chosen. + # Allow month 00 or day 00 in dates? This isn't really allowed by ISO # 8601, but appears in wikidata. allow_month_or_day_zero: bool = attr.ib(validator=attr.validators.instance_of(bool), default=False) @@ -31,10 +35,15 @@ class KgtkValueOptions: # If this list gets long, we may want to turn it into a map to make lookup # more efficient. - additional_language_codes: typing.Optional[typing.List[str]] = attr.ib(validator=attr.validators.optional(attr.validators.deep_iterable(member_validator=attr.validators.instance_of(str), - iterable_validator=attr.validators.instance_of(list))), - default=None) - + # + # TODO: fix this validation + # additional_language_codes: typing.Optional[typing.List[str]] = attr.ib(validator=attr.validators.deep_iterable(member_validator=attr.validators.instance_of(str), + # iterable_validator=attr.validators.instance_of(list)))), + additional_language_codes: typing.Optional[typing.List[str]] = attr.ib(default=None) + + # Minimum and maximum year range in dates. 
+ minimum_valid_year: int = attr.ib(validator=attr.validators.instance_of(int), default=MINIMUM_VALID_YEAR) + maximum_valid_year: int = attr.ib(validator=attr.validators.instance_of(int), default=MAXIMUM_VALID_YEAR) @classmethod def add_arguments(cls, parser: ArgumentParser): @@ -69,6 +78,12 @@ def add_arguments(cls, parser: ArgumentParser): md0group.add_argument( "--disallow-month-or-day-zero", dest="allow_month_or_day_zero", help="Allow month or day zero in dates.", action='store_false') + parser.add_argument( "--minimum-valid-year", dest="minimum_valid_year", + help="The minimum valid year in dates.", type=int, default=cls.MINIMUM_VALID_YEAR) + + parser.add_argument( "--maximum-valid-year", dest="maximum_valid_year", + help="The maximum valid year in dates.", type=int, default=cls.MAXIMUM_VALID_YEAR) + @classmethod # Build the value parsing option structure. def from_args(cls, args: Namespace)->'KgtkValueOptions': @@ -76,7 +91,9 @@ def from_args(cls, args: Namespace)->'KgtkValueOptions': allow_language_suffixes=args.allow_language_suffixes, allow_lax_strings=args.allow_lax_strings, allow_lax_lq_strings=args.allow_lax_lq_strings, - additional_language_codes=args.additional_language_codes) + additional_language_codes=args.additional_language_codes, + minimum_valid_year=args.minimum_valid_year, + maximum_valid_year=args.maximum_valid_year) DEFAULT_KGTK_VALUE_OPTIONS: KgtkValueOptions = KgtkValueOptions() From 479c650b603cc718f1aa4845796f5d03a7f7d745 Mon Sep 17 00:00:00 2001 From: saggu Date: Mon, 4 May 2020 18:00:06 -0700 Subject: [PATCH 070/278] update order of columns --- kgtk/cli/gt_loader.py | 34 +++++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/kgtk/cli/gt_loader.py b/kgtk/cli/gt_loader.py index b78167725..00fa84404 100644 --- a/kgtk/cli/gt_loader.py +++ b/kgtk/cli/gt_loader.py @@ -9,6 +9,20 @@ def parser(): } +def convert_scientific_notation(num): + if isinstance(num, float): + num = str(num) + if 'e' in num: + vals = num.split('e') + formatter = int(vals[1].replace('-', '')) + 2 + try: + return "{:.{formatter}f}".format(float(num), formatter=formatter) + except: + print(num, vals, formatter) + raise + return num + + def add_arguments(parser): """ Parse arguments @@ -136,16 +150,15 @@ def infer_predicate(h, options=[]): for n_id, n_label, authority in main_auth: writer.write('%s\t%s\t%f\n' % (n_id, n_label, authority)) - sys.stdout.write('id\tnode1\tproperty\tnode2\n') + sys.stdout.write('node1\tproperty\tnode2\tid\n') id_count = 0 if not output_stats: for e in G2.edges(): sid, oid = e lbl = G2.ep[predicate][e] sys.stdout.write( - '%s\t%s\t%s\t%s\n' % ( - '{}-{}-{}'.format(G2.vp[id_col][sid], lbl, id_count), G2.vp[id_col][sid], lbl, - G2.vp[id_col][oid])) + '%s\t%s\t%s\t%s\n' % (G2.vp[id_col][sid], lbl, G2.vp[id_col][oid], + '{}-{}-{}'.format(G2.vp[id_col][sid], lbl, id_count))) id_count += 1 id_count = 0 @@ -153,20 +166,19 @@ def infer_predicate(h, options=[]): v_id = G2.vp[id_col][v] sys.stdout.write( - '{}\t{}\t{}\t{}\n'.format('{}-{}-{}'.format(v_id, vertex_in_degree, id_count), v_id, - vertex_in_degree, v.in_degree())) + '{}\t{}\t{}\t{}\n'.format(v_id, vertex_in_degree, v.in_degree(), + '{}-{}-{}'.format(v_id, vertex_in_degree, id_count))) id_count += 1 sys.stdout.write( - '{}\t{}\t{}\t{}\n'.format('{}-{}-{}'.format(v_id, vertex_out_degree, id_count), v_id, - vertex_out_degree, v.out_degree())) + '{}\t{}\t{}\t{}\n'.format(v_id, vertex_out_degree, v.out_degree(), + '{}-{}-{}'.format(v_id, vertex_out_degree, id_count))) id_count += 1 
for vprop in G2.vertex_properties.keys(): if vprop == id_col: continue sys.stdout.write( - '%s\t%s\t%s\t%s\n' % ( - '{}-{}-{}'.format(v_id, v_prop_dict[vprop], id_count), v_id, v_prop_dict[vprop], - G2.vp[vprop][v])) + '%s\t%s\t%s\t%s\n' % (v_id, v_prop_dict[vprop], convert_scientific_notation(G2.vp[vprop][v]), + '{}-{}-{}'.format(v_id, v_prop_dict[vprop], id_count))) id_count += 1 if output: From 5c11c71b3f02ac3c3fd62ca3f6ba05b9887e46b6 Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Mon, 4 May 2020 19:10:30 -0700 Subject: [PATCH 071/278] Improve coordinates and date/time parsing. --- kgtk/join/kgtkvalue.py | 85 ++++++++++++++++++++++++++++-------------- 1 file changed, 56 insertions(+), 29 deletions(-) diff --git a/kgtk/join/kgtkvalue.py b/kgtk/join/kgtkvalue.py index 7f86a79f7..df5b0cd16 100644 --- a/kgtk/join/kgtkvalue.py +++ b/kgtk/join/kgtkvalue.py @@ -400,7 +400,8 @@ def is_location_coordinates(self, idx: typing.Optional[int] = None)->bool: return v.startswith("@") #location_coordinates_re: typing.Pattern = re.compile(r"^@(?P[-+]?\d{3}\.\d{5})/(?P[-+]?\d{3}\.\d{5})$") - location_coordinates_re: typing.Pattern = re.compile(r"^@(?P[-+]?(?:\d+(?:\.\d*)?)|(?:\.\d+))/(?P[-+]?(?:\d+(?:\.\d*)?)|(?:\.\d+))$") + degrees_pat: str = r'(?:[-+]?(?:\d+(?:\.\d*)?)|(?:\.\d+))' + location_coordinates_re: typing.Pattern = re.compile(r'^@(?P{degrees})/(?P{degrees})$'.format(degrees=degrees_pat)) def is_valid_location_coordinates(self, idx: typing.Optional[int] = None)->bool: """ @@ -448,40 +449,49 @@ def is_date_and_times(self, idx: typing.Optional[int] = None)->bool: v: str = self.get_item(idx) return v.startswith("^") + # https://en.wikipedia.org/wiki/ISO_8601 + # + # The "lax" patterns allow month 00 and day 00, which are excluded by ISO 8601. + # We will allow those values when requested in the code below. + # + # The first possible hyphen position determines whether we will parse in + # value as a "basic" (no hyphen) or "extended" format date/time. A + # mixture is not permitted: either all hyphens (colons in the time + # section) must be present, or none. + # + # Year-month-day year_pat: str = r'(?P[-+]?[0-9]{4})' - + lax_month_pat: str = r'(?P1[0-2]|0[0-9])' + lax_day_pat: str = r'(?P3[01]|0[0-9]|[12][0-9])' + lax_date_pat: str = r'(?:{year}(?:(?P-)?{month}?(?:(?(hyphen)-){day})?)?)'.format(year=year_pat, + month=lax_month_pat, + day=lax_day_pat) + # hour-minutes-seconds hour_pat: str = r'(?P2[0-3]|[01][0-9])' minutes_pat: str = r'(?P[0-5][0-9])' seconds_pat: str = r'(?P[0-5][0-9])' - zone_pat: str = r'(?PZ|\[-+][0-9][0-9](?::[0-9][0-9]))' + + # NOTE: It might be the case that the ":" before the minutes in the time zone pattern + # should be conditioned upon the hyphen indicator. The Wikipedia article doesn't + # mention this requirement. + # + # NOTE: This pattern accepts a wider range of offsets than actually occur. + # + # TODO: consult the actual standard about the colon. + zone_pat: str = r'(?PZ|[-+][01][0-9](?::?[0-5][0-9])?)' + time_pat: str = r'(?:{hour}(?:(?(hyphen):){minutes}(?:(?(hyphen):){seconds})?)?{zone}?)'.format(hour=hour_pat, minutes=minutes_pat, seconds=seconds_pat, zone=zone_pat) - precision_pat: str = r'(?P/[0-1]?[0-9])' + precision_pat: str = r'(?P[0-1]?[0-9])' - # This pattern allows month 00 and day 00, which are excluded by ISO 8601. 
- lax_month_pat: str = r'(?P1[0-2]|0[0-9])' - lax_day_pat: str = r'(?P3[01]|0[0-9]|[12][0-9])' - lax_date_pat: str = r'(?:{year}(?:(?P-)?{month}?(?:(?(hyphen)-){day})?)?)'.format(year=year_pat, - month=lax_month_pat, - day=lax_day_pat) - lax_date_and_times_pat: str = r'(?:\^{date}(?:T{time}{precision}?)?)'.format(date=lax_date_pat, - time=time_pat, - precision=precision_pat) + lax_date_and_times_pat: str = r'(?:\^{date}(?:T{time})?(?:/{precision})?)'.format(date=lax_date_pat, + time=time_pat, + precision=precision_pat) lax_date_and_times_re: typing.Pattern = re.compile(r'^{date_and_times}$'.format(date_and_times=lax_date_and_times_pat)) - strict_month_pat: str = r'(?P1[0-2]|0[1-9])' - strict_day_pat: str = r'(?P3[01]|0[1-9]|[12][0-9])' - strict_date_pat: str = r'(?:{year}(?:(?P-)?{month}?(?:(?(hyphen)-){day})?)?)'.format(year=year_pat, - month=strict_month_pat, - day=strict_day_pat) - strict_date_and_times_pat: str = r'(?:\^{date}(?:T{time}{precision}?)?)'.format(date=strict_date_pat, - time=time_pat, - precision=precision_pat) - strict_date_and_times_re: typing.Pattern = re.compile(r'^{date_and_times}$'.format(date_and_times=strict_date_and_times_pat)) - def is_valid_date_and_times(self, idx: typing.Optional[int] = None)->bool: """ Return False if this value is a list and idx is None. @@ -535,16 +545,14 @@ def is_valid_date_and_times(self, idx: typing.Optional[int] = None)->bool: return False v: str = self.get_item(idx) - m: typing.Optional[typing.Match] - if self.options.allow_month_or_day_zero: - m = KgtkValue.lax_date_and_times_re.match(v) - else: - m = KgtkValue.strict_date_and_times_re.match(v) + m: typing.Optional[typing.Match] = KgtkValue.lax_date_and_times_re.match(v) if m is None: return False + + # Validate the year: year_str: str = m.group("year") if year_str is None or len(year_str) == 0: - return False + return False # Years are mandatory try: year: int = int(year_str) except ValueError: @@ -553,6 +561,25 @@ def is_valid_date_and_times(self, idx: typing.Optional[int] = None)->bool: return False if year > self.options.maximum_valid_year: return False + + month_str: str = m.group("month") + if month_str is not None: + try: + month: int = int(month_str) + except ValueError: + return False # shouldn't happen + if month == 0 and not self.options.allow_month_or_day_zero: + return False # month 0 was disallowed. + + day_str: str = m.group("day") + if day_str is not None: + try: + day: int = int(day_str) + except ValueError: + return False # shouldn't happen + if day == 0 and not self.options.allow_month_or_day_zero: + return False # day 0 was disallowed. + return True def is_extension(self, idx: typing.Optional[int] = None)->bool: From a32637202cb8354d243646b72e7892274cfa7bc0 Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Mon, 4 May 2020 19:20:32 -0700 Subject: [PATCH 072/278] Prepare the min/max lat/lon values for override. Describe lists a little differently. --- kgtk/join/kgtkvalue.py | 12 ++++++------ kgtk/join/kgtkvalueoptions.py | 17 +++++++++++++---- 2 files changed, 19 insertions(+), 10 deletions(-) diff --git a/kgtk/join/kgtkvalue.py b/kgtk/join/kgtkvalue.py index df5b0cd16..56939b949 100644 --- a/kgtk/join/kgtkvalue.py +++ b/kgtk/join/kgtkvalue.py @@ -418,20 +418,20 @@ def is_valid_location_coordinates(self, idx: typing.Optional[int] = None)->bool: if m is None: return False - # Latitude runs from -90 to +90 + # Latitude normally runs from -90 to +90: latstr: str = m.group("lat") try: lat: float = float(latstr) - if lat < -90. 
or lat > 90.: + if lat < self.options.minimum_valid_lat or lat > self.options.maximum_valid_lat: return False except ValueError: return False - # Longitude runs from -180 to +180 + # Longitude normally runs from -180 to +180: lonstr: str = m.group("lon") try: lon: float = float(lonstr) - if lon < -180. or lon > 180.: + if lon < self.options.minimum_valid_lon or lon > self.options.maximum_valid_lon: return False except ValueError: return False @@ -647,7 +647,7 @@ def describe(self, idx: typing.Optional[int] = None)->str: Return a string that describes the value. """ if self.is_list() and idx is None: - result: str = "" + result: str = "List (" kv: KgtkValue first: bool = True for kv in self.get_values(): @@ -656,7 +656,7 @@ def describe(self, idx: typing.Optional[int] = None)->str: else: result += KgtkFormat.LIST_SEPARATOR result += kv.describe() - return result + return result + ")" if self.is_empty(idx): return "Empty" diff --git a/kgtk/join/kgtkvalueoptions.py b/kgtk/join/kgtkvalueoptions.py index c6383e793..19252f3d9 100644 --- a/kgtk/join/kgtkvalueoptions.py +++ b/kgtk/join/kgtkvalueoptions.py @@ -13,10 +13,6 @@ class KgtkValueOptions: seperate class for efficiency. """ - # The default minimum and maximum valid year values. - MINIMUM_VALID_YEAR: int = 1583 # Per ISO 8601, years before this one require special agreement. - MAXIMUM_VALID_YEAR: int = 2100 # Arbitrarily chosen. - # Allow month 00 or day 00 in dates? This isn't really allowed by ISO # 8601, but appears in wikidata. allow_month_or_day_zero: bool = attr.ib(validator=attr.validators.instance_of(bool), default=False) @@ -42,9 +38,22 @@ class KgtkValueOptions: additional_language_codes: typing.Optional[typing.List[str]] = attr.ib(default=None) # Minimum and maximum year range in dates. + MINIMUM_VALID_YEAR: int = 1583 # Per ISO 8601, years before this one require special agreement. minimum_valid_year: int = attr.ib(validator=attr.validators.instance_of(int), default=MINIMUM_VALID_YEAR) + MAXIMUM_VALID_YEAR: int = 2100 # Arbitrarily chosen. maximum_valid_year: int = attr.ib(validator=attr.validators.instance_of(int), default=MAXIMUM_VALID_YEAR) + MINIMUM_VALID_LAT: float = -90. + minimum_valid_lat: float = attr.ib(validator=attr.validators.instance_of(float), default=MINIMUM_VALID_LAT) + MAXIMUM_VALID_LAT: float = 90. + maximum_valid_lat: float = attr.ib(validator=attr.validators.instance_of(float), default=MAXIMUM_VALID_LAT) + + MINIMUM_VALID_LON: float = -180. + minimum_valid_lon: float = attr.ib(validator=attr.validators.instance_of(float), default=MINIMUM_VALID_LON) + MAXIMUM_VALID_LON: float = 180. + maximum_valid_lon: float = attr.ib(validator=attr.validators.instance_of(float), default=MAXIMUM_VALID_LON) + + @classmethod def add_arguments(cls, parser: ArgumentParser): parser.add_argument( "--additional-language-codes", dest="additional_language_codes", From 991e486b473d81f4a31809c99741479df8abb2a5 Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Mon, 4 May 2020 21:30:58 -0700 Subject: [PATCH 073/278] New set of datatypes. 
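The old four-member DataTypes enum is replaced by an eleven-member DataType enum that names every
KGTK value shape directly. A minimal sketch of how such an enum can key value classification; the
rough_data_type function below is hypothetical, keying only on the sigil that begins each KGTK
value, while the real dispatch logic arrives with the KgtkValue refactor in the following patch:

    from enum import Enum

    class DataType(Enum):
        EMPTY = 0
        LIST = 1
        NUMBER = 2
        QUANTITY = 3
        STRING = 4
        LANGUAGE_QUALIFIED_STRING = 5
        LOCATION_COORDINATES = 6
        DATE_AND_TIMES = 7
        EXTENSION = 8
        BOOLEAN = 9
        SYMBOL = 10

    def rough_data_type(value: str) -> DataType:
        # Hypothetical classifier keyed on the sigil that starts each value;
        # lists, quantities, and escaped strings need deeper inspection than this.
        if len(value) == 0:
            return DataType.EMPTY
        if value.startswith('"'):
            return DataType.STRING
        if value.startswith("'"):
            return DataType.LANGUAGE_QUALIFIED_STRING
        if value.startswith("@"):
            return DataType.LOCATION_COORDINATES
        if value.startswith("^"):
            return DataType.DATE_AND_TIMES
        if value.startswith("!"):
            return DataType.EXTENSION
        if value[0] in "+-.0123456789":
            return DataType.NUMBER  # could still be a QUANTITY on deeper inspection
        if value in ("True", "False"):
            return DataType.BOOLEAN
        return DataType.SYMBOL

    assert rough_data_type("@043.26193/010.92708") is DataType.LOCATION_COORDINATES
    assert rough_data_type("12.3") is DataType.NUMBER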
--- kgtk/join/kgtkformat.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/kgtk/join/kgtkformat.py b/kgtk/join/kgtkformat.py index 6c392fe82..036d81e4b 100644 --- a/kgtk/join/kgtkformat.py +++ b/kgtk/join/kgtkformat.py @@ -20,11 +20,18 @@ class KgtkFormat: # There is only one required column in a node file: ID_COLUMN_NAMES: typing.List[str] = ["id", "ID"] - class DataTypes(Enum): - NUMBER = 0 - STRING = 1 - STRUCTURED_LITERAL = 2 - SYMBOL = 3 + class DataType(Enum): + EMPTY = 0 + LIST = 1 + NUMBER = 2 + QUANTITY = 3 + STRING = 4 + LANGUAGE_QUALIFIED_STRING = 5 + LOCATION_COORDINATES = 6 + DATE_AND_TIMES = 7 + EXTENSION = 8 + BOOLEAN = 9 + SYMBOL = 10 TRUE_SYMBOL: str = "True" FALSE_SYMBOL: str = "False" From fda2f128a352417f454f2e0ab885a6da2e468516 Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Tue, 5 May 2020 14:49:51 -0700 Subject: [PATCH 074/278] Refactored value tests. --- kgtk/join/kgtkvalue.py | 598 ++++++++++++++++++++++------------------- 1 file changed, 326 insertions(+), 272 deletions(-) diff --git a/kgtk/join/kgtkvalue.py b/kgtk/join/kgtkvalue.py index 56939b949..e1fb06f27 100644 --- a/kgtk/join/kgtkvalue.py +++ b/kgtk/join/kgtkvalue.py @@ -17,107 +17,145 @@ class KgtkValue(KgtkFormat): value: str = attr.ib(validator=attr.validators.instance_of(str)) options: KgtkValueOptions = attr.ib(validator=attr.validators.instance_of(KgtkValueOptions), default=DEFAULT_KGTK_VALUE_OPTIONS) - split_list_re: typing.Pattern = re.compile(r"(?typing.List[str]: - if self.values is None: - self.values = KgtkValue.split_list_re.split(self.value) - return self.values + def get_data_type(self)->KgtkFormat.DataType: - def get_item(self, idx: typing.Optional[int])-> str: - if idx is None: - return self.value - else: - return self.get_list()[idx] + if self.data_type is not None: + pass - def is_list(self)->bool: - return len(self.get_list()) > 1 + elif self.is_empty() or self.is_list(): + pass - def get_values(self)->typing.List['KgtkValue']: - """ - Convert the value into a list of KgtkValues. - """ - if not self.is_list: - return [ self ] + elif self.is_string() or self.is_language_qualified_string(): + pass + + elif self.is_number_or_quantity(): + # To determine whether this is a number or a quantity, we have + # to validate one of them. + if not self.is_valid_number(): + # If it isn't a valid number, assume it's a quantity. + self.data_type = KgtkFormat.DataType.QUANTITY + + elif self.is_location_coordinates(): + pass + + elif self.is_date_and_times(): + pass + + elif self.is_extension(): + pass + + elif self.is_boolean() or self.is_symbol(): + pass + + if self.data_type is not None: + return self.data_type + + # Shouldn't get here. 
+ raise ValueError("Unknown data type for '%s'" % self.value) + + def is_valid(self)->bool: + dt: KgtkFormat.DataType = self.get_data_type() + if dt == KgtkFormat.DataType.EMPTY: + return self.is_valid_empty() + elif dt == KgtkFormat.DataType.LIST: + return self.is_valid_list() + elif dt == KgtkFormat.DataType.NUMBER: + return self.is_valid_number() + elif dt == KgtkFormat.DataType.QUANTITY: + return self.is_valid_quantity() + elif dt == KgtkFormat.DataType.STRING: + return self.is_valid_string() + elif dt == KgtkFormat.DataType.LANGUAGE_QUALIFIED_STRING: + return self.is_valid_language_qualified_string() + elif dt == KgtkFormat.DataType.LOCATION_COORDINATES: + return self.is_valid_location_coordinates() + elif dt == KgtkFormat.DataType.DATE_AND_TIMES: + return self.is_valid_date_and_times() + elif dt == KgtkFormat.DataType.EXTENSION: + return self.is_valid_extension() + elif dt == KgtkFormat.DataType.BOOLEAN: + return self.is_valid_boolean() + elif dt == KgtkFormat.DataType.SYMBOL: + return self.is_valid_symbol() else: - result: typing.List['KgtkValue'] = [ ] - v: str - for v in self.get_list(): - result.append(KgtkValue(v, options=self.options)) - return result + raise ValueError("Unrecognized DataType.") - def is_empty(self, idx: typing.Optional[int] = None)->bool: - """ - Return False if this value is a list and idx is None. - Otherwise, return True if the value is empty. - """ - if self.is_list() and idx is None: - return False - v: str = self.get_item(idx) - return len(v) == 0 + def is_empty(self)->bool: + if self.data_type is not None: + return self.data_type == KgtkFormat.DataType.EMPTY - def is_number_old(self, idx: typing.Optional[int] = None)->bool: - """ - Return False if this value is a list and idx is None. - Otherwise, return True if the first character is 0-9,+,-,. . - """ - if self.is_list() and idx is None: + if len(self.value) != 0: return False - - v: str = self.get_item(idx) - return v.startswith(("0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "+", "-", ".")) - - def is_valid_number_old(self, idx: typing.Optional[int] = None)->bool: - """ - Return False if this value is a list and idx is None. - Otherwise, return True if the first character is 0-9,_,-,. - and Python can parse it. - Examples: - 1 - 123 - -123 - +123 - 0b101 - 0o277 - 0x24F - .4 - 0.4 - 10. - 10.4 - 10.4e10 - """ - if self.is_list() and idx is None: + # We are certain that this is an empty value. We can be certain it is valid. + self.data_type = KgtkFormat.DataType.EMPTY + self.valid = True + return True + + def is_valid_empty(self)->bool: + # If it is empty, it is validly so. + return self.is_empty() + + split_list_re: typing.Pattern = re.compile(r"(?typing.List['KgtkValue']: + if self.list_items is not None: + return self.list_items + + self.list_items: typing.List['KgtkValue'] = [ ] + value: str + for value in KgtkValue.split_list_re.split(self.value): + self.list_items.append(KgtkValue(value, options=self.options)) + return self.list_items + + def is_list(self)->bool: + if self.data_type is not None: + return self.data_type == KgtkFormat.DataType.LIST + + if len(self.get_list()) == 1: return False - - v: str = self.get_item(idx) - if not v.startswith(("0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "+", "-", ".")): + + # We aare certain that this is a list, although we haven't checked validity. + self.data_type = KgtkFormat.DataType.LIST + return True + + + def is_valid_list(self)->bool: + if not self.is_list(): return False - try: - i: int = int(v, 0) # The 0 allows prefixes: 0b, 0o, and 0x. 
- return True - except ValueError: - try: - f: float = float(v) - return True - except ValueError: + + if self.valid is not None: + return self.valid + + item: 'KgtkValue' + for item in self.get_list(): + if not item.is_valid(): + # The list is invalid if any item in the list is invalid. + self.valid = False return False - - - def is_number_or_quantity(self, idx: typing.Optional[int] = None)->bool: + + # This is a valid list. + self.valid = True + return True + + def _is_number_or_quantity(self)->bool: + return self.value.startswith(("0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "+", "-", ".")) + + def is_number_or_quantity(self)->bool: """ - Return False if this value is a list and idx is None. Otherwise, return True if the first character is 0-9,+,-,. . """ - if self.is_list() and idx is None: - return False - - v: str = self.get_item(idx) - return v.startswith(("0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "+", "-", ".")) + if self.data_type is not None: + return self.data_type == KgtkFormat.DataType.NUMBER or self.data_type == KgtkFormat.DataType.QUANTITY + + return self._is_number_or_quantity() # The following lexical analysis is based on: # https://docs.python.org/3/reference/lexical_analysis.html @@ -210,27 +248,33 @@ def is_number_or_quantity(self, idx: typing.Optional[int] = None)->bool: # This matches quantities excluding numbers. quantity_re: typing.Pattern = re.compile(r'^' + quantity_pat + r'$') - def is_valid_number_or_quantity(self, idx: typing.Optional[int] = None)->bool: + def is_valid_number_or_quantity(self)->bool: """ - Return False if this value is a list and idx is None. - Otherwise, return True if the first character is 0-9,_,-,. + Return True if the first character is 0-9,_,-,. and it is either a Python-compatible number or an enhanced quantity. """ - if self.is_list() and idx is None: - return False - - v: str = self.get_item(idx) - if not v.startswith(("0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "+", "-", ".")): + # If we know the specific data type, delegate the test to that data type. + if self.data_type is not None: + if self.data_type == KgtkFormat.DataType.NUMBER: + return self.is_valid_number() + elif self.data_type == KgtkFormat.DataType.QUANTITY: + return self.is_valid_quantity() + else: + return False # Not a number or quantity. + + if not self._is_number_or_quantity(): return False - m: typing.Optional[typing.Match] = KgtkValue.number_or_quantity_re.match(v) + # We cannot cache the result of this test because it would interfere + # if we later determined the exact data type. We could work around + # this problem with more thought. + m: typing.Optional[typing.Match] = KgtkValue.number_or_quantity_re.match(self.value) return m is not None - def is_valid_number(self, idx: typing.Optional[int] = None)->bool: + def is_valid_number(self)->bool: """ - Return False if this value is a list and idx is None. Otherwise, return True if the first character is 0-9,_,-,. and it is a Python-compatible number (with optional limited enhancements). 
@@ -248,173 +292,215 @@ def is_valid_number(self, idx: typing.Optional[int] = None)->bool: 10.4 10.4e10 """ - if self.is_list() and idx is None: - return False + if self.data_type is not None: + if self.data_type != KgtkFormat.DataType.NUMBER: + return False + if self.valid is not None: + return self.valid - v: str = self.get_item(idx) - if not v.startswith(("0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "+", "-", ".")): + if not self._is_number_or_quantity(): return False + # We don't know yet if this is a number. It could be a quantity. - m: typing.Optional[typing.Match] = KgtkValue.number_re.match(v) - return m is not None + m: typing.Optional[typing.Match] = KgtkValue.number_re.match(self.value) + if m is None: + return False + + # Now we can be certain that this is a number. + self.data_type = KgtkFormat.DataType.NUMBER + self.valid = True + return True - def is_valid_quantity(self, idx: typing.Optional[int] = None)->bool: + def is_valid_quantity(self)->bool: """ - Return False if this value is a list and idx is None. - Otherwise, return True if the first character is 0-9,_,-,. + Return True if the first character is 0-9,_,-,. and it is an enhanced quantity. """ - if self.is_list() and idx is None: - return False + if self.data_type is not None: + if self.data_type != KgtkFormat.DataType.QUANTITY: + return False + if self.valid is not None: + return self.valid - v: str = self.get_item(idx) - if not v.startswith(("0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "+", "-", ".")): + if not self._is_number_or_quantity(): return False + # We don't know yet if this is a quantity. It could be a number. - m: typing.Optional[typing.Match] = KgtkValue.quantity_re.match(v) - return m is not None - + m: typing.Optional[typing.Match] = KgtkValue.quantity_re.match(self.value) + if m is None: + return False + + # Now we can be certain that this is a quantity. + self.data_type = KgtkFormat.DataType.QUANTITY + self.valid = True + return True - def is_string(self, idx: typing.Optional[int] = None)->bool: + def is_string(self)->bool: """ - Return False if this value is a list and idx is None. - Otherwise, return True if the first character is '"'. + Return True if the first character is '"'. Strings begin and end with double quote ("). Any internal double quotes must be escaped with backslash (\"). Triple-double quoted strings are not supported by KGTK File Vormat v2. """ - if self.is_list() and idx is None: - return False + if self.data_type is not None: + return self.data_type == KgtkFormat.DataType.STRING - v: str = self.get_item(idx) - return v.startswith('"') + if not self.value.startswith('"'): + return False + + # We are certain this is a string. We don't yet know if it is valid. + self.data_type = KgtkFormat.DataType.STRING + return True lax_string_re: typing.Pattern = re.compile(r'^".*"$') strict_string_re: typing.Pattern = re.compile(r'^"(?:[^"\\]|\\.)*"$') - def is_valid_string(self, idx: typing.Optional[int] = None)->bool: + def is_valid_string(self)->bool: """ - Return False if this value is a list and idx is None. - Otherwise, return True if the first character is '"', + Strict: return True if the first character is '"', the last character is '"', and any internal '"' characters are escaped by backslashes. 
""" - if self.is_list() and idx is None: - return False - - v: str = self.get_item(idx) - if not v.startswith('"'): + if not self.is_string(): return False + + if self.valid is not None: + return self.valid + m: typing.Optional[typing.Match] if self.options.allow_lax_strings: - m = KgtkValue.lax_string_re.match(v) + m = KgtkValue.lax_string_re.match(self.value) else: - m = KgtkValue.strict_string_re.match(v) - return m is not None + m = KgtkValue.strict_string_re.match(self.value) + if m is None: + return False + + # We are certain that this is a valid string. + self.valid = True + return True - def is_structured_literal(self, idx: typing.Optional[int] = None)->bool: + def is_structured_literal(self)->bool: """ - Return False if this value is a list and idx is None. - Otherwise, return True if the first character is ^@'!. + Return True if the first character is ^@'!. """ - if self.is_list() and idx is None: - return False - - v: str = self.get_item(idx) - return v.startswith(("^", "@", "'", "!")) + return self.value.startswith(("^", "@", "'", "!")) - def is_symbol(self, idx: typing.Optional[int] = None)->bool: + def is_symbol(self)->bool: """ - Return False if this value is a list and idx is None. - Otherwise, return True if not a number, string, nor structured literal. + Return True if not a number, string, nor structured literal. """ - if self.is_list() and idx is None: - return False + if self.data_type is not None: + return self.data_type == KgtkFormat.DataType.SYMBOL - return not (self.is_number_or_quantity(idx) or self.is_string(idx) or self.is_structured_literal(idx)) + if self.is_number_or_quantity() or self.is_string() or self.is_structured_literal() or self.is_boolean(): + return False + + # We are certain this is a symbol. We assume, for now that it is valid. + self.data_type = KgtkFormat.DataType.SYMBOL + self.valid = True + return True - def is_boolean(self, idx: typing.Optional[int] = None)->bool: + def is_valid_symbol(self)->bool: + # If it is a suymbol, then it is valid. + return self.is_symbol() + + def is_boolean(self)->bool: """ - Return False if this value is a list and idx is None. - Otherwise, return True if the value matches one of the special boolean symbols.. + return True if the value matches one of the special boolean symbols.. """ - if self.is_list() and idx is None: - return False + if self.data_type is not None: + return self.data_type == KgtkFormat.DataType.BOOLEAN - v: str = self.get_item(idx) - return v == KgtkFormat.TRUE_SYMBOL or v == KgtkFormat.FALSE_SYMBOL + if self.value != KgtkFormat.TRUE_SYMBOL and self.value != KgtkFormat.FALSE_SYMBOL: + return False + + # We are certain this is a valid boolean. + self.data_type = KgtkFormat.DataType.BOOLEAN + self.valid = True + return True + def is_valid_boolean(self)->bool: + # If it is a boolean, then it is valid. + return self.is_boolean() - def is_language_qualified_string(self, idx: typing.Optional[int] = None)->bool: + def is_language_qualified_string(self)->bool: """ - Return False if this value is a list and idx is None. - Otherwise, return True if the first character is ' + Return True if the first character is ' """ - if self.is_list() and idx is None: + if self.data_type is not None: + return self.data_type == KgtkFormat.DataType.LANGUAGE_QUALIFIED_STRING + + if not self.value.startswith("'"): return False - v: str = self.get_item(idx) - return v.startswith("'") + self.data_type = KgtkFormat.DataType.LANGUAGE_QUALIFIED_STRING + return True # Support two or three character language codes. 
Suports hyphenated codes # with country codes or dialect names after a language code. lax_language_qualified_string_re: typing.Pattern = re.compile(r"^(?P'.*')@(?P[a-zA-Z]{2,3}(?:-[a-zA-Z]+)?)$") strict_language_qualified_string_re: typing.Pattern = re.compile(r"^(?P'(?:[^'\\]|\\.)*')@(?P[a-zA-Z]{2,3}(?:-[a-zA-Z]+)?)$") - def is_valid_language_qualified_string(self, idx: typing.Optional[int] = None)->bool: - """Return False if this value is a list and idx is None. - Otherwise, return True if the value looks like a language-qualified string. + def is_valid_language_qualified_string(self)->bool: """ - if self.is_list() and idx is None: + Return True if the value looks like a language-qualified string. + """ + if not self.is_language_qualified_string(): return False - v: str = self.get_item(idx) - # print("checking %s" % v) + # print("checking %s" % self.value) m: typing.Optional[typing.Match] if self.options.allow_lax_lq_strings: - m = KgtkValue.lax_language_qualified_string_re.match(v) + m = KgtkValue.lax_language_qualified_string_re.match(self.value) else: - m = KgtkValue.strict_language_qualified_string_re.match(v) + m = KgtkValue.strict_language_qualified_string_re.match(self.value) if m is None: - # print("match failed for %s" % v) + # print("match failed for %s" % self.value) return False # Validate the language code: lang: str = m.group("lang").lower() # print("lang: %s" % lang) - return LanguageValidator.validate(lang, options=self.options) + if not LanguageValidator.validate(lang, options=self.options): + # print("language validation failed for %s" % self.value) + return False + + # We are certain that this is a valid language qualified string. + self.valid = True + return True - def is_location_coordinates(self, idx: typing.Optional[int] = None)->bool: + def is_location_coordinates(self)->bool: """ - Return False if this value is a list and idx is None. - Otherwise, return True if the first character is @ + Return True if the first character is @ """ - if self.is_list() and idx is None: + if self.data_type is not None: + return self.data_type == KgtkFormat.DataType.LOCATION_COORDINATES + + if not self.value.startswith("@"): return False - v: str = self.get_item(idx) - return v.startswith("@") + self.data_type = KgtkFormat.DataType.LOCATION_COORDINATES + return True #location_coordinates_re: typing.Pattern = re.compile(r"^@(?P[-+]?\d{3}\.\d{5})/(?P[-+]?\d{3}\.\d{5})$") degrees_pat: str = r'(?:[-+]?(?:\d+(?:\.\d*)?)|(?:\.\d+))' location_coordinates_re: typing.Pattern = re.compile(r'^@(?P{degrees})/(?P{degrees})$'.format(degrees=degrees_pat)) - def is_valid_location_coordinates(self, idx: typing.Optional[int] = None)->bool: + def is_valid_location_coordinates(self)->bool: """ Return False if this value is a list and idx is None. Otherwise, return True if the value looks like valid location coordinates. @043.26193/010.92708 """ - if self.is_list() and idx is None: + if not self.is_location_coordinates(): return False - v: str = self.get_item(idx) - m: typing.Optional[typing.Match] = KgtkValue.location_coordinates_re.match(v) + m: typing.Optional[typing.Match] = KgtkValue.location_coordinates_re.match(self.value) if m is None: return False @@ -436,18 +522,23 @@ def is_valid_location_coordinates(self, idx: typing.Optional[int] = None)->bool: except ValueError: return False + # We are certain that this is valid. 
+ self.valid = True return True - def is_date_and_times(self, idx: typing.Optional[int] = None)->bool: + def is_date_and_times(self)->bool: """ - Return False if this value is a list and idx is None. - Otherwise, return True if the first character is ^ + Return True if the first character is ^ """ - if self.is_list() and idx is None: + if self.data_type is not None: + return self.data_type == KgtkFormat.DataType.DATE_AND_TIMES + + if not self.value.startswith("^"): return False - v: str = self.get_item(idx) - return v.startswith("^") + # This is a date and times value. We do not yet know if it si valid. + self.data_type = KgtkFormat.DataType.DATE_AND_TIMES + return True # https://en.wikipedia.org/wiki/ISO_8601 # @@ -492,10 +583,9 @@ def is_date_and_times(self, idx: typing.Optional[int] = None)->bool: precision=precision_pat) lax_date_and_times_re: typing.Pattern = re.compile(r'^{date_and_times}$'.format(date_and_times=lax_date_and_times_pat)) - def is_valid_date_and_times(self, idx: typing.Optional[int] = None)->bool: + def is_valid_date_and_times(self)->bool: """ - Return False if this value is a list and idx is None. - Otherwise, return True if the value looks like valid date and times + Return True if the value looks like valid date and times literal based on ISO-8601. Valid date formats: @@ -541,11 +631,10 @@ def is_valid_date_and_times(self, idx: typing.Optional[int] = None)->bool: TODO: validate the calendar date, eg fail if 31-Apr-2020. """ - if self.is_list() and idx is None: + if not self.is_date_and_times(): return False - v: str = self.get_item(idx) - m: typing.Optional[typing.Match] = KgtkValue.lax_date_and_times_re.match(v) + m: typing.Optional[typing.Match] = KgtkValue.lax_date_and_times_re.match(self.value) if m is None: return False @@ -580,77 +669,42 @@ def is_valid_date_and_times(self, idx: typing.Optional[int] = None)->bool: if day == 0 and not self.options.allow_month_or_day_zero: return False # day 0 was disallowed. + # We are fairly certain that this is a valid date and times. + self.valid = True return True - def is_extension(self, idx: typing.Optional[int] = None)->bool: - """ - Return False if this value is a list and idx is None. - Otherwise, return True if the first character is ! - """ - if self.is_list() and idx is None: - return False - - v: str = self.get_item(idx) - return v.startswith("!") - - - def is_valid_literal(self, idx: typing.Optional[int] = None)->bool: + def is_extension(self)->bool: """ - Return False if this value is a list and idx is None. - Otherwise, return True if the value looks like a valid literal. + Return True if the first character is ! """ - if self.is_list() and idx is None: - return False - - if self.is_string(idx): - return self.is_valid_string(idx) - elif self.is_number_or_quantity(idx): - return self.is_valid_number_or_quantity(idx) - elif self.is_structured_literal(idx): - if self.is_language_qualified_string(idx): - return self.is_valid_language_qualified_string(idx) - elif self.is_location_coordinates(idx): - return self.is_valid_location_coordinates(idx) - elif self.is_date_and_times(idx): - return self.is_valid_date_and_times(idx) - elif self.is_extension(idx): - return False # no validation presently available. - else: - return False # Shouldn't get here. 
- else: - return False + if self.data_type is not None: + return self.data_type == KgtkFormat.DataType.EXTENSION - def is_valid_item(self, idx: typing.Optional[int] = None)->bool: - if self.is_list() and idx is None: + if not self.value.startswith("!"): return False - if self.is_empty(idx): - return True - elif self.is_valid_literal(idx): - return True - else: - return self.is_symbol(idx) # Should always be True + # This is an extension, but for now, assume that all extensions are invalid. + self.data_type = KgtkFormat.DataType.EXTENSION + self.valid = False + return True - def is_valid(self)->bool: - """ - Is this a valid KGTK cell value? If the value is a list, are all the - components valid? - """ - result: bool = True - kv: KgtkValue - for kv in self.get_values(): - result = result and kv.is_valid_item() - return result + def is_valid_extension(self)->bool: + # For now, all extensions are invalid. + return False - def describe(self, idx: typing.Optional[int] = None)->str: + def describe(self)->str: """ Return a string that describes the value. """ - if self.is_list() and idx is None: - result: str = "List (" + if self.is_list(): + result: str + if self.is_valid_list: + result = "List (" + else: + result = "Invalid List (" kv: KgtkValue first: bool = True - for kv in self.get_values(): + for kv in self.get_list(): if first: first = not first else: @@ -658,43 +712,43 @@ def describe(self, idx: typing.Optional[int] = None)->str: result += kv.describe() return result + ")" - if self.is_empty(idx): + if self.is_empty(): return "Empty" - elif self.is_string(idx): - if self.is_valid_string(idx): + elif self.is_string(): + if self.is_valid_string(): return "String" else: return "Invalid String" - elif self.is_number_or_quantity(idx): - if self.is_valid_number(idx): + elif self.is_number_or_quantity(): + if self.is_valid_number(): return "Number" - elif self.is_valid_quantity(idx): + elif self.is_valid_quantity(): return "Quantity" else: return "Invalid Number or Quantity" - elif self.is_structured_literal(idx): - if self.is_language_qualified_string(idx): - if self.is_valid_language_qualified_string(idx): - return "Language Qualified String" - else: - return "Invalid Language Qualified String" - elif self.is_location_coordinates(idx): - if self.is_valid_location_coordinates(idx): - return "Location Coordinates" - else: - return "Invalid Location Coordinates" - elif self.is_date_and_times(idx): - if self.is_valid_date_and_times(idx): - return "Date and Times" - else: - return "Invalid Date and Times" - elif self.is_extension(idx): - return "Extension (unvalidated)" + elif self.is_language_qualified_string(): + if self.is_valid_language_qualified_string(): + return "Language Qualified String" else: - return "Invalid Structured Literal" - else: + return "Invalid Language Qualified String" + elif self.is_location_coordinates(): + if self.is_valid_location_coordinates(): + return "Location Coordinates" + else: + return "Invalid Location Coordinates" + elif self.is_date_and_times(): + if self.is_valid_date_and_times(): + return "Date and Times" + else: + return "Invalid Date and Times" + elif self.is_extension(): + return "Extension (unvalidated)" + elif self.is_boolean(): + return "Boolean Symbol" + elif self.is_symbol(): return "Symbol" - + else: + return "Unknown" def main(): """ From a3d9dbd938eb0e1b6f55859fd72b36dd18000a52 Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Tue, 5 May 2020 16:12:59 -0700 Subject: [PATCH 075/278] Even more refactoring of the value tests. 
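This patch reworks list handling so that each list item gets its own KgtkValue and the list as a whole is valid only if every item is. The split itself relies on a negative-lookbehind regex so an escaped separator stays inside its item. A small sketch, assuming backslash is the escape character (the pattern here is an illustration of the `split_list_re` idea, not necessarily KGTK's exact pattern):

    import re

    # Split a KGTK list cell on "|", but not on an escaped "\|",
    # using a negative lookbehind for the backslash:
    split_list_re = re.compile(r'(?<!\\)\|')

    print(split_list_re.split(r'red|dark\|light|blue'))
    # ['red', 'dark\\|light', 'blue']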
--- kgtk/join/kgtkvalue.py | 441 +++++++++++++++++++---------------------- 1 file changed, 206 insertions(+), 235 deletions(-) diff --git a/kgtk/join/kgtkvalue.py b/kgtk/join/kgtkvalue.py index e1fb06f27..9d2875309 100644 --- a/kgtk/join/kgtkvalue.py +++ b/kgtk/join/kgtkvalue.py @@ -23,71 +23,13 @@ class KgtkValue(KgtkFormat): # If this is a list, prepare a KgtkValue object for each item of the list. list_items: typing.Optional[typing.List['KgtkValue']] = None - def get_data_type(self)->KgtkFormat.DataType: - - if self.data_type is not None: - pass - - elif self.is_empty() or self.is_list(): - pass - - elif self.is_string() or self.is_language_qualified_string(): - pass - - elif self.is_number_or_quantity(): - # To determine whether this is a number or a quantity, we have - # to validate one of them. - if not self.is_valid_number(): - # If it isn't a valid number, assume it's a quantity. - self.data_type = KgtkFormat.DataType.QUANTITY - - elif self.is_location_coordinates(): - pass - - elif self.is_date_and_times(): - pass - - elif self.is_extension(): - pass - - elif self.is_boolean() or self.is_symbol(): - pass - - if self.data_type is not None: - return self.data_type - - # Shouldn't get here. - raise ValueError("Unknown data type for '%s'" % self.value) - def is_valid(self)->bool: - dt: KgtkFormat.DataType = self.get_data_type() - if dt == KgtkFormat.DataType.EMPTY: - return self.is_valid_empty() - elif dt == KgtkFormat.DataType.LIST: - return self.is_valid_list() - elif dt == KgtkFormat.DataType.NUMBER: - return self.is_valid_number() - elif dt == KgtkFormat.DataType.QUANTITY: - return self.is_valid_quantity() - elif dt == KgtkFormat.DataType.STRING: - return self.is_valid_string() - elif dt == KgtkFormat.DataType.LANGUAGE_QUALIFIED_STRING: - return self.is_valid_language_qualified_string() - elif dt == KgtkFormat.DataType.LOCATION_COORDINATES: - return self.is_valid_location_coordinates() - elif dt == KgtkFormat.DataType.DATE_AND_TIMES: - return self.is_valid_date_and_times() - elif dt == KgtkFormat.DataType.EXTENSION: - return self.is_valid_extension() - elif dt == KgtkFormat.DataType.BOOLEAN: - return self.is_valid_boolean() - elif dt == KgtkFormat.DataType.SYMBOL: - return self.is_valid_symbol() + if self.valid is not None: + return self.valid else: - raise ValueError("Unrecognized DataType.") + return self.validate() - - def is_empty(self)->bool: + def is_empty(self, validate: bool = False)->bool: if self.data_type is not None: return self.data_type == KgtkFormat.DataType.EMPTY @@ -99,41 +41,39 @@ def is_empty(self)->bool: self.valid = True return True - def is_valid_empty(self)->bool: - # If it is empty, it is validly so. - return self.is_empty() - split_list_re: typing.Pattern = re.compile(r"(?typing.List['KgtkValue']: if self.list_items is not None: return self.list_items + # Return an empty list if this is not a list. 
self.list_items: typing.List['KgtkValue'] = [ ] - value: str - for value in KgtkValue.split_list_re.split(self.value): - self.list_items.append(KgtkValue(value, options=self.options)) + values: typing.List[str] = KgtkValue.split_list_re.split(self.value) + if len(values) > 1: + # Populate list_items with a KgtkValue for each item in the list: + item_value: str + for item_value in values: + self.list_items.append(KgtkValue(item_value, options=self.options)) return self.list_items - def is_list(self)->bool: - if self.data_type is not None: - return self.data_type == KgtkFormat.DataType.LIST - - if len(self.get_list()) == 1: - return False - - # We aare certain that this is a list, although we haven't checked validity. - self.data_type = KgtkFormat.DataType.LIST - return True - - - def is_valid_list(self)->bool: - if not self.is_list(): - return False + def is_list(self, validate: bool = False)->bool: + # Must test for list before anything else (except empty)! + if self.data_type is None: + if len(self.get_list()) == 0: + return False + # We are certain that this is a list, although we haven't checked validity. + self.data_type = KgtkFormat.DataType.LIST + else: + if self.data_type != KgtkFormat.DataType.LIST: + return False + if not validate: + return True if self.valid is not None: return self.valid - + + # Validate the list. item: 'KgtkValue' for item in self.get_list(): if not item.is_valid(): @@ -148,15 +88,6 @@ def is_valid_list(self)->bool: def _is_number_or_quantity(self)->bool: return self.value.startswith(("0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "+", "-", ".")) - def is_number_or_quantity(self)->bool: - """ - Otherwise, return True if the first character is 0-9,+,-,. . - """ - if self.data_type is not None: - return self.data_type == KgtkFormat.DataType.NUMBER or self.data_type == KgtkFormat.DataType.QUANTITY - - return self._is_number_or_quantity() - # The following lexical analysis is based on: # https://docs.python.org/3/reference/lexical_analysis.html @@ -248,7 +179,7 @@ def is_number_or_quantity(self)->bool: # This matches quantities excluding numbers. quantity_re: typing.Pattern = re.compile(r'^' + quantity_pat + r'$') - def is_valid_number_or_quantity(self)->bool: + def is_number_or_quantity(self, validate: bool=False)->bool: """ Return True if the first character is 0-9,_,-,. and it is either a Python-compatible number or an enhanced @@ -257,15 +188,22 @@ def is_valid_number_or_quantity(self)->bool: # If we know the specific data type, delegate the test to that data type. if self.data_type is not None: if self.data_type == KgtkFormat.DataType.NUMBER: - return self.is_valid_number() + if not validate: + return True + return self.is_number(validate=validate) elif self.data_type == KgtkFormat.DataType.QUANTITY: - return self.is_valid_quantity() + if not validate: + return True + return self.is_quantity(validate=validate) else: return False # Not a number or quantity. if not self._is_number_or_quantity(): return False + if not validate: + return True + # We cannot cache the result of this test because it would interfere # if we later determined the exact data type. We could work around # this problem with more thought. @@ -273,7 +211,7 @@ def is_valid_number_or_quantity(self)->bool: return m is not None - def is_valid_number(self)->bool: + def is_number(self, validate: bool=False)->bool: """ Otherwise, return True if the first character is 0-9,_,-,. and it is a Python-compatible number (with optional limited enhancements). 
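The NUMBER/QUANTITY distinction in the hunks above rests on a layered grammar: a Python-style numeric literal, an optional `[low,high]` tolerance, and optional units (SI symbols or a Wikidata Q-node). A much-simplified, self-contained sketch of that grammar; the unit alternation here is a tiny illustrative subset, not the real `si_unit_pat`:

    import re

    # Illustrative grammar: number + optional [low,high] tolerance + optional unit.
    number = r'[-+]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?'
    tolerance = r'\[{n},{n}\]'.format(n=number)
    unit = r'(?:m|kg|s|A|K|mol|cd|Q[1-9][0-9]*)'  # tiny subset of the real unit list
    quantity_re = re.compile(r'^{n}(?:{t})?{u}?$'.format(n=number, t=tolerance, u=unit))

    for cell in ('10.4e10', '12[11,13]m', '5Q11573', 'abc'):
        print(cell, bool(quantity_re.match(cell)))
    # 10.4e10 True, 12[11,13]m True, 5Q11573 True, abc False

A match with any tolerance or unit component is a quantity; a bare numeric match is a number, which is exactly the decision `is_number_or_quantity` makes below.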
@@ -295,6 +233,8 @@ def is_valid_number(self)->bool: if self.data_type is not None: if self.data_type != KgtkFormat.DataType.NUMBER: return False + if not validate: + return True if self.valid is not None: return self.valid @@ -312,7 +252,7 @@ def is_valid_number(self)->bool: return True - def is_valid_quantity(self)->bool: + def is_quantity(self, validate: bool=False)->bool: """ Return True if the first character is 0-9,_,-,. and it is an enhanced quantity. @@ -320,6 +260,8 @@ def is_valid_quantity(self)->bool: if self.data_type is not None: if self.data_type != KgtkFormat.DataType.QUANTITY: return False + if not validate: + return True if self.valid is not None: return self.valid @@ -336,7 +278,10 @@ def is_valid_quantity(self)->bool: self.valid = True return True - def is_string(self)->bool: + lax_string_re: typing.Pattern = re.compile(r'^".*"$') + strict_string_re: typing.Pattern = re.compile(r'^"(?:[^"\\]|\\.)*"$') + + def is_string(self, validate: bool = False)->bool: """ Return True if the first character is '"'. @@ -345,31 +290,21 @@ def is_string(self)->bool: strings are not supported by KGTK File Vormat v2. """ - if self.data_type is not None: - return self.data_type == KgtkFormat.DataType.STRING - - if not self.value.startswith('"'): - return False - - # We are certain this is a string. We don't yet know if it is valid. - self.data_type = KgtkFormat.DataType.STRING - return True - - lax_string_re: typing.Pattern = re.compile(r'^".*"$') - strict_string_re: typing.Pattern = re.compile(r'^"(?:[^"\\]|\\.)*"$') - - def is_valid_string(self)->bool: - """ - Strict: return True if the first character is '"', - the last character is '"', and any internal '"' characters are - escaped by backslashes. - """ - if not self.is_string(): - return False + if self.data_type is None: + if not self.value.startswith('"'): + return False + # We are certain this is a string. We don't yet know if it is valid. + self.data_type = KgtkFormat.DataType.STRING + else: + if self.data_type != KgtkFormat.DataType.STRING: + return False + if not validate: + return True if self.valid is not None: return self.valid - + + # Validate the string: m: typing.Optional[typing.Match] if self.options.allow_lax_strings: m = KgtkValue.lax_string_re.match(self.value) @@ -388,32 +323,30 @@ def is_structured_literal(self)->bool: """ return self.value.startswith(("^", "@", "'", "!")) - def is_symbol(self)->bool: + def is_symbol(self, validate: bool = False)->bool: """ Return True if not a number, string, nor structured literal. """ if self.data_type is not None: return self.data_type == KgtkFormat.DataType.SYMBOL + # Is this a symbol? It is, if it is not something else. if self.is_number_or_quantity() or self.is_string() or self.is_structured_literal() or self.is_boolean(): return False - # We are certain this is a symbol. We assume, for now that it is valid. + # We are certain this is a symbol. We assume that it is valid. self.data_type = KgtkFormat.DataType.SYMBOL self.valid = True return True - def is_valid_symbol(self)->bool: - # If it is a suymbol, then it is valid. - return self.is_symbol() - - def is_boolean(self)->bool: + def is_boolean(self, validate: bool = False)->bool: """ return True if the value matches one of the special boolean symbols.. """ if self.data_type is not None: return self.data_type == KgtkFormat.DataType.BOOLEAN + # Is this a boolean? 
if self.value != KgtkFormat.TRUE_SYMBOL and self.value != KgtkFormat.FALSE_SYMBOL: return False @@ -422,35 +355,30 @@ def is_boolean(self)->bool: self.valid = True return True - def is_valid_boolean(self)->bool: - # If it is a boolean, then it is valid. - return self.is_boolean() - - def is_language_qualified_string(self)->bool: - """ - Return True if the first character is ' - """ - if self.data_type is not None: - return self.data_type == KgtkFormat.DataType.LANGUAGE_QUALIFIED_STRING - - if not self.value.startswith("'"): - return False - - self.data_type = KgtkFormat.DataType.LANGUAGE_QUALIFIED_STRING - return True - # Support two or three character language codes. Suports hyphenated codes # with country codes or dialect names after a language code. lax_language_qualified_string_re: typing.Pattern = re.compile(r"^(?P'.*')@(?P[a-zA-Z]{2,3}(?:-[a-zA-Z]+)?)$") strict_language_qualified_string_re: typing.Pattern = re.compile(r"^(?P'(?:[^'\\]|\\.)*')@(?P[a-zA-Z]{2,3}(?:-[a-zA-Z]+)?)$") - def is_valid_language_qualified_string(self)->bool: + def is_language_qualified_string(self, validate: bool=False)->bool: """ Return True if the value looks like a language-qualified string. """ - if not self.is_language_qualified_string(): - return False + if self.data_type is None: + if not self.value.startswith("'"): + return False + # We are certain that this is a language qualified string, although we haven't checked validity. + self.data_type = KgtkFormat.DataType.LANGUAGE_QUALIFIED_STRING + else: + if self.data_type != KgtkFormat.DataType.LANGUAGE_QUALIFIED_STRING: + return False + if not validate: + return True + if self.valid is not None: + return self.valid + + # Validate the language qualified string. # print("checking %s" % self.value) m: typing.Optional[typing.Match] if self.options.allow_lax_lq_strings: @@ -473,33 +401,32 @@ def is_valid_language_qualified_string(self)->bool: self.valid = True return True - def is_location_coordinates(self)->bool: - """ - Return True if the first character is @ - """ - if self.data_type is not None: - return self.data_type == KgtkFormat.DataType.LOCATION_COORDINATES - - if not self.value.startswith("@"): - return False - - self.data_type = KgtkFormat.DataType.LOCATION_COORDINATES - return True - #location_coordinates_re: typing.Pattern = re.compile(r"^@(?P[-+]?\d{3}\.\d{5})/(?P[-+]?\d{3}\.\d{5})$") degrees_pat: str = r'(?:[-+]?(?:\d+(?:\.\d*)?)|(?:\.\d+))' location_coordinates_re: typing.Pattern = re.compile(r'^@(?P{degrees})/(?P{degrees})$'.format(degrees=degrees_pat)) - def is_valid_location_coordinates(self)->bool: + def is_location_coordinates(self, validate: bool=False)->bool: """ Return False if this value is a list and idx is None. Otherwise, return True if the value looks like valid location coordinates. @043.26193/010.92708 """ - if not self.is_location_coordinates(): - return False + if self.data_type is None: + if not self.value.startswith("@"): + return False + # We are certain that this is location coordinates, although we haven't checked validity. 
+ self.data_type = KgtkFormat.DataType.LOCATION_COORDINATES + else: + if self.data_type != KgtkFormat.DataType.LOCATION_COORDINATES: + return False + if not validate: + return True + if self.valid is not None: + return self.valid + + # Validate the location coordinates: m: typing.Optional[typing.Match] = KgtkValue.location_coordinates_re.match(self.value) if m is None: return False @@ -526,20 +453,6 @@ def is_valid_location_coordinates(self)->bool: self.valid = True return True - def is_date_and_times(self)->bool: - """ - Return True if the first character is ^ - """ - if self.data_type is not None: - return self.data_type == KgtkFormat.DataType.DATE_AND_TIMES - - if not self.value.startswith("^"): - return False - - # This is a date and times value. We do not yet know if it si valid. - self.data_type = KgtkFormat.DataType.DATE_AND_TIMES - return True - # https://en.wikipedia.org/wiki/ISO_8601 # # The "lax" patterns allow month 00 and day 00, which are excluded by ISO 8601. @@ -583,7 +496,7 @@ def is_date_and_times(self)->bool: precision=precision_pat) lax_date_and_times_re: typing.Pattern = re.compile(r'^{date_and_times}$'.format(date_and_times=lax_date_and_times_pat)) - def is_valid_date_and_times(self)->bool: + def is_date_and_times(self, validate: bool=False)->bool: """ Return True if the value looks like valid date and times literal based on ISO-8601. @@ -631,9 +544,21 @@ def is_valid_date_and_times(self)->bool: TODO: validate the calendar date, eg fail if 31-Apr-2020. """ - if not self.is_date_and_times(): - return False + if self.data_type is None: + if not self.value.startswith("^"): + return False + # We are certain that this is location coordinates, although we haven't checked validity. + self.data_type = KgtkFormat.DataType.DATE_AND_TIMES + else: + if self.data_type != KgtkFormat.DataType.DATE_AND_TIMES: + return False + if not validate: + return True + if self.valid is not None: + return self.valid + + # Validate the date and times: m: typing.Optional[typing.Match] = KgtkValue.lax_date_and_times_re.match(self.value) if m is None: return False @@ -673,35 +598,99 @@ def is_valid_date_and_times(self)->bool: self.valid = True return True - def is_extension(self)->bool: + def is_extension(self, validate=False)->bool: """ Return True if the first character is ! """ if self.data_type is not None: - return self.data_type == KgtkFormat.DataType.EXTENSION + if not self.value.startswith("!"): + return False + # This is an extension, but for now, assume that all extensions are invalid. + self.data_type = KgtkFormat.DataType.EXTENSION + self.valid = False + else: + if self.data_type != KgtkFormat.DataType.EXTENSION: + return False - if not self.value.startswith("!"): - return False + if not validate: + return True + if self.valid is not None: + return self.valid + raise ValueError("Inconsistent extension state.") - # This is an extension, but for now, assume that all extensions are invalid. - self.data_type = KgtkFormat.DataType.EXTENSION - self.valid = False - return True + def classify(self)->KgtkFormat.DataType: + if self.data_type is not None: + return self.data_type + + # Must test for list before anything else (except empty)! + if self.is_empty() or self.is_list(): + pass + + elif self.is_string() or self.is_language_qualified_string(): + pass + + elif self.is_number_or_quantity(): + # To determine whether this is a number or a quantity, we have + # to validate one of them. + if not self.is_number(): + # If it isn't a valid number, assume it's a quantity. 
+            self.data_type = KgtkFormat.DataType.QUANTITY
+
+        elif self.is_location_coordinates():
+            pass
+
+        elif self.is_date_and_times():
+            pass
+
+        elif self.is_extension():
+            pass
+
+        elif self.is_boolean() or self.is_symbol():
+            pass
+
+        if self.data_type is not None:
+            return self.data_type
 
-    def is_valid_extension(self)->bool:
-        # For now, all extensions are invalid.
-        return False
+        # Shouldn't get here.
+        raise ValueError("Unknown data type for '%s'" % self.value)
+
+    def validate(self)->bool:
+        dt: KgtkFormat.DataType = self.classify()
+        if dt == KgtkFormat.DataType.EMPTY:
+            return self.is_empty(validate=True)
+        elif dt == KgtkFormat.DataType.LIST:
+            return self.is_list(validate=True)
+        elif dt == KgtkFormat.DataType.NUMBER:
+            return self.is_number(validate=True)
+        elif dt == KgtkFormat.DataType.QUANTITY:
+            return self.is_quantity(validate=True)
+        elif dt == KgtkFormat.DataType.STRING:
+            return self.is_string(validate=True)
+        elif dt == KgtkFormat.DataType.LANGUAGE_QUALIFIED_STRING:
+            return self.is_language_qualified_string(validate=True)
+        elif dt == KgtkFormat.DataType.LOCATION_COORDINATES:
+            return self.is_location_coordinates(validate=True)
+        elif dt == KgtkFormat.DataType.DATE_AND_TIMES:
+            return self.is_date_and_times(validate=True)
+        elif dt == KgtkFormat.DataType.EXTENSION:
+            return self.is_extension(validate=True)
+        elif dt == KgtkFormat.DataType.BOOLEAN:
+            return self.is_boolean(validate=True)
+        elif dt == KgtkFormat.DataType.SYMBOL:
+            return self.is_symbol(validate=True)
+        else:
+            raise ValueError("Unrecognized DataType.")
+
     def describe(self)->str:
         """
         Return a string that describes the value.
         """
-        if self.is_list():
-            result: str
-            if self.is_valid_list():
-                result = "List ("
-            else:
-                result = "Invalid List ("
+        dt: KgtkFormat.DataType = self.classify()
+        if dt == KgtkFormat.DataType.EMPTY:
+            return "Empty" if self.is_empty(validate=True) else "Invalid Empty"
+        elif dt == KgtkFormat.DataType.LIST:
+            result: str = "List (" if self.is_list(validate=True) else "Invalid List ("
             kv: KgtkValue
             first: bool = True
             for kv in self.get_list():
@@ -711,42 +700,24 @@ def describe(self)->str:
                 result += KgtkFormat.LIST_SEPARATOR
             result += kv.describe()
             return result + ")"
-
-        if self.is_empty():
-            return "Empty"
-        elif self.is_string():
-            if self.is_valid_string():
-                return "String"
-            else:
-                return "Invalid String"
-        elif self.is_number_or_quantity():
-            if self.is_valid_number():
-                return "Number"
-            elif self.is_valid_quantity():
-                return "Quantity"
-            else:
-                return "Invalid Number or Quantity"
-        elif self.is_language_qualified_string():
-            if self.is_valid_language_qualified_string():
-                return "Language Qualified String"
-            else:
-                return "Invalid Language Qualified String"
-        elif self.is_location_coordinates():
-            if self.is_valid_location_coordinates():
-                return "Location Coordinates"
-            else:
-                return "Invalid Location Coordinates"
-        elif self.is_date_and_times():
-            if self.is_valid_date_and_times():
-                return "Date and Times"
-            else:
-                return "Invalid Date and Times"
-        elif self.is_extension():
-            return "Extension (unvalidated)"
-        elif self.is_boolean():
-            return "Boolean Symbol"
-        elif self.is_symbol():
-            return "Symbol"
+        elif dt == KgtkFormat.DataType.NUMBER:
+            return "Number" if self.is_number(validate=True) else "Invalid Number"
+        elif dt == KgtkFormat.DataType.QUANTITY:
+            return "Quantity" if self.is_quantity(validate=True) else "Invalid Quantity"
+        elif dt == KgtkFormat.DataType.STRING:
+            return "String" if self.is_string(validate=True) else "Invalid String"
+        elif dt ==
KgtkFormat.DataType.LANGUAGE_QUALIFIED_STRING: + return "Language Qualified String" if self.is_language_qualified_string(validate=True) else "Invalid Language Qualified String" + elif dt == KgtkFormat.DataType.LOCATION_COORDINATES: + return "Location Coordinates" if self.is_location_coordinates(validate=True) else "Invalid Location Coordinates" + elif dt == KgtkFormat.DataType.DATE_AND_TIMES: + return "Date and Times" if self.is_date_and_times(validate=True) else "Invalid Date and Times" + elif dt == KgtkFormat.DataType.EXTENSION: + return "Extension" if self.is_extension(validate=True) else "Invalid Extension" + elif dt == KgtkFormat.DataType.BOOLEAN: + return "Boolean" if self.is_boolean(validate=True) else "Invalid Boolean" + elif dt == KgtkFormat.DataType.SYMBOL: + return "Symbol" if self.is_symbol(validate=True) else "Invalid Symbol" else: return "Unknown" From d7d2f8f410f7a8d72b06d5ec605cbcc88f3ff2d4 Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Tue, 5 May 2020 16:27:59 -0700 Subject: [PATCH 076/278] Additional documentation. Improve the use of the cache. --- kgtk/join/kgtkvalue.py | 47 ++++++++++++++++++++++++++++++++---------- 1 file changed, 36 insertions(+), 11 deletions(-) diff --git a/kgtk/join/kgtkvalue.py b/kgtk/join/kgtkvalue.py index 9d2875309..a5cb77d44 100644 --- a/kgtk/join/kgtkvalue.py +++ b/kgtk/join/kgtkvalue.py @@ -17,19 +17,25 @@ class KgtkValue(KgtkFormat): value: str = attr.ib(validator=attr.validators.instance_of(str)) options: KgtkValueOptions = attr.ib(validator=attr.validators.instance_of(KgtkValueOptions), default=DEFAULT_KGTK_VALUE_OPTIONS) + # Cache some properties of the value that would be expensive to + # continuously recompute. The class is not frozen because we have these + # cache members. data_type: typing.Optional[KgtkFormat.DataType] = None valid: typing.Optional[bool] = None - # If this is a list, prepare a KgtkValue object for each item of the list. + # If this is a list, cache a KgtkValue object for each item of the list. list_items: typing.Optional[typing.List['KgtkValue']] = None def is_valid(self)->bool: + # Is this a valid whatever it is? if self.valid is not None: return self.valid else: return self.validate() def is_empty(self, validate: bool = False)->bool: + # Is this an empty item? If so, assume it is valid and ignore the + # validate parameter. if self.data_type is not None: return self.data_type == KgtkFormat.DataType.EMPTY @@ -43,7 +49,9 @@ def is_empty(self, validate: bool = False)->bool: split_list_re: typing.Pattern = re.compile(r"(?typing.List['KgtkValue']: + def get_list_items(self)->typing.List['KgtkValue']: + # If this is a KGTK List, return a list of KGTK values representing + # the items in the list. If this is not a KGTK List, return an empty list. if self.list_items is not None: return self.list_items @@ -60,7 +68,7 @@ def get_list(self)->typing.List['KgtkValue']: def is_list(self, validate: bool = False)->bool: # Must test for list before anything else (except empty)! if self.data_type is None: - if len(self.get_list()) == 0: + if len(self.get_list_items()) == 0: return False # We are certain that this is a list, although we haven't checked validity. self.data_type = KgtkFormat.DataType.LIST @@ -75,7 +83,7 @@ def is_list(self, validate: bool = False)->bool: # Validate the list. item: 'KgtkValue' - for item in self.get_list(): + for item in self.get_list_items(): if not item.is_valid(): # The list is invalid if any item in the list is invalid. 
self.valid = False @@ -209,7 +217,6 @@ def is_number_or_quantity(self, validate: bool=False)->bool: # this problem with more thought. m: typing.Optional[typing.Match] = KgtkValue.number_or_quantity_re.match(self.value) return m is not None - def is_number(self, validate: bool=False)->bool: """ @@ -325,7 +332,9 @@ def is_structured_literal(self)->bool: def is_symbol(self, validate: bool = False)->bool: """ - Return True if not a number, string, nor structured literal. + Return True if not a number, string, nor structured literal, nor boolean. + + The validate parameter is ignored. """ if self.data_type is not None: return self.data_type == KgtkFormat.DataType.SYMBOL @@ -341,7 +350,9 @@ def is_symbol(self, validate: bool = False)->bool: def is_boolean(self, validate: bool = False)->bool: """ - return True if the value matches one of the special boolean symbols.. + Return True if the value matches one of the special boolean symbols. + + The validate parameter is ignored. """ if self.data_type is not None: return self.data_type == KgtkFormat.DataType.BOOLEAN @@ -599,8 +610,11 @@ def is_date_and_times(self, validate: bool=False)->bool: return True def is_extension(self, validate=False)->bool: - """ - Return True if the first character is ! + """Return True if the first character is ! + + Although we refer to the validate parameter in the code below, we + force self.valid to False. + """ if self.data_type is not None: if not self.value.startswith("!"): @@ -619,7 +633,9 @@ def is_extension(self, validate=False)->bool: raise ValueError("Inconsistent extension state.") def classify(self)->KgtkFormat.DataType: + # Classify this KgtkValue into a KgtkDataType. if self.data_type is not None: + # Return the cached value. return self.data_type # Must test for list before anything else (except empty)! @@ -655,7 +671,16 @@ def classify(self)->KgtkFormat.DataType: raise ValueError("Unknown data type for '%s'" % self.value) def validate(self)->bool: + # Validate this KgtkValue. + + # Start by classifying the KgtkValue. dt: KgtkFormat.DataType = self.classify() + + # If the valid flag has already been cached, return that. + if self.valid is not None: + return self.valid + + # Validate the value. if dt == KgtkFormat.DataType.EMPTY: return self.is_empty(validate=True) elif dt == KgtkFormat.DataType.LIST: @@ -684,7 +709,7 @@ def validate(self)->bool: def describe(self)->str: """ - Return a string that describes the value. + Return a string that describes this KGTK Value. """ dt: KgtkFormat.DataType = self.classify() if dt == KgtkFormat.DataType.EMPTY: @@ -693,7 +718,7 @@ def describe(self)->str: result: str = "List (" if self.is_list(validate=True) else "Invalid List (" kv: KgtkValue first: bool = True - for kv in self.get_list(): + for kv in self.get_list_items(): if first: first = not first else: From 6177f4987c42c35d126378bcc3e72edbfbc5287d Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Tue, 5 May 2020 16:48:30 -0700 Subject: [PATCH 077/278] Extract language-qualified string components. --- kgtk/join/kgtkvalue.py | 32 ++++++++++++++++++++++++-------- 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/kgtk/join/kgtkvalue.py b/kgtk/join/kgtkvalue.py index a5cb77d44..b80bd5ad3 100644 --- a/kgtk/join/kgtkvalue.py +++ b/kgtk/join/kgtkvalue.py @@ -26,6 +26,11 @@ class KgtkValue(KgtkFormat): # If this is a list, cache a KgtkValue object for each item of the list. 
     list_items: typing.Optional[typing.List['KgtkValue']] = None
 
+    # Offer the components of a language-qualified string:
+    string: typing.Optional[str] = None
+    lang: typing.Optional[str] = None
+    suffix: typing.Optional[str] = None # Includes the leading dash.
+
     def is_valid(self)->bool:
         # Is this a valid whatever it is?
         if self.valid is not None:
@@ -367,14 +372,19 @@ def is_boolean(self, validate: bool = False)->bool:
         return True
 
     # Support two or three character language codes. Supports hyphenated codes
-    # with country codes or dialect names after a language code.
-    lax_language_qualified_string_re: typing.Pattern = re.compile(r"^(?P<string>'.*')@(?P<lang>[a-zA-Z]{2,3}(?:-[a-zA-Z]+)?)$")
-    strict_language_qualified_string_re: typing.Pattern = re.compile(r"^(?P<string>'(?:[^'\\]|\\.)*')@(?P<lang>[a-zA-Z]{2,3}(?:-[a-zA-Z]+)?)$")
+    # with a country code or dialect name suffix after the language code.
+    lax_language_qualified_string_re: typing.Pattern = re.compile(r"^(?P<string>'.*')@(?P<lang>[a-zA-Z]{2,3}(?P<suffix>-[a-zA-Z]+)?)$")
+    strict_language_qualified_string_re: typing.Pattern = re.compile(r"^(?P<string>'(?:[^'\\]|\\.)*')@(?P<lang_suffix>(?P<lang>[a-zA-Z]{2,3})(?P<suffix>-[a-zA-Z]+)?)$")
 
     def is_language_qualified_string(self, validate: bool=False)->bool:
         """
         Return True if the value looks like a language-qualified string.
         """
+        # Clear the cached components of the language qualified string:
+        self.string = None
+        self.lang = None
+        self.suffix = None
+
         if self.data_type is None:
             if not self.value.startswith("'"):
                 return False
@@ -400,11 +410,17 @@ def is_language_qualified_string(self, validate: bool=False)->bool:
             # print("match failed for %s" % self.value)
             return False
 
-        # Validate the language code:
-        lang: str = m.group("lang").lower()
-        # print("lang: %s" % lang)
+        # Extract the string, lang, and optional suffix components:
+        self.string = m.group("string")
+        self.lang = m.group("lang")
+        self.suffix = m.group("suffix")
 
-        if not LanguageValidator.validate(lang, options=self.options):
+        # Extract the combined lang and suffix for use by the LanguageValidator.
+        lang_suffix: str = m.group("lang_suffix")
+        # print("lang: %s" % lang_suffix)
+
+        # Validate the language code:
+        if not LanguageValidator.validate(lang_suffix.lower(), options=self.options):
             # print("language validation failed for %s" % self.value)
             return False
 
@@ -732,7 +748,7 @@ def describe(self)->str:
         elif dt == KgtkFormat.DataType.STRING:
             return "String" if self.is_string(validate=True) else "Invalid String"
         elif dt == KgtkFormat.DataType.LANGUAGE_QUALIFIED_STRING:
-            return "Language Qualified String" if self.is_language_qualified_string(validate=True) else "Invalid Language Qualified String"
+            return "Language Qualified String (%s)" % self.lang if self.is_language_qualified_string(validate=True) else "Invalid Language Qualified String"
         elif dt == KgtkFormat.DataType.LOCATION_COORDINATES:
             return "Location Coordinates" if self.is_location_coordinates(validate=True) else "Invalid Location Coordinates"
         elif dt == KgtkFormat.DataType.DATE_AND_TIMES:
 
From 640af13475393019fd708f5fc73f7d022beffef7 Mon Sep 17 00:00:00 2001
From: Craig Milo Rogers
Date: Tue, 5 May 2020 18:07:59 -0700
Subject: [PATCH 078/278] Offer selected components of various KGTK data
 types.
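Patch 077 above begins exposing matched components through named capture groups. The general technique, shown with an illustrative pattern that only approximates the ones in the diff:

    import re

    # Illustrative pattern only; the diff's own regexes differ in detail.
    lq_re = re.compile(r"^'(?P<contents>(?:[^'\\]|\\.)*)'@(?P<lang>[a-zA-Z]{2,3})(?P<suffix>-[a-zA-Z]+)?$")

    m = lq_re.match("'kilowatt'@en-GB")
    if m is not None:
        print(m.group("contents"), m.group("lang"), m.group("suffix"))
        # kilowatt en -GB

Because optional groups like `suffix` yield `None` when absent, the caller can cache the components directly on the instance without extra presence checks, which is what the patches below do.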
---
 kgtk/join/kgtkvalue.py | 242 ++++++++++++++++++++++++++++++++++-------
 1 file changed, 201 insertions(+), 41 deletions(-)

diff --git a/kgtk/join/kgtkvalue.py b/kgtk/join/kgtkvalue.py
index b80bd5ad3..574f68da0 100644
--- a/kgtk/join/kgtkvalue.py
+++ b/kgtk/join/kgtkvalue.py
@@ -26,11 +26,32 @@ class KgtkValue(KgtkFormat):
     # If this is a list, cache a KgtkValue object for each item of the list.
     list_items: typing.Optional[typing.List['KgtkValue']] = None
 
-    # Offer the components of a language-qualified string:
-    string: typing.Optional[str] = None
+    # Offer the components of a string or language-qualified string:
+    contents: typing.Optional[str] = None # String contents without the enclosing quotes
     lang: typing.Optional[str] = None
     suffix: typing.Optional[str] = None # Includes the leading dash.
 
+    # Offer the components of a number or quantity:
+    number: typing.Optional[str] = None # Note: not converted to int or float
+    low_tolerance: typing.Optional[str] = None # Note: not converted to int or float
+    high_tolerance: typing.Optional[str] = None # Note: not converted to int or float
+    si_units: typing.Optional[str] = None
+    wikidata_node: typing.Optional[str] = None
+
+    # Offer the components of a location coordinates:
+    latstr: typing.Optional[str] = None
+    lat: typing.Optional[float] = None
+    lonstr: typing.Optional[str] = None
+    lon: typing.Optional[float] = None
+
+    # Offer the components of a date and times:
+    yearstr: typing.Optional[str] = None # Note: not converted to int
+    monthstr: typing.Optional[str] = None # Note: not converted to int
+    daystr: typing.Optional[str] = None # Note: not converted to int
+    hourstr: typing.Optional[str] = None # Note: not converted to int or float
+    minutesstr: typing.Optional[str] = None # Note: not converted to int or float
+    secondsstr: typing.Optional[str] = None # Note: not converted to int or float
+
     def is_valid(self)->bool:
         # Is this a valid whatever it is?
         if self.valid is not None:
@@ -153,8 +174,11 @@ def _is_number_or_quantity(self)->bool:
                                                             floatnumber=floatnumber_pat,
                                                             imagnumber=imagnumber_pat)
 
+    # Numeric literals with component labeling:
+    number_pat: str = r'(?P<number>{numeric})'.format(numeric=numeric_pat)
+
     # Tolerances
-    tolerance_pat: str = r'(?:\[{numeric},{numeric}\])'.format(numeric=numeric_pat)
+    tolerance_pat: str = r'(?:\[(?P<low_tolerance>{numeric}),(?P<high_tolerance>{numeric})\])'.format(numeric=numeric_pat)
 
     # SI units taken from:
     # http://www.csun.edu/~vceed002/ref/measurement/units/units.pdf
@@ -163,12 +187,12 @@ def _is_number_or_quantity(self)->bool:
     si_unit_pat: str = r'(?:m|kg|s|C|K|mol|cd|F|M|A|N|ohms|V|J|Hz|lx|H|Wb|V|W|Pa)'
     si_power_pat: str = r'(?:-1|2|3)' # Might need more.
     si_combiner_pat: str = r'[./]'
-    si_pat: str = r'(?:{si_unit}{si_power}?(?:{si_combiner}{si_unit}{si_power}?)*)'.format(si_unit=si_unit_pat,
+    si_pat: str = r'(?P<si_units>{si_unit}{si_power}?(?:{si_combiner}{si_unit}{si_power}?)*)'.format(si_unit=si_unit_pat,
                                                                                            si_combiner=si_combiner_pat,
                                                                                            si_power=si_power_pat)
     # Wikidata nodes (for units):
     nonzero_digit_pat: str = r'[1-9]'
-    wikidata_node_pat: str = r'(?:Q{nonzero_digit}{digit}*)'.format(nonzero_digit=nonzero_digit_pat,
+    wikidata_node_pat: str = r'(?P<wikidata_node>Q{nonzero_digit}{digit}*)'.format(nonzero_digit=nonzero_digit_pat,
                                                                     digit=digit_pat)
 
     units_pat: str = r'(?:{si}|{wikidata_node})'.format(si=si_pat,
                                                         wikidata_node=wikidata_node_pat)
 
     # This definition matches numbers or quantities.
- number_or_quantity_pat: str = r'{numeric}{tolerance}?{units}?'.format(numeric=numeric_pat, + number_or_quantity_pat: str = r'{numeric}{tolerance}?{units}?'.format(numeric=number_pat, tolerance=tolerance_pat, units=units_pat) - # This definition for quantity excludes plain numbers. - quantity_pat: str = r'{numeric}(?:(?:{tolerance}{units}?)|{units})'.format(numeric=numeric_pat, - tolerance=tolerance_pat, - units=units_pat) + # This matches numbers or quantities. number_or_quantity_re: typing.Pattern = re.compile(r'^' + number_or_quantity_pat + r'$') # This matches numbers but not quantities. - number_re: typing.Pattern = re.compile(r'^' + numeric_pat + r'$') - - # This matches quantities excluding numbers. - quantity_re: typing.Pattern = re.compile(r'^' + quantity_pat + r'$') + number_re: typing.Pattern = re.compile(r'^' + number_pat + r'$') def is_number_or_quantity(self, validate: bool=False)->bool: """ @@ -201,16 +219,25 @@ def is_number_or_quantity(self, validate: bool=False)->bool: # If we know the specific data type, delegate the test to that data type. if self.data_type is not None: if self.data_type == KgtkFormat.DataType.NUMBER: - if not validate: - return True return self.is_number(validate=validate) elif self.data_type == KgtkFormat.DataType.QUANTITY: - if not validate: - return True return self.is_quantity(validate=validate) else: + # Clear the number or quantity components: + self.number = None + self.low_tolerance = None + self.high_tolerance = None + self.si_units = None + self.wikidata_node = None return False # Not a number or quantity. + # Clear the number or quantity components: + self.number = None + self.low_tolerance = None + self.high_tolerance = None + self.si_units = None + self.wikidata_node = None + if not self._is_number_or_quantity(): return False @@ -221,7 +248,25 @@ def is_number_or_quantity(self, validate: bool=False)->bool: # if we later determined the exact data type. We could work around # this problem with more thought. m: typing.Optional[typing.Match] = KgtkValue.number_or_quantity_re.match(self.value) - return m is not None + if m is None: + return False + + # Extract the number or quantity components: + self.number = m.group("number") + self.low_tolerance = m.group("low_tolerance") + self.high_tolerance = m.group("high_tolerance") + self.si_units = m.group("si_units") + self.wikidata_node = m.group("wikidata_node") + + if self.low_tolerance is not None or self.high_tolerance is not None or self.si_units is not None or self.wikidata_node is not None: + # We can be certain that this is a quantity. + self.data_type = KgtkFormat.DataType.QUANTITY + else: + # We can be certain that this is a number + self.data_type = KgtkFormat.DataType.NUMBER + + self.valid = True + return True def is_number(self, validate: bool=False)->bool: """ @@ -244,12 +289,18 @@ def is_number(self, validate: bool=False)->bool: """ if self.data_type is not None: if self.data_type != KgtkFormat.DataType.NUMBER: + # Clear the number components: + self.number = None return False + if not validate: return True if self.valid is not None: return self.valid + # Clear the number components: + self.number = None + if not self._is_number_or_quantity(): return False # We don't know yet if this is a number. It could be a quantity. @@ -258,6 +309,9 @@ def is_number(self, validate: bool=False)->bool: if m is None: return False + # Extract the number components: + self.number = m.group("number") + # Now we can be certain that this is a number. 
self.data_type = KgtkFormat.DataType.NUMBER self.valid = True @@ -271,27 +325,54 @@ def is_quantity(self, validate: bool=False)->bool: """ if self.data_type is not None: if self.data_type != KgtkFormat.DataType.QUANTITY: + # Clear the quantity components: + self.number = None + self.low_tolerance = None + self.high_tolerance = None + self.si_units = None + self.wikidata_node = None return False + if not validate: return True if self.valid is not None: return self.valid + # Clear the quantity components: + self.number = None + self.low_tolerance = None + self.high_tolerance = None + self.si_units = None + self.wikidata_node = None + if not self._is_number_or_quantity(): return False # We don't know yet if this is a quantity. It could be a number. - m: typing.Optional[typing.Match] = KgtkValue.quantity_re.match(self.value) + m: typing.Optional[typing.Match] = KgtkValue.number_or_quantity_re.match(self.value) if m is None: return False + # Extract the quantity components: + self.number = m.group("number") + self.low_tolerance = m.group("low_tolerance") + self.high_tolerance = m.group("high_tolerance") + self.si_units = m.group("si_units") + self.wikidata_node = m.group("wikidata_node") + + if self.low_tolerance is None and self.high_tolerance is None and self.si_units is None and self.wikidata_node is None: + # This is a number, not a quantity + self.data_type = KgtkFormat.DataType.NUMBER + self.valid = True + return False + # Now we can be certain that this is a quantity. self.data_type = KgtkFormat.DataType.QUANTITY self.valid = True return True - lax_string_re: typing.Pattern = re.compile(r'^".*"$') - strict_string_re: typing.Pattern = re.compile(r'^"(?:[^"\\]|\\.)*"$') + lax_string_re: typing.Pattern = re.compile(r'^"(?P.*)"$') + strict_string_re: typing.Pattern = re.compile(r'^"(?P(?:[^"\\]|\\.)*"$)') def is_string(self, validate: bool = False)->bool: """ @@ -304,11 +385,15 @@ def is_string(self, validate: bool = False)->bool: """ if self.data_type is None: if not self.value.startswith('"'): + # Clear the string components: + self.contents = None return False # We are certain this is a string. We don't yet know if it is valid. self.data_type = KgtkFormat.DataType.STRING else: if self.data_type != KgtkFormat.DataType.STRING: + # Clear the string components: + self.contents = None return False if not validate: @@ -316,6 +401,9 @@ def is_string(self, validate: bool = False)->bool: if self.valid is not None: return self.valid + # Clear the string components: + self.contents = None + # Validate the string: m: typing.Optional[typing.Match] if self.options.allow_lax_strings: @@ -325,6 +413,9 @@ def is_string(self, validate: bool = False)->bool: if m is None: return False + # Extract the contents components: + self.contents = m.group("contents") + # We are certain that this is a valid string. self.valid = True return True @@ -373,25 +464,28 @@ def is_boolean(self, validate: bool = False)->bool: # Support two or three character language codes. Suports hyphenated codes # with a country code or dialect namesuffix after the language code. 
- lax_language_qualified_string_re: typing.Pattern = re.compile(r"^(?P'.*')@(?P[a-zA-Z]{2,3}(?P-[a-zA-Z]+)?)$") - strict_language_qualified_string_re: typing.Pattern = re.compile(r"^(?P'(?:[^'\\]|\\.)*')@(?P(?P[a-zA-Z]{2,3})(?P-[a-zA-Z]+)?)$") + lax_language_qualified_string_re: typing.Pattern = re.compile(r"^'(?P.*)'@(?P[a-zA-Z]{2,3}(?P-[a-zA-Z]+)?)$") + strict_language_qualified_string_re: typing.Pattern = re.compile(r"^'(?P(?:[^'\\]|\\.)*)'@(?P(?P[a-zA-Z]{2,3})(?P-[a-zA-Z]+)?)$") def is_language_qualified_string(self, validate: bool=False)->bool: """ Return True if the value looks like a language-qualified string. """ - # Clear the cached components lf the lanjguage qualified string: - self.string = None - self.lang = None - self.suffix = None - if self.data_type is None: if not self.value.startswith("'"): + # Clear the cached components of the language qualified string: + self.contents = None + self.lang = None + self.suffix = None return False # We are certain that this is a language qualified string, although we haven't checked validity. self.data_type = KgtkFormat.DataType.LANGUAGE_QUALIFIED_STRING else: if self.data_type != KgtkFormat.DataType.LANGUAGE_QUALIFIED_STRING: + # Clear the cached components of the language qualified string: + self.contents = None + self.lang = None + self.suffix = None return False if not validate: @@ -399,6 +493,11 @@ def is_language_qualified_string(self, validate: bool=False)->bool: if self.valid is not None: return self.valid + # Clear the cached components of the language qualified string: + self.contents = None + self.lang = None + self.suffix = None + # Validate the language qualified string. # print("checking %s" % self.value) m: typing.Optional[typing.Match] @@ -410,8 +509,8 @@ def is_language_qualified_string(self, validate: bool=False)->bool: # print("match failed for %s" % self.value) return False - # Extract the string, lang, and optional suffix components: - self.string = m.group("string") + # Extract the contents, lang, and optional suffix components: + self.contents = m.group("contents") self.lang = m.group("lang") self.suffix = m.group("suffix") @@ -441,11 +540,19 @@ def is_location_coordinates(self, validate: bool=False)->bool: """ if self.data_type is None: if not self.value.startswith("@"): + self.latstr = None + self.lat = None + self.lonstr = None + self.lon = None return False # We are certain that this is location coordinates, although we haven't checked validity. 
self.data_type = KgtkFormat.DataType.LOCATION_COORDINATES else: if self.data_type != KgtkFormat.DataType.LOCATION_COORDINATES: + self.latstr = None + self.lat = None + self.lonstr = None + self.lon = None return False if not validate: @@ -453,25 +560,34 @@ def is_location_coordinates(self, validate: bool=False)->bool: if self.valid is not None: return self.valid + # Clear the lat/lon components: + self.latstr = None + self.lat = None + self.lonstr = None + self.lon = None + # Validate the location coordinates: m: typing.Optional[typing.Match] = KgtkValue.location_coordinates_re.match(self.value) if m is None: return False - # Latitude normally runs from -90 to +90: latstr: str = m.group("lat") + self.latstr = latstr + lonstr: str = m.group("lon") + self.lonstr = lonstr + + # Latitude normally runs from -90 to +90: try: - lat: float = float(latstr) - if lat < self.options.minimum_valid_lat or lat > self.options.maximum_valid_lat: + self.lat = float(latstr) + if self.lat < self.options.minimum_valid_lat or self.lat > self.options.maximum_valid_lat: return False except ValueError: return False # Longitude normally runs from -180 to +180: - lonstr: str = m.group("lon") try: - lon: float = float(lonstr) - if lon < self.options.minimum_valid_lon or lon > self.options.maximum_valid_lon: + self.lon = float(lonstr) + if self.lon < self.options.minimum_valid_lon or self.lon > self.options.maximum_valid_lon: return False except ValueError: return False @@ -500,7 +616,7 @@ def is_location_coordinates(self, validate: bool=False)->bool: # hour-minutes-seconds hour_pat: str = r'(?P2[0-3]|[01][0-9])' minutes_pat: str = r'(?P[0-5][0-9])' - seconds_pat: str = r'(?P[0-5][0-9])' + seconds_pat: str = r'(?P[0-5][0-9])' # NOTE: It might be the case that the ":" before the minutes in the time zone pattern # should be conditioned upon the hyphen indicator. The Wikipedia article doesn't @@ -573,11 +689,25 @@ def is_date_and_times(self, validate: bool=False)->bool: """ if self.data_type is None: if not self.value.startswith("^"): + # Clear the cached date and times components: + self.yearstr = None + self.monthstr = None + self.daystr = None + self.hourstr = None + self.minutesstr = None + self.secondsstr = None return False # We are certain that this is location coordinates, although we haven't checked validity. 
self.data_type = KgtkFormat.DataType.DATE_AND_TIMES else: if self.data_type != KgtkFormat.DataType.DATE_AND_TIMES: + # Clear the cached date and times components: + self.yearstr = None + self.monthstr = None + self.daystr = None + self.hourstr = None + self.minutesstr = None + self.secondsstr = None return False if not validate: @@ -585,13 +715,33 @@ def is_date_and_times(self, validate: bool=False)->bool: if self.valid is not None: return self.valid + # Clear the cached date and times components: + self.yearstr = None + self.monthstr = None + self.daystr = None + self.hourstr = None + self.minutesstr = None + self.secondsstr = None + # Validate the date and times: m: typing.Optional[typing.Match] = KgtkValue.lax_date_and_times_re.match(self.value) if m is None: return False - # Validate the year: year_str: str = m.group("year") + self.yearstr = year_str + month_str: str = m.group("month") + self.monthstr = month_str + day_str: str = m.group("day") + self.daystr = day_str + hour_str: str = m.group("hour") + self.hourstr = hour_str + minutes_str: str = m.group("minutes") + self.minutesstr = minutes_str + seconds_str: str = m.group("seconds") + self.secondsstr = seconds_str + + # Validate the year: if year_str is None or len(year_str) == 0: return False # Years are mandatory try: @@ -603,7 +753,6 @@ def is_date_and_times(self, validate: bool=False)->bool: if year > self.options.maximum_valid_year: return False - month_str: str = m.group("month") if month_str is not None: try: month: int = int(month_str) @@ -612,7 +761,6 @@ def is_date_and_times(self, validate: bool=False)->bool: if month == 0 and not self.options.allow_month_or_day_zero: return False # month 0 was disallowed. - day_str: str = m.group("day") if day_str is not None: try: day: int = int(day_str) @@ -686,6 +834,12 @@ def classify(self)->KgtkFormat.DataType: # Shouldn't get here. raise ValueError("Unknown data type for '%s'" % self.value) + def reclassify(self)->KgtkFormat.DataType: + # Classify this KgtkValue into a KgtkDataType, ignoring any cached data_type. + self.data_type = None + self.valid = None + return self.classify() + def validate(self)->bool: # Validate this KgtkValue. @@ -722,6 +876,12 @@ def validate(self)->bool: else: raise ValueError("Unrecognized DataType.") + def revalidate(self, reclassify: bool=False)->bool: + # Revalidate this KgtkValue after clearing cached values. + if reclassify: + self.data_type = None + self.valid = None + return self.validate() def describe(self)->str: """ From b768f44ae59d4937867ea836cdfbeb2ac022b72b Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Tue, 5 May 2020 19:45:59 -0700 Subject: [PATCH 079/278] Make year/month//day available as ints. 
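This patch caches the integer forms next to the already-cached strings, so
callers no longer have to re-parse yearstr/monthstr/daystr. A minimal sketch
of the access pattern it enables, assuming the default options accept the
(hypothetical) literal below; the string fields keep their zero-padding while
the new int fields support arithmetic:

from kgtk.join.kgtkvalue import KgtkValue

kv = KgtkValue("^1960-11-05T00:00:00Z/9")
if kv.is_date_and_times(validate=True):
    print(kv.yearstr, kv.monthstr, kv.daystr)  # 1960 11 05
    print(kv.year + 1, kv.month, kv.day)       # 1961 11 5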
--- kgtk/join/kgtkvalue.py | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/kgtk/join/kgtkvalue.py b/kgtk/join/kgtkvalue.py index 574f68da0..24bf3e890 100644 --- a/kgtk/join/kgtkvalue.py +++ b/kgtk/join/kgtkvalue.py @@ -46,8 +46,11 @@ class KgtkValue(KgtkFormat): # Offer the components of a date and times: yearstr: typing.Optional[str] = None # Note: not converted to int + year: typing.Optional[int] = None monthstr: typing.Optional[str] = None # Note: not converted to int + month: typing.Optional[int] = None daystr: typing.Optional[str] = None # Note: not converted to int + day: typing.Optional[int] = None hourstr: typing.Optional[str] = None # Note: not converted to int or float minutesstr: typing.Optional[str] = None # Note: not converted to int or float secondsstr: typing.Optional[str] = None # Note: not converted to int or float @@ -696,6 +699,9 @@ def is_date_and_times(self, validate: bool=False)->bool: self.hourstr = None self.minutesstr = None self.secondsstr = None + self.year = None + self.month = None + self.day = None return False # We are certain that this is location coordinates, although we haven't checked validity. self.data_type = KgtkFormat.DataType.DATE_AND_TIMES @@ -708,6 +714,9 @@ def is_date_and_times(self, validate: bool=False)->bool: self.hourstr = None self.minutesstr = None self.secondsstr = None + self.year = None + self.month = None + self.day = None return False if not validate: @@ -722,6 +731,9 @@ def is_date_and_times(self, validate: bool=False)->bool: self.hourstr = None self.minutesstr = None self.secondsstr = None + self.year = None + self.month = None + self.day = None # Validate the date and times: m: typing.Optional[typing.Match] = KgtkValue.lax_date_and_times_re.match(self.value) @@ -745,28 +757,28 @@ def is_date_and_times(self, validate: bool=False)->bool: if year_str is None or len(year_str) == 0: return False # Years are mandatory try: - year: int = int(year_str) + self.year: int = int(year_str) except ValueError: return False - if year < self.options.minimum_valid_year: + if self.year < self.options.minimum_valid_year: return False - if year > self.options.maximum_valid_year: + if self.year > self.options.maximum_valid_year: return False if month_str is not None: try: - month: int = int(month_str) + self.month: int = int(month_str) except ValueError: return False # shouldn't happen - if month == 0 and not self.options.allow_month_or_day_zero: + if self.month == 0 and not self.options.allow_month_or_day_zero: return False # month 0 was disallowed. if day_str is not None: try: - day: int = int(day_str) + self.day: int = int(day_str) except ValueError: return False # shouldn't happen - if day == 0 and not self.options.allow_month_or_day_zero: + if self.day == 0 and not self.options.allow_month_or_day_zero: return False # day 0 was disallowed. # We are fairly certain that this is a valid date and times. From 02a5fbb35b09f0b6f8e4d51c5be57719ead9c16c Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Tue, 5 May 2020 19:51:35 -0700 Subject: [PATCH 080/278] Incicate whether hyphens/colons were present in date/times. 
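The new flag records which ISO 8601 form was parsed: basic (^19601105, no
separators) or extended (^1960-11-05). A standalone sketch of the same
hyphen-group test, using a simplified, hypothetical pattern rather than the
full lax_date_and_times_re:

import re

date_re = re.compile(r"^\^(?P<year>\d{4})(?P<hyphen>-)?(?P<month>\d{2})(?(hyphen)-)(?P<day>\d{2})$")

for v in ("^1960-11-05", "^19601105"):
    m = date_re.match(v)
    print(v, "extended" if m.group("hyphen") else "basic")
# ^1960-11-05 extended
# ^19601105 basic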
--- kgtk/join/kgtkvalue.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kgtk/join/kgtkvalue.py b/kgtk/join/kgtkvalue.py index 24bf3e890..5f5e486b2 100644 --- a/kgtk/join/kgtkvalue.py +++ b/kgtk/join/kgtkvalue.py @@ -54,6 +54,7 @@ class KgtkValue(KgtkFormat): hourstr: typing.Optional[str] = None # Note: not converted to int or float minutesstr: typing.Optional[str] = None # Note: not converted to int or float secondsstr: typing.Optional[str] = None # Note: not converted to int or float + iso8601basic: typing.Optional[bool] = None # True when hyphens/colons present. def is_valid(self)->bool: # Is this a valid whatever it is? @@ -702,6 +703,7 @@ def is_date_and_times(self, validate: bool=False)->bool: self.year = None self.month = None self.day = None + self.iso8601basic = None return False # We are certain that this is location coordinates, although we haven't checked validity. self.data_type = KgtkFormat.DataType.DATE_AND_TIMES @@ -717,6 +719,7 @@ def is_date_and_times(self, validate: bool=False)->bool: self.year = None self.month = None self.day = None + self.iso8601basic = None return False if not validate: @@ -733,6 +736,7 @@ def is_date_and_times(self, validate: bool=False)->bool: self.secondsstr = None self.year = None self.month = None + self.iso8601basic = None self.day = None # Validate the date and times: @@ -752,6 +756,7 @@ def is_date_and_times(self, validate: bool=False)->bool: self.minutesstr = minutes_str seconds_str: str = m.group("seconds") self.secondsstr = seconds_str + self.iso8601basic = m.group("hyphen") is None # Validate the year: if year_str is None or len(year_str) == 0: From ed62331204a68b20e0164201e9c13ca52de7ab27 Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Tue, 5 May 2020 19:57:27 -0700 Subject: [PATCH 081/278] DOn't duplucate year/month/day/... strings. 
--- kgtk/join/kgtkvalue.py | 30 ++++++++++++------------------ 1 file changed, 12 insertions(+), 18 deletions(-) diff --git a/kgtk/join/kgtkvalue.py b/kgtk/join/kgtkvalue.py index 5f5e486b2..0b3e62e21 100644 --- a/kgtk/join/kgtkvalue.py +++ b/kgtk/join/kgtkvalue.py @@ -744,25 +744,19 @@ def is_date_and_times(self, validate: bool=False)->bool: if m is None: return False - year_str: str = m.group("year") - self.yearstr = year_str - month_str: str = m.group("month") - self.monthstr = month_str - day_str: str = m.group("day") - self.daystr = day_str - hour_str: str = m.group("hour") - self.hourstr = hour_str - minutes_str: str = m.group("minutes") - self.minutesstr = minutes_str - seconds_str: str = m.group("seconds") - self.secondsstr = seconds_str + self.yearstr = m.group("year") + self.monthstr = m.group("month") + self.daystr = m.group("day") + self.hourstr = m.group("hour") + self.minutesstr = m.group("minutes") + self.secondsstr = m.group("seconds") self.iso8601basic = m.group("hyphen") is None # Validate the year: - if year_str is None or len(year_str) == 0: + if self.yearstr is None or len(self.yearstr) == 0: return False # Years are mandatory try: - self.year: int = int(year_str) + self.year: int = int(self.yearstr) except ValueError: return False if self.year < self.options.minimum_valid_year: @@ -770,17 +764,17 @@ def is_date_and_times(self, validate: bool=False)->bool: if self.year > self.options.maximum_valid_year: return False - if month_str is not None: + if self.monthstr is not None: try: - self.month: int = int(month_str) + self.month: int = int(self.monthstr) except ValueError: return False # shouldn't happen if self.month == 0 and not self.options.allow_month_or_day_zero: return False # month 0 was disallowed. - if day_str is not None: + if self.daystr is not None: try: - self.day: int = int(day_str) + self.day: int = int(self.daystr) except ValueError: return False # shouldn't happen if self.day == 0 and not self.options.allow_month_or_day_zero: From 1fef82bed3d929c45a3840cc808cea335f9c7fbb Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Tue, 5 May 2020 20:23:51 -0700 Subject: [PATCH 082/278] Initial date and time repair hack.: --- kgtk/join/kgtkvalue.py | 70 ++++++++++++++++++++++++++++++++--- kgtk/join/kgtkvalueoptions.py | 1 + 2 files changed, 66 insertions(+), 5 deletions(-) diff --git a/kgtk/join/kgtkvalue.py b/kgtk/join/kgtkvalue.py index 0b3e62e21..4604f6f87 100644 --- a/kgtk/join/kgtkvalue.py +++ b/kgtk/join/kgtkvalue.py @@ -54,6 +54,8 @@ class KgtkValue(KgtkFormat): hourstr: typing.Optional[str] = None # Note: not converted to int or float minutesstr: typing.Optional[str] = None # Note: not converted to int or float secondsstr: typing.Optional[str] = None # Note: not converted to int or float + zonestr: typing.Optional[str] = None + precisionstr: typing.Optional[str] = None iso8601basic: typing.Optional[bool] = None # True when hyphens/colons present. def is_valid(self)->bool: @@ -703,6 +705,8 @@ def is_date_and_times(self, validate: bool=False)->bool: self.year = None self.month = None self.day = None + self.zonestr = None + self.precisionstr = None self.iso8601basic = None return False # We are certain that this is location coordinates, although we haven't checked validity. 
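# A standalone sketch (not part of this patch) of the two extra components
# captured above, using a simplified, hypothetical stand-in for
# lax_date_and_times_re; the zone is Z or a [-+] offset, the precision is
# the digits after the "/":
import re

dt_re = re.compile(r"^\^(?P<date>[0-9-]+)(?:T(?P<time>[0-9:]+)(?P<zone>Z|[-+][0-9:]+)?)?(?:/(?P<precision>[0-9]+))?$")
m = dt_re.match("^1960-11-05T00:00:00-08:00/11")
print(m.group("zone"), m.group("precision"))  # -08:00 11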
@@ -719,6 +723,8 @@ def is_date_and_times(self, validate: bool=False)->bool: self.year = None self.month = None self.day = None + self.zonestr = None + self.precisionstr = None self.iso8601basic = None return False @@ -736,6 +742,8 @@ def is_date_and_times(self, validate: bool=False)->bool: self.secondsstr = None self.year = None self.month = None + self.zonestr = None + self.precisionstr = None self.iso8601basic = None self.day = None @@ -750,8 +758,12 @@ def is_date_and_times(self, validate: bool=False)->bool: self.hourstr = m.group("hour") self.minutesstr = m.group("minutes") self.secondsstr = m.group("seconds") + self.zonestr = m.group("zone") + self.precisionstr = m.group("precision") self.iso8601basic = m.group("hyphen") is None + fixup_needed: bool = False + # Validate the year: if self.yearstr is None or len(self.yearstr) == 0: return False # Years are mandatory @@ -769,21 +781,62 @@ def is_date_and_times(self, validate: bool=False)->bool: self.month: int = int(self.monthstr) except ValueError: return False # shouldn't happen - if self.month == 0 and not self.options.allow_month_or_day_zero: - return False # month 0 was disallowed. + if self.month == 0: + if self.options.repair_month_or_day_zero: + self.month = 1 + self.monthstr = "01" + fixup_needed = True + elif not self.options.allow_month_or_day_zero: + return False # month 0 was disallowed. if self.daystr is not None: try: self.day: int = int(self.daystr) except ValueError: return False # shouldn't happen - if self.day == 0 and not self.options.allow_month_or_day_zero: - return False # day 0 was disallowed. + if self.day == 0: + if self.options.repair_month_or_day_zero: + self.day = 1 + self.daystr = "01" + fixup_needed = True + if not self.options.allow_month_or_day_zero: + return False # day 0 was disallowed. + + if fixup_needed: + self.update_date_and_times() # We are fairly certain that this is a valid date and times. self.valid = True return True + def update_date_and_times(self): + v: str = "^" + self.yearstr + if self.monthstr is not None: + if not self.iso8601basic: + v += "-" + v += self.monthstr + if self.daystr is not None: + if not self.iso8601basic: + v += "-" + v += self.daystr + if self.hourstr is not None: + v += "T" + v += self.hourstr + if self.minutesstr is not None: + if not self.iso8601basic: + v += ":" + v += self.minutesstr + if self.secondsstr is not None: + if not self.iso8601basic: + v += ":" + v += self.secondssr + if self.zonestr is not None: + v += self.zonestr + if self.precisionstr is not None: + v += "/" + v += self.precisionstr + self.value = v + def is_extension(self, validate=False)->bool: """Return True if the first character is ! @@ -949,7 +1002,14 @@ def main(): value: str for value in args.values: - print("%s: %s" % (value, KgtkValue(value, options=value_options).describe()), flush=True) + kv: KgtkValue = KgtkValue(value, options=value_options) + kv.validate() + nv: str = kv.value + if value == nv: + print("%s: %s" % (value, kv.describe()), flush=True) + else: + print("%s => %s: %s" % (value, nv, kv.describe()), flush=True) + if __name__ == "__main__": main() diff --git a/kgtk/join/kgtkvalueoptions.py b/kgtk/join/kgtkvalueoptions.py index 19252f3d9..d576f649c 100644 --- a/kgtk/join/kgtkvalueoptions.py +++ b/kgtk/join/kgtkvalueoptions.py @@ -16,6 +16,7 @@ class KgtkValueOptions: # Allow month 00 or day 00 in dates? This isn't really allowed by ISO # 8601, but appears in wikidata. 
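# A standalone sketch (not part of this patch) of the repair idea: the real
# code fixes the cached month/day fields and reassembles self.value in
# update_date_and_times(); this simplified regex rewrite only shows the
# intent:
import re

def repair_month_or_day_zero(value: str) -> str:
    # "^1960-00-00" occurs in Wikidata dumps; ISO 8601 has no month/day zero.
    return re.sub(r"(?<=-)00(?=-|T|$)", "01", value)

print(repair_month_or_day_zero("^1960-00-00"))  # ^1960-01-01
print(repair_month_or_day_zero("^1960-11-00"))  # ^1960-11-01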
allow_month_or_day_zero: bool = attr.ib(validator=attr.validators.instance_of(bool), default=False) + repair_month_or_day_zero: bool = attr.ib(validator=attr.validators.instance_of(bool), default=True) # When allow_lax_strings is true, strings will be checked to see if they # start and end with double quote ("), but we won't check if internal From f3acd680168915a9ba483c94a7dabc0c12f07810 Mon Sep 17 00:00:00 2001 From: Rongpeng Date: Wed, 6 May 2020 08:57:15 -0700 Subject: [PATCH 083/278] remove line-by-line options --- kgtk/cli/generate_wikidata_triples.py | 34 +----- kgtk/triple_generator.py | 147 ++++++++++++++------------ 2 files changed, 85 insertions(+), 96 deletions(-) diff --git a/kgtk/cli/generate_wikidata_triples.py b/kgtk/cli/generate_wikidata_triples.py index c87495434..d65aa9037 100644 --- a/kgtk/cli/generate_wikidata_triples.py +++ b/kgtk/cli/generate_wikidata_triples.py @@ -97,14 +97,6 @@ def add_arguments(parser): help="if set to yes, read from compressed gz file", dest="use_gz", ) - parser.add_argument( - "-lbl", - "--line-by-line", - action="store", - type=str2bool, - help="if set to yes, read from standard input line by line, otherwise loads whole file into memory", - dest="line_by_line", - ) def run( @@ -116,7 +108,6 @@ def run( truthy: bool, ignore: bool, use_gz: bool, - line_by_line: bool, ): # import modules locally import gzip @@ -136,27 +127,12 @@ def run( fp = gzip.open(sys.stdin.buffer, 'rt') else: fp = sys.stdin - if line_by_line: - print("#line-by-line") - num_line = 1 - while True: - edge = fp.readline() - if not edge: - break - if edge.startswith("#") or num_line == 1: # TODO First line omit - num_line += 1 - continue - else: - generator.entry_point(num_line, edge) - num_line += 1 - else: # not line by line - print("#not line-by-line") - for num, edge in enumerate(fp.readlines()): - if edge.startswith("#") or num == 0: - continue - else: - generator.entry_point(num+1,edge) + for num, edge in enumerate(fp): + if edge.startswith("#") or num == 0: + continue + else: + generator.entry_point(num+1,edge) generator.finalize() # testing profiling locally with direct call diff --git a/kgtk/triple_generator.py b/kgtk/triple_generator.py index 57a32907b..67e3485cd 100644 --- a/kgtk/triple_generator.py +++ b/kgtk/triple_generator.py @@ -8,24 +8,27 @@ from etk.knowledge_graph import KGSchema from etk.wikidata import wiki_namespaces import rfc3986 -from etk.wikidata.value import ( -Precision, -Item, -StringValue, -TimeValue, -QuantityValue, -MonolingualText, -GlobeCoordinate, -ExternalIdentifier, -URLValue +from etk.wikidata.value import ( + Precision, + Item, + StringValue, + TimeValue, + QuantityValue, + MonolingualText, + GlobeCoordinate, + ExternalIdentifier, + URLValue ) BAD_CHARS = [":", "-", "&", ",", " ", "(", ")", "\'", '\"', "/", "\\", "[", "]", ";"] + + class TripleGenerator: """ A class to maintain the status of the generator """ + def __init__( self, prop_file: str, @@ -35,7 +38,7 @@ def __init__( ignore: bool, n: int, dest_fp: TextIO = sys.stdout, - truthy:bool =False + truthy: bool = False ): from etk.wikidata.statement import Rank self.ignore = ignore @@ -48,19 +51,20 @@ def __init__( self.read_num_of_lines = 0 # ignore-logging, if not ignore, log them and move on. 
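# A standalone sketch (not part of this patch) of the streaming loop that
# replaced the line-by-line / readlines() split in the cli module above:
# iterating the file object directly reads one line at a time and never
# loads the whole (possibly gzipped) input into memory.
import sys

def stream_edges(fp):
    for num, edge in enumerate(fp):
        if num == 0 or edge.startswith("#"):
            continue  # skip the header row and comment lines
        yield num + 1, edge  # 1-based line numbers, as entry_point expects

# usage: for line_num, edge in stream_edges(sys.stdin): generator.entry_point(line_num, edge)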
if not self.ignore: - self.ignore_file = open("ignored.log","w") + self.ignore_file = open("ignored.log", "w") # corrupted statement id self.corrupted_statement_id = None # truthy - self.truthy = truthy + self.truthy = truthy self.reset_etk_doc() self.serialize_prefix() - self.yyyy_mm_dd_pattern = re.compile("[12]\d{3}-(0[1-9]|1[0-2])-(0[1-9]|[12]\d|3[01])") + self.yyyy_mm_dd_pattern = re.compile( + "[12]\d{3}-(0[1-9]|1[0-2])-(0[1-9]|[12]\d|3[01])") self.yyyy_pattern = re.compile("[12]\d{3}") - self.quantity_pattern = re.compile("([\+|\-]?[0-9]+\.?[0-9]*)(?:\[([\+|\-]?[0-9]+\.?[0-9]*),([\+|\-]?[0-9]+\.?[0-9]*)\])?([U|Q](?:[0-9]+))?") + self.quantity_pattern = re.compile( + "([\+|\-]?[0-9]+\.?[0-9]*)(?:\[([\+|\-]?[0-9]+\.?[0-9]*),([\+|\-]?[0-9]+\.?[0-9]*)\])?([U|Q](?:[0-9]+))?") - - def _node_2_entity(self, node:str): + def _node_2_entity(self, node: str): ''' A node can be Qxxx or Pxxx, return the proper entity. ''' @@ -70,7 +74,6 @@ def _node_2_entity(self, node:str): entity = WDItem(TripleGenerator.replace_illegal_string(node)) return entity - def set_properties(self, prop_file: str): datatype_mapping = { "item": Item, @@ -79,8 +82,8 @@ def set_properties(self, prop_file: str): "quantity": QuantityValue, "monolingualtext": MonolingualText, "string": StringValue, - "external-identifier":ExternalIdentifier, - "url":URLValue + "external-identifier": ExternalIdentifier, + "url": URLValue } with open(prop_file, "r") as fp: props = fp.readlines() @@ -90,7 +93,7 @@ def set_properties(self, prop_file: str): try: prop_types[node1] = datatype_mapping[node2.strip()] except: - if not self.ignore: + if not self.ignore: raise KGTKException( "DataType {} of node {} is not supported.\n".format( node2, node1 @@ -114,14 +117,15 @@ def reset_etk_doc(self, doc_id: str = "http://isi.edu/default-ns/projects"): self.etk = ETK(kg_schema=kg_schema, modules=ETKModule) self.doc = self.etk.create_document({}, doc_id=doc_id) for k, v in wiki_namespaces.items(): - self.doc.kg.bind(k, v) - + self.doc.kg.bind(k, v) + def serialize(self): """ Seriealize the triples. Used a hack to avoid serializing the prefix again. """ docs = self.etk.process_ems(self.doc) - self.fp.write("\n\n".join(docs[0].kg.serialize("ttl").split("\n\n")[1:])) + self.fp.write("\n\n".join( + docs[0].kg.serialize("ttl").split("\n\n")[1:])) self.fp.flush() self.reset() @@ -132,7 +136,7 @@ def serialize_prefix(self): Relevent issue: https://github.com/RDFLib/rdflib/issues/965 """ for k, v in wiki_namespaces.items(): - line = "@prefix " + k + ": <" + v + "> .\n" + line = "@prefix " + k + ": <" + v + "> .\n" self.fp.write(line) self.fp.write("\n") self.fp.flush() @@ -148,18 +152,18 @@ def finalize(self): self.serialize() @staticmethod - def process_text_string(string:str)->[str,str]: + def process_text_string(string: str) -> [str, str]: ''' Language detection is removed from triple generation. 
The user is responsible for detect the language ''' - if len(string)==0: - return ["","en"] + if len(string) == 0: + return ["", "en"] if "@" in string: res = string.split("@") text_string = "@".join(res[:-1]).replace('"', "").replace("'", "") - lang = res[-1].replace('"','').replace("'","") + lang = res[-1].replace('"', '').replace("'", "") if len(lang) > 2: - lang ="en" + lang = "en" else: text_string = string.replace('"', "").replace("'", "") lang = "en" @@ -192,7 +196,7 @@ def generate_prop_declaration_triple(self, node1: str, label: str, node2: str) - return True def generate_normal_triple( - self, node1: str, label: str, node2: str, is_qualifier_edge: bool) -> bool: + self, node1: str, label: str, node2: str, is_qualifier_edge: bool) -> bool: entity = self._node_2_entity(node1) # determine the edge type edge_type = self.prop_types[label] @@ -206,7 +210,7 @@ def generate_normal_triple( try: dateTimeString = node2 object = TimeValue( - value=dateTimeString, #TODO + value=dateTimeString, # TODO calendar=Item("Q1985727"), precision=Precision.year, time_zone=0, @@ -214,10 +218,10 @@ def generate_normal_triple( except: return False elif self.yyyy_pattern.match(node2): - try: + try: dateTimeString = node2 + "-01-01" object = TimeValue( - value=dateTimeString, #TODO + value=dateTimeString, # TODO calendar=Item("Q1985727"), precision=Precision.year, time_zone=0, @@ -227,23 +231,24 @@ def generate_normal_triple( else: try: dateTimeString, precision = node2[1:].split("/") - dateTimeString = dateTimeString[:-1] # remove "Z" + dateTimeString = dateTimeString[:-1] # remove "Z" # 2016-00-00T00:00:00 case if "-00-00" in dateTimeString: - dateTimeString = "-01-01".join(dateTimeString.split("-00-00")) + dateTimeString = "-01-01".join( + dateTimeString.split("-00-00")) elif dateTimeString[8:10] == "00": - dateTimeString = dateTimeString[:8]+"01" + dateTimeString[10:] + dateTimeString = dateTimeString[:8] + \ + "01" + dateTimeString[10:] object = TimeValue( value=dateTimeString, calendar=Item("Q1985727"), precision=precision, time_zone=0, ) - except: + except: return False - #TODO other than that, not supported. Creation of normal triple fails - + # TODO other than that, not supported. Creation of normal triple fails elif edge_type == GlobeCoordinate: latitude, longitude = node2[1:].split("/") @@ -264,14 +269,16 @@ def generate_normal_triple( upper_bound = TripleGenerator.clean_number_string(upper_bound) if unit != None: if upper_bound != None and lower_bound != None: - object = QuantityValue(amount, unit=Item(unit),upper_bound=upper_bound,lower_bound=lower_bound) + object = QuantityValue(amount, unit=Item( + unit), upper_bound=upper_bound, lower_bound=lower_bound) else: object = QuantityValue(amount, unit=Item(unit)) else: if upper_bound != None and lower_bound != None: - object = QuantityValue(amount, upper_bound=upper_bound,lower_bound=lower_bound) + object = QuantityValue( + amount, upper_bound=upper_bound, lower_bound=lower_bound) else: - object = QuantityValue(amount) + object = QuantityValue(amount) elif edge_type == MonolingualText: text_string, lang = TripleGenerator.process_text_string(node2) object = MonolingualText(text_string, lang) @@ -291,7 +298,8 @@ def generate_normal_triple( if type(object) == WDItem: self.doc.kg.add_subject(object) self.to_append_statement.add_qualifier(label, object) - self.doc.kg.add_subject(self.to_append_statement) #TODO maybe can be positioned better for the edge cases. + # TODO maybe can be positioned better for the edge cases. 
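# A standalone sketch (not part of this patch) of quantity_pattern from
# __init__ above, applied to a hypothetical amount with a tolerance interval
# and a Wikidata unit node:
import re

quantity_pattern = re.compile(
    r"([\+|\-]?[0-9]+\.?[0-9]*)(?:\[([\+|\-]?[0-9]+\.?[0-9]*),([\+|\-]?[0-9]+\.?[0-9]*)\])?([U|Q](?:[0-9]+))?")

amount, lower, upper, unit = quantity_pattern.match("12.5[12.0,13.0]Q11573").groups()
print(amount, lower, upper, unit)  # 12.5 12.0 13.0 Q11573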
+ self.doc.kg.add_subject(self.to_append_statement) else: # edge: q1 p8 q2 e8 @@ -299,12 +307,13 @@ def generate_normal_triple( if type(object) == WDItem: self.doc.kg.add_subject(object) if self.truthy: - self.to_append_statement = entity.add_truthy_statement(label, object) + self.to_append_statement = entity.add_truthy_statement( + label, object) else: - self.to_append_statement = entity.add_statement(label, object) + self.to_append_statement = entity.add_statement(label, object) self.doc.kg.add_subject(entity) return True - + @staticmethod def is_invalid_decimal_string(num_string): ''' @@ -315,18 +324,19 @@ def is_invalid_decimal_string(num_string): else: if abs(float(num_string)) < 0.0001 and float(num_string) != 0: return True - return False + return False @staticmethod - def is_valid_uri_with_scheme_and_host(uri:str): + def is_valid_uri_with_scheme_and_host(uri: str): ''' https://github.com/python-hyper/rfc3986/issues/30#issuecomment-461661883 ''' try: uri = rfc3986.URIReference.from_string(uri) - rfc3986.validators.Validator().require_presence_of("scheme", "host").check_validity_of("scheme", "host").validate(uri) + rfc3986.validators.Validator().require_presence_of( + "scheme", "host").check_validity_of("scheme", "host").validate(uri) return True - except : + except: return False @staticmethod @@ -335,9 +345,9 @@ def clean_number_string(num): if num == None: return None else: - return format_float_positional(float(num),trim="-") + return format_float_positional(float(num), trim="-") - def entry_point(self, line_number:int , edge: str): + def entry_point(self, line_number: int, edge: str): """ generates a list of two, the first element is the determination of the edge type using corresponding edge type the second element is a bool indicating whether this is a valid property edge or qualifier edge. 
@@ -345,12 +355,12 @@ def entry_point(self, line_number:int , edge: str): """ edge_list = edge.strip().split("\t") l = len(edge_list) - if l!=4: + if l != 4: return [node1, label, node2, e_id] = edge_list - node1, label, node2, e_id = node1.strip(),label.strip(),node2.strip(),e_id.strip() - if line_number == 0: #TODO ignore header mode + node1, label, node2, e_id = node1.strip(), label.strip(), node2.strip(), e_id.strip() + if line_number == 0: # TODO ignore header mode # by default a statement edge is_qualifier_edge = False # print("#Debug Info: ",line_number, self.to_append_statement_id, e_id, is_qualifier_edge,self.to_append_statement) @@ -363,10 +373,10 @@ def entry_point(self, line_number:int , edge: str): self.serialize() is_qualifier_edge = False # print("#Debug Info: ",line_number, self.to_append_statement_id, node1, is_qualifier_edge,self.to_append_statement) - self.to_append_statement_id= e_id + self.to_append_statement_id = e_id self.corrupted_statement_id = None else: - # qualifier edge or property declaration edge + # qualifier edge or property declaration edge is_qualifier_edge = True if self.corrupted_statement_id == e_id: # Met a qualifier which associates with a corrupted statement @@ -383,36 +393,39 @@ def entry_point(self, line_number:int , edge: str): if label in self.label_set: success = self.generate_label_triple(node1, label, node2) elif label in self.description_set: - success= self.generate_description_triple(node1, label, node2) + success = self.generate_description_triple(node1, label, node2) elif label in self.alias_set: success = self.generate_alias_triple(node1, label, node2) elif label == "type": # special edge of prop declaration - success = self.generate_prop_declaration_triple(node1, label, node2) + success = self.generate_prop_declaration_triple( + node1, label, node2) else: if label in self.prop_types: - success= self.generate_normal_triple(node1, label, node2, is_qualifier_edge) + success = self.generate_normal_triple( + node1, label, node2, is_qualifier_edge) else: if not self.ignore: raise KGTKException( - "property {}'s type is unknown at line {}.\n".format(label, line_number) + "property {}'s type is unknown at line {}.\n".format( + label, line_number) ) success = False if (not success) and (not is_qualifier_edge) and (not self.ignore): # We have a corrupted edge here. - self.ignore_file.write("Corrupted statement at line number: {} with id {} with current corrupted id {}\n".format(line_number, e_id, self.corrupted_statement_id)) + self.ignore_file.write("Corrupted statement at line number: {} with id {} with current corrupted id {}\n".format( + line_number, e_id, self.corrupted_statement_id)) self.ignore_file.flush() self.corrupted_statement_id = e_id else: self.read_num_of_lines += 1 self.corrupted_statement_id = None - @staticmethod - def replace_illegal_string(s:str)->str: + def replace_illegal_string(s: str) -> str: ''' this function serves as the last gate of keeping illegal characters outside of entity creation. 
''' for char in BAD_CHARS: - s = s.replace(char,"_") - return s \ No newline at end of file + s = s.replace(char, "_") + return s From 31622c819175faddf9f94f766802682b171a26dc Mon Sep 17 00:00:00 2001 From: Rongpeng Date: Wed, 6 May 2020 09:02:19 -0700 Subject: [PATCH 084/278] added several default values for commandline arguments --- kgtk/cli/generate_wikidata_triples.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/kgtk/cli/generate_wikidata_triples.py b/kgtk/cli/generate_wikidata_triples.py index d65aa9037..475885d31 100644 --- a/kgtk/cli/generate_wikidata_triples.py +++ b/kgtk/cli/generate_wikidata_triples.py @@ -38,6 +38,7 @@ def add_arguments(parser): "--label-property", action="store", type=str, + const="label", help="property identifiers which will create labels, separated by comma','.", dest="labels", ) @@ -46,6 +47,7 @@ def add_arguments(parser): "--alias-property", action="store", type=str, + const="aliases", help="alias identifiers which will create labels, separated by comma','.", dest="aliases", ) @@ -54,6 +56,7 @@ def add_arguments(parser): "--description-property", action="store", type=str, + const="descriptions", help="description identifiers which will create labels, separated by comma','.", dest="descriptions", ) @@ -70,6 +73,7 @@ def add_arguments(parser): "--output-n-lines", action="store", type=int, + const=1000, help="output triples approximately every {n} lines of reading stdin.", dest="n", ) @@ -78,6 +82,7 @@ def add_arguments(parser): "--generate-truthy", action="store", type=str2bool, + const="yes", help="the default is to not generate truthy triples. Specify this option to generate truthy triples. NOTIMPLEMENTED", dest="truthy", ) @@ -86,6 +91,7 @@ def add_arguments(parser): "--ignore", action="store", type=str2bool, + const="no", help="if set to yes, ignore various kinds of exceptions and mistakes and log them to a log file with line number in input file, rather than stopping. 
logging", dest="ignore", ) @@ -94,6 +100,7 @@ def add_arguments(parser): "--use-gz", action="store", type=str2bool, + const="no", help="if set to yes, read from compressed gz file", dest="use_gz", ) From d0c14b14280342035a228d86ea013b0b207ee738 Mon Sep 17 00:00:00 2001 From: Rongpeng Date: Wed, 6 May 2020 09:15:35 -0700 Subject: [PATCH 085/278] fix the option issue, now only needs property file to run --- kgtk/cli/generate_wikidata_triples.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/kgtk/cli/generate_wikidata_triples.py b/kgtk/cli/generate_wikidata_triples.py index 475885d31..a9e861117 100644 --- a/kgtk/cli/generate_wikidata_triples.py +++ b/kgtk/cli/generate_wikidata_triples.py @@ -38,7 +38,8 @@ def add_arguments(parser): "--label-property", action="store", type=str, - const="label", + default="label", + required=False, help="property identifiers which will create labels, separated by comma','.", dest="labels", ) @@ -47,7 +48,8 @@ def add_arguments(parser): "--alias-property", action="store", type=str, - const="aliases", + required = False, + default="aliases", help="alias identifiers which will create labels, separated by comma','.", dest="aliases", ) @@ -56,7 +58,8 @@ def add_arguments(parser): "--description-property", action="store", type=str, - const="descriptions", + required = False, + default="descriptions", help="description identifiers which will create labels, separated by comma','.", dest="descriptions", ) @@ -65,6 +68,7 @@ def add_arguments(parser): "--property-types", action="store", type=str, + required = True, help="path to the file which contains the property datatype mapping in kgtk format.", dest="prop_file", ) @@ -73,7 +77,8 @@ def add_arguments(parser): "--output-n-lines", action="store", type=int, - const=1000, + required = False, + default=1000, help="output triples approximately every {n} lines of reading stdin.", dest="n", ) @@ -82,7 +87,8 @@ def add_arguments(parser): "--generate-truthy", action="store", type=str2bool, - const="yes", + required = False, + default="yes", help="the default is to not generate truthy triples. Specify this option to generate truthy triples. NOTIMPLEMENTED", dest="truthy", ) @@ -91,7 +97,8 @@ def add_arguments(parser): "--ignore", action="store", type=str2bool, - const="no", + required = False, + default="no", help="if set to yes, ignore various kinds of exceptions and mistakes and log them to a log file with line number in input file, rather than stopping. 
logging", dest="ignore", ) @@ -100,7 +107,8 @@ def add_arguments(parser): "--use-gz", action="store", type=str2bool, - const="no", + required = False, + default="no", help="if set to yes, read from compressed gz file", dest="use_gz", ) From fb32d6963abe3be861483e8730bfc23032f999d3 Mon Sep 17 00:00:00 2001 From: Rongpeng Date: Wed, 6 May 2020 09:40:45 -0700 Subject: [PATCH 086/278] support arbitrary order of required columns --- kgtk/cli/generate_wikidata_triples.py | 6 +-- kgtk/triple_generator.py | 54 ++++++++++++++++++--------- 2 files changed, 40 insertions(+), 20 deletions(-) diff --git a/kgtk/cli/generate_wikidata_triples.py b/kgtk/cli/generate_wikidata_triples.py index a9e861117..f1cc8dfdc 100644 --- a/kgtk/cli/generate_wikidata_triples.py +++ b/kgtk/cli/generate_wikidata_triples.py @@ -143,11 +143,11 @@ def run( else: fp = sys.stdin # not line by line - for num, edge in enumerate(fp): - if edge.startswith("#") or num == 0: + for line_num, edge in enumerate(fp): + if edge.startswith("#"): continue else: - generator.entry_point(num+1,edge) + generator.entry_point(line_num+1,edge) generator.finalize() # testing profiling locally with direct call diff --git a/kgtk/triple_generator.py b/kgtk/triple_generator.py index 67e3485cd..766154f7f 100644 --- a/kgtk/triple_generator.py +++ b/kgtk/triple_generator.py @@ -63,6 +63,8 @@ def __init__( self.yyyy_pattern = re.compile("[12]\d{3}") self.quantity_pattern = re.compile( "([\+|\-]?[0-9]+\.?[0-9]*)(?:\[([\+|\-]?[0-9]+\.?[0-9]*),([\+|\-]?[0-9]+\.?[0-9]*)\])?([U|Q](?:[0-9]+))?") + # order map, know the column index of ["node1","property","node2",id] + self.order_map = {} def _node_2_entity(self, node: str): ''' @@ -353,14 +355,32 @@ def entry_point(self, line_number: int, edge: str): the second element is a bool indicating whether this is a valid property edge or qualifier edge. Call corresponding downstream functions """ + edge_list = edge.strip().split("\t") l = len(edge_list) - if l != 4: - return + if line_number == 1: + # initialize the order_map + edge_list = edge.strip().split("\t") + node1_index = edge_list.index("node1") + node2_index = edge_list.index("node2") + prop_index = edge_list.index("property") + id_index = edge_list.index("id") + if not all([node1_index>-1,node2_index>-1,prop_index>-1,id_index>-1]): + raise KGTKException("Header of kgtk file misses at least one of required column names: (node1, node2, property and id)") + else: + self.order_map["node1"] = node1_index + self.order_map["node2"] = node2_index + self.order_map["prop"] = prop_index + self.order_map["id"] = id_index + return - [node1, label, node2, e_id] = edge_list - node1, label, node2, e_id = node1.strip(), label.strip(), node2.strip(), e_id.strip() - if line_number == 0: # TODO ignore header mode + # use the order_map to map the node + + node1 = edge_list[self.order_map["node1"]].strip() + node2 = edge_list[self.order_map["node2"]].strip() + prop = edge_list[self.order_map["prop"]].strip() + e_id = edge_list[self.order_map["id"]].strip() + if line_number == 2: # by default a statement edge is_qualifier_edge = False # print("#Debug Info: ",line_number, self.to_append_statement_id, e_id, is_qualifier_edge,self.to_append_statement) @@ -381,7 +401,7 @@ def entry_point(self, line_number: int, edge: str): if self.corrupted_statement_id == e_id: # Met a qualifier which associates with a corrupted statement return - if label != "type" and node1 != self.to_append_statement_id: + if prop != "type" and node1 != self.to_append_statement_id: # 1. 
not a property declaration edge and # 2. the current qualifier's node1 is not the latest property edge id, throw errors. if not self.ignore: @@ -390,25 +410,25 @@ def entry_point(self, line_number: int, edge: str): node1, line_number, self.to_append_statement_id ) ) - if label in self.label_set: - success = self.generate_label_triple(node1, label, node2) - elif label in self.description_set: - success = self.generate_description_triple(node1, label, node2) - elif label in self.alias_set: - success = self.generate_alias_triple(node1, label, node2) - elif label == "type": + if prop in self.label_set: + success = self.generate_label_triple(node1, prop, node2) + elif prop in self.description_set: + success = self.generate_description_triple(node1, prop, node2) + elif prop in self.alias_set: + success = self.generate_alias_triple(node1, prop, node2) + elif prop == "type": # special edge of prop declaration success = self.generate_prop_declaration_triple( - node1, label, node2) + node1, prop, node2) else: - if label in self.prop_types: + if prop in self.prop_types: success = self.generate_normal_triple( - node1, label, node2, is_qualifier_edge) + node1, prop, node2, is_qualifier_edge) else: if not self.ignore: raise KGTKException( "property {}'s type is unknown at line {}.\n".format( - label, line_number) + prop, line_number) ) success = False if (not success) and (not is_qualifier_edge) and (not self.ignore): From bf1b50c1a26c68b102df0ea96e500d13c067509b Mon Sep 17 00:00:00 2001 From: Rongpeng Date: Wed, 6 May 2020 10:09:51 -0700 Subject: [PATCH 087/278] support using edge id as statement id after removing illegal characters --- kgtk/cli/generate_wikidata_triples.py | 14 +++++++++++++- kgtk/triple_generator.py | 17 +++++++++++------ 2 files changed, 24 insertions(+), 7 deletions(-) diff --git a/kgtk/cli/generate_wikidata_triples.py b/kgtk/cli/generate_wikidata_triples.py index f1cc8dfdc..cdc8b44ad 100644 --- a/kgtk/cli/generate_wikidata_triples.py +++ b/kgtk/cli/generate_wikidata_triples.py @@ -112,6 +112,16 @@ def add_arguments(parser): help="if set to yes, read from compressed gz file", dest="use_gz", ) + parser.add_argument( + "-sid", + "--use-id", + action="store", + type=str2bool, + required = False, + default="no", + help="if set to yes, the id in the edge will be used as statement id when creating statement or truthy statement", + dest="use_id", + ) def run( @@ -123,6 +133,7 @@ def run( truthy: bool, ignore: bool, use_gz: bool, + use_id:bool ): # import modules locally import gzip @@ -135,7 +146,8 @@ def run( description_set=descriptions, n=n, ignore=ignore, - truthy=truthy + truthy=truthy, + use_id=use_id ) # process stdin if use_gz: diff --git a/kgtk/triple_generator.py b/kgtk/triple_generator.py index 766154f7f..675ceffe9 100644 --- a/kgtk/triple_generator.py +++ b/kgtk/triple_generator.py @@ -21,7 +21,7 @@ ) BAD_CHARS = [":", "-", "&", ",", " ", - "(", ")", "\'", '\"', "/", "\\", "[", "]", ";"] + "(", ")", "\'", '\"', "/", "\\", "[", "]", ";","|"] class TripleGenerator: @@ -38,7 +38,8 @@ def __init__( ignore: bool, n: int, dest_fp: TextIO = sys.stdout, - truthy: bool = False + truthy: bool = False, + use_id:bool=False, ): from etk.wikidata.statement import Rank self.ignore = ignore @@ -65,6 +66,7 @@ def __init__( "([\+|\-]?[0-9]+\.?[0-9]*)(?:\[([\+|\-]?[0-9]+\.?[0-9]*),([\+|\-]?[0-9]+\.?[0-9]*)\])?([U|Q](?:[0-9]+))?") # order map, know the column index of ["node1","property","node2",id] self.order_map = {} + self.use_id = use_id def _node_2_entity(self, node: str): ''' @@ -198,7 
+200,9 @@ def generate_prop_declaration_triple(self, node1: str, label: str, node2: str) - return True def generate_normal_triple( - self, node1: str, label: str, node2: str, is_qualifier_edge: bool) -> bool: + self, node1: str, label: str, node2: str, is_qualifier_edge: bool,e_id:str) -> bool: + if self.use_id: + e_id = TripleGenerator.replace_illegal_string(e_id) entity = self._node_2_entity(node1) # determine the edge type edge_type = self.prop_types[label] @@ -310,9 +314,10 @@ def generate_normal_triple( self.doc.kg.add_subject(object) if self.truthy: self.to_append_statement = entity.add_truthy_statement( - label, object) + label, object,statement_id=e_id) if self.use_id else entity.add_truthy_statement(label,object) else: - self.to_append_statement = entity.add_statement(label, object) + self.to_append_statement = entity.add_statement( + label, object,statement_id=e_id) if self.use_id else entity.add_statement(label, object) self.doc.kg.add_subject(entity) return True @@ -423,7 +428,7 @@ def entry_point(self, line_number: int, edge: str): else: if prop in self.prop_types: success = self.generate_normal_triple( - node1, prop, node2, is_qualifier_edge) + node1, prop, node2, is_qualifier_edge,e_id) else: if not self.ignore: raise KGTKException( From cdf5b13d75f530b729fa0a4240eee58133aea77d Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Wed, 6 May 2020 11:01:43 -0700 Subject: [PATCH 088/278] Repair month or day zero. Escape list separators. --- kgtk/cli/validate.py | 6 +++++- kgtk/join/kgtkreader.py | 2 +- kgtk/join/kgtkvalue.py | 17 ++++++++++++----- kgtk/join/kgtkvalueoptions.py | 28 +++++++++++++++++++++++----- 4 files changed, 41 insertions(+), 12 deletions(-) diff --git a/kgtk/cli/validate.py b/kgtk/cli/validate.py index 074f56aed..ce2fce9e2 100644 --- a/kgtk/cli/validate.py +++ b/kgtk/cli/validate.py @@ -153,6 +153,8 @@ def run(kgtk_files: typing.Optional[typing.List[typing.Optional[Path]]], allow_lax_strings: bool = False, allow_lax_lq_strings: bool = False, allow_month_or_day_zero: bool = False, + repair_month_or_day_zero: bool = False, + escape_list_separators: bool = False, minimum_valid_year: int = KgtkValueOptions.MINIMUM_VALID_YEAR, maximum_valid_year: int = KgtkValueOptions.MAXIMUM_VALID_YEAR, compression_type: typing.Optional[str] = None, @@ -175,12 +177,14 @@ def run(kgtk_files: typing.Optional[typing.List[typing.Optional[Path]]], # Build the value parsing option structure. value_options: KgtkValueOptions = KgtkValueOptions(allow_month_or_day_zero=allow_month_or_day_zero, + repair_month_or_day_zero=repair_month_or_day_zero, allow_lax_strings=allow_lax_strings, allow_lax_lq_strings=allow_lax_lq_strings, allow_language_suffixes=allow_language_suffixes, additional_language_codes=additional_language_codes, minimum_valid_year=minimum_valid_year, - maximum_valid_year=maximum_valid_year) + maximum_valid_year=maximum_valid_year, + escape_list_separators=escape_list_separators) try: kgtk_file: typing.Optional[Path] diff --git a/kgtk/join/kgtkreader.py b/kgtk/join/kgtkreader.py index 326b905bf..9b338eb49 100644 --- a/kgtk/join/kgtkreader.py +++ b/kgtk/join/kgtkreader.py @@ -588,7 +588,7 @@ def __next__(self)-> typing.List[str]: def _ignore_invalid_values(self, values: typing.List[str], line: str)->bool: """Give a row of values, validate each value. If we find one or more - validation problems, we might want to emit erro messages and we might + validation problems, we might want to emit error messages and we might want to ignore the entire row. 
Returns True to indicate that the row should be ignored (skipped). diff --git a/kgtk/join/kgtkvalue.py b/kgtk/join/kgtkvalue.py index 4604f6f87..ebbf799e9 100644 --- a/kgtk/join/kgtkvalue.py +++ b/kgtk/join/kgtkvalue.py @@ -87,9 +87,16 @@ def get_list_items(self)->typing.List['KgtkValue']: if self.list_items is not None: return self.list_items - # Return an empty list if this is not a list. - self.list_items: typing.List['KgtkValue'] = [ ] + # Split the KGTK list. values: typing.List[str] = KgtkValue.split_list_re.split(self.value) + + # Perhaps we'd like to escape the list separators instead of splitting on them? + if self.options.escape_list_separators: + self.value = ("\\" + KgtkFormat.LIST_SEPARATOR).join(values) + return [ ] # Return an empty list. + + # Return an empty Python list if this is not a KGTK list. + self.list_items: typing.List['KgtkValue'] = [ ] if len(values) > 1: # Populate list_items with a KgtkValue for each item in the list: item_value: str @@ -742,10 +749,10 @@ def is_date_and_times(self, validate: bool=False)->bool: self.secondsstr = None self.year = None self.month = None + self.day = None self.zonestr = None self.precisionstr = None self.iso8601basic = None - self.day = None # Validate the date and times: m: typing.Optional[typing.Match] = KgtkValue.lax_date_and_times_re.match(self.value) @@ -799,7 +806,7 @@ def is_date_and_times(self, validate: bool=False)->bool: self.day = 1 self.daystr = "01" fixup_needed = True - if not self.options.allow_month_or_day_zero: + elif not self.options.allow_month_or_day_zero: return False # day 0 was disallowed. if fixup_needed: @@ -829,7 +836,7 @@ def update_date_and_times(self): if self.secondsstr is not None: if not self.iso8601basic: v += ":" - v += self.secondssr + v += self.secondsstr if self.zonestr is not None: v += self.zonestr if self.precisionstr is not None: diff --git a/kgtk/join/kgtkvalueoptions.py b/kgtk/join/kgtkvalueoptions.py index d576f649c..b061b92e3 100644 --- a/kgtk/join/kgtkvalueoptions.py +++ b/kgtk/join/kgtkvalueoptions.py @@ -16,7 +16,7 @@ class KgtkValueOptions: # Allow month 00 or day 00 in dates? This isn't really allowed by ISO # 8601, but appears in wikidata. allow_month_or_day_zero: bool = attr.ib(validator=attr.validators.instance_of(bool), default=False) - repair_month_or_day_zero: bool = attr.ib(validator=attr.validators.instance_of(bool), default=True) + repair_month_or_day_zero: bool = attr.ib(validator=attr.validators.instance_of(bool), default=False) # When allow_lax_strings is true, strings will be checked to see if they # start and end with double quote ("), but we won't check if internal @@ -38,6 +38,8 @@ class KgtkValueOptions: # iterable_validator=attr.validators.instance_of(list)))), additional_language_codes: typing.Optional[typing.List[str]] = attr.ib(default=None) + escape_list_separators: bool = attr.ib(validator=attr.validators.instance_of(bool), default=False) + # Minimum and maximum year range in dates. MINIMUM_VALID_YEAR: int = 1583 # Per ISO 8601, years before this one require special agreement. 
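# A standalone sketch (not part of this patch) contrasting the two behaviors
# controlled by escape_list_separators; the split pattern here is a
# hypothetical stand-in for KgtkValue.split_list_re:
import re

split_list_re = re.compile(r"(?<!\\)\|")       # split on unescaped "|"
value = "^1960-00-00|^1961-01-01"

print(split_list_re.split(value))              # default: a two-item KGTK list
print("\\|".join(split_list_re.split(value)))  # escaped: one literal value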
minimum_valid_year: int = attr.ib(validator=attr.validators.instance_of(int), default=MINIMUM_VALID_YEAR) @@ -81,29 +83,45 @@ def add_arguments(cls, parser: ArgumentParser): lqgroup.add_argument( "--disallow-lax-lq-strings", dest="allow_lax_lq_strings", help="Check if single quotes are backslashed inside language qualified strings.", action='store_false') - md0group= parser.add_mutually_exclusive_group() - md0group.add_argument( "--allow-month-or-day-zero", dest="allow_month_or_day_zero", + amd0group= parser.add_mutually_exclusive_group() + amd0group.add_argument( "--allow-month-or-day-zero", dest="allow_month_or_day_zero", help="Allow month or day zero in dates.", action='store_true', default=False) - md0group.add_argument( "--disallow-month-or-day-zero", dest="allow_month_or_day_zero", + amd0group.add_argument( "--disallow-month-or-day-zero", dest="allow_month_or_day_zero", help="Allow month or day zero in dates.", action='store_false') + rmd0group= parser.add_mutually_exclusive_group() + rmd0group.add_argument( "--repair-month-or-day-zero", dest="repair_month_or_day_zero", + help="Repair month or day zero in dates.", action='store_true', default=False) + + rmd0group.add_argument( "--no-repair-month-or-day-zero", dest="repair_month_or_day_zero", + help="Do not repair month or day zero in dates.", action='store_false') + parser.add_argument( "--minimum-valid-year", dest="minimum_valid_year", help="The minimum valid year in dates.", type=int, default=cls.MINIMUM_VALID_YEAR) parser.add_argument( "--maximum-valid-year", dest="maximum_valid_year", help="The maximum valid year in dates.", type=int, default=cls.MAXIMUM_VALID_YEAR) + elsgroup= parser.add_mutually_exclusive_group() + elsgroup.add_argument( "--escape-list-separators", dest="escape_list_separators", + help="Escape all list separators instead of splitting on them.", action='store_true', default=False) + + elsgroup.add_argument( "--no-escape-list-separators", dest="escape_list_separators", + help="Do not escape list separators.", action='store_false') + @classmethod # Build the value parsing option structure. def from_args(cls, args: Namespace)->'KgtkValueOptions': return cls(allow_month_or_day_zero=args.allow_month_or_day_zero, + repair_month_or_day_zero=args.repair_month_or_day_zero, allow_language_suffixes=args.allow_language_suffixes, allow_lax_strings=args.allow_lax_strings, allow_lax_lq_strings=args.allow_lax_lq_strings, additional_language_codes=args.additional_language_codes, minimum_valid_year=args.minimum_valid_year, - maximum_valid_year=args.maximum_valid_year) + maximum_valid_year=args.maximum_valid_year, + escape_list_separators=args.escape_list_separators) DEFAULT_KGTK_VALUE_OPTIONS: KgtkValueOptions = KgtkValueOptions() From fe70fd94a1196876727e1e74eca3303c29ccffb4 Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Wed, 6 May 2020 11:20:32 -0700 Subject: [PATCH 089/278] Rebuild a list when repairing a child of the list. 
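Context for this patch: when a date inside a KGTK list is repaired, the list
value itself must be regenerated from its (now updated) children, or the
parent would keep serializing the stale text. A minimal standalone model of
that parent/child propagation (hypothetical classes, not the KgtkValue API):

from typing import List, Optional

class Node:
    def __init__(self, value: str, parent: Optional["ListNode"] = None):
        self.value = value
        self.parent = parent

    def repair(self, new_value: str):
        self.value = new_value
        if self.parent is not None:
            self.parent.rebuild_list()  # propagate the repair upward

class ListNode:
    def __init__(self, value: str):
        self.value = value
        self.items: List[Node] = [Node(v, parent=self) for v in value.split("|")]

    def rebuild_list(self):
        self.value = "|".join(item.value for item in self.items)

lst = ListNode("^1960-00-00|^1961-01-01")
lst.items[0].repair("^1960-01-01")
print(lst.value)  # ^1960-01-01|^1961-01-01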
--- kgtk/join/kgtkvalue.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/kgtk/join/kgtkvalue.py b/kgtk/join/kgtkvalue.py index ebbf799e9..8ae0ff089 100644 --- a/kgtk/join/kgtkvalue.py +++ b/kgtk/join/kgtkvalue.py @@ -16,6 +16,7 @@ class KgtkValue(KgtkFormat): value: str = attr.ib(validator=attr.validators.instance_of(str)) options: KgtkValueOptions = attr.ib(validator=attr.validators.instance_of(KgtkValueOptions), default=DEFAULT_KGTK_VALUE_OPTIONS) + parent: typing.Optional['KgtkValue'] = attr.ib(default=None) # Cache some properties of the value that would be expensive to # continuously recompute. The class is not frozen because we have these @@ -101,7 +102,7 @@ def get_list_items(self)->typing.List['KgtkValue']: # Populate list_items with a KgtkValue for each item in the list: item_value: str for item_value in values: - self.list_items.append(KgtkValue(item_value, options=self.options)) + self.list_items.append(KgtkValue(item_value, options=self.options, parent=self)) return self.list_items def is_list(self, validate: bool = False)->bool: @@ -132,6 +133,18 @@ def is_list(self, validate: bool = False)->bool: self.valid = True return True + def rebuild_list(self): + # Called to repair a list when we've repaired a list item. + if self.list_items is None or len(self.list_items) == 0: + return + + values: typing.List[str] = [] + item: KgtkValue + for item in self.list_items: + values.append(item.value) + self.value = KgtkFormat.LIST_SEPARATOR.join(values) + + def _is_number_or_quantity(self)->bool: return self.value.startswith(("0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "+", "-", ".")) @@ -810,7 +823,11 @@ def is_date_and_times(self, validate: bool=False)->bool: return False # day 0 was disallowed. if fixup_needed: + # Rapair a month or day zero problem. If this value is the child + #of a list, repair the list parent value, too. self.update_date_and_times() + if self.parent is not None: + self.parent.rebuild_list() # We are fairly certain that this is a valid date and times. self.valid = True From 111c8b160dc0263302085ef35c10e1b36f6a1590 Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Wed, 6 May 2020 14:10:15 -0700 Subject: [PATCH 090/278] Document the iso8601 extended flag properly. --- kgtk/join/kgtkvalue.py | 40 +++++++++++++++++++++++----------------- 1 file changed, 23 insertions(+), 17 deletions(-) diff --git a/kgtk/join/kgtkvalue.py b/kgtk/join/kgtkvalue.py index 8ae0ff089..b89615ee1 100644 --- a/kgtk/join/kgtkvalue.py +++ b/kgtk/join/kgtkvalue.py @@ -16,36 +16,39 @@ class KgtkValue(KgtkFormat): value: str = attr.ib(validator=attr.validators.instance_of(str)) options: KgtkValueOptions = attr.ib(validator=attr.validators.instance_of(KgtkValueOptions), default=DEFAULT_KGTK_VALUE_OPTIONS) + + # TODO: proper validation. parent: typing.Optional['KgtkValue'] = attr.ib(default=None) # Cache some properties of the value that would be expensive to - # continuously recompute. The class is not frozen because we have these - # cache members. + # continuously recompute. data_type: typing.Optional[KgtkFormat.DataType] = None valid: typing.Optional[bool] = None # If this is a list, cache a KgtkValue object for each item of the list. + # + # Note: Please do not access this list directly. Use get_list_items(). list_items: typing.Optional[typing.List['KgtkValue']] = None - # Offer the components of a string or language-qualified string: + # Offer the components of a string or language-qualified string, after validating the item. 
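# A standalone sketch (not part of this patch) of the access contract the
# comments above document: validate first, then read the cached fields; use
# revalidate(reclassify=True) after mutating the value, since a plain
# validate() would return the cached result (the literals are hypothetical):
from kgtk.join.kgtkvalue import KgtkValue

kv = KgtkValue("^1960-11-05")
if kv.validate():
    print(kv.year, kv.iso8601extended)  # 1960 True
kv.value = "^19601105"
if kv.revalidate(reclassify=True):
    print(kv.iso8601extended)           # False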
contents: typing.Optional[str] = None # String contents without the enclosing quotes lang: typing.Optional[str] = None suffix: typing.Optional[str] = None # Includes the leading dash. - # Offer the components of a number or quantity: + # Offer the components of a number or quantity, after validating the item. number: typing.Optional[str] = None # Note: not converted to int or float low_tolerance: typing.Optional[str] = None # Note: not converted to int or float high_tolerance: typing.Optional[str] = None # Note: not converted to int or float si_units: typing.Optional[str] = None wikidata_node: typing.Optional[str] = None - # Offer the components of a location coordinates: + # Offer the components of a location coordinates, after validaating the item: latstr: typing.Optional[str] = None lat: typing.Optional[float] = None lonstr: typing.Optional[str] = None lon: typing.Optional[float] = None - # Offer the components of a date and times: + # Offer the components of a date and times, after validating the item: yearstr: typing.Optional[str] = None # Note: not converted to int year: typing.Optional[int] = None monthstr: typing.Optional[str] = None # Note: not converted to int @@ -57,7 +60,7 @@ class KgtkValue(KgtkFormat): secondsstr: typing.Optional[str] = None # Note: not converted to int or float zonestr: typing.Optional[str] = None precisionstr: typing.Optional[str] = None - iso8601basic: typing.Optional[bool] = None # True when hyphens/colons present. + iso8601extended: typing.Optional[bool] = None # True when hyphens/colons present. def is_valid(self)->bool: # Is this a valid whatever it is? @@ -85,6 +88,8 @@ def is_empty(self, validate: bool = False)->bool: def get_list_items(self)->typing.List['KgtkValue']: # If this is a KGTK List, return a list of KGTK values representing # the items in the list. If this is not a KGTK List, return an empty list. + # + # Note: This is the only routine that should touch self.list_items. if self.list_items is not None: return self.list_items @@ -135,12 +140,13 @@ def is_list(self, validate: bool = False)->bool: def rebuild_list(self): # Called to repair a list when we've repaired a list item. - if self.list_items is None or len(self.list_items) == 0: + list_items: typng.List[KgtkValues] = self.get_list_items() + if list_items is None or len(list_items) == 0: return values: typing.List[str] = [] item: KgtkValue - for item in self.list_items: + for item in list_items: values.append(item.value) self.value = KgtkFormat.LIST_SEPARATOR.join(values) @@ -727,7 +733,7 @@ def is_date_and_times(self, validate: bool=False)->bool: self.day = None self.zonestr = None self.precisionstr = None - self.iso8601basic = None + self.iso8601extended = None return False # We are certain that this is location coordinates, although we haven't checked validity. 
self.data_type = KgtkFormat.DataType.DATE_AND_TIMES @@ -745,7 +751,7 @@ def is_date_and_times(self, validate: bool=False)->bool: self.day = None self.zonestr = None self.precisionstr = None - self.iso8601basic = None + self.iso8601extended = None return False if not validate: @@ -765,7 +771,7 @@ def is_date_and_times(self, validate: bool=False)->bool: self.day = None self.zonestr = None self.precisionstr = None - self.iso8601basic = None + self.iso8601extended = None # Validate the date and times: m: typing.Optional[typing.Match] = KgtkValue.lax_date_and_times_re.match(self.value) @@ -780,7 +786,7 @@ def is_date_and_times(self, validate: bool=False)->bool: self.secondsstr = m.group("seconds") self.zonestr = m.group("zone") self.precisionstr = m.group("precision") - self.iso8601basic = m.group("hyphen") is None + self.iso8601extended = m.group("hyphen") is not None fixup_needed: bool = False @@ -836,22 +842,22 @@ def is_date_and_times(self, validate: bool=False)->bool: def update_date_and_times(self): v: str = "^" + self.yearstr if self.monthstr is not None: - if not self.iso8601basic: + if self.iso8601extended: v += "-" v += self.monthstr if self.daystr is not None: - if not self.iso8601basic: + if self.iso8601extended: v += "-" v += self.daystr if self.hourstr is not None: v += "T" v += self.hourstr if self.minutesstr is not None: - if not self.iso8601basic: + if self.iso8601extended: v += ":" v += self.minutesstr if self.secondsstr is not None: - if not self.iso8601basic: + if self.iso8601extended: v += ":" v += self.secondsstr if self.zonestr is not None: From 9572db3af4fa5c675d1114187fe73951c87b064a Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Wed, 6 May 2020 14:54:54 -0700 Subject: [PATCH 091/278] More field extractions. --- kgtk/join/kgtkvalue.py | 163 +++++++++++++++++++++++++++++++---------- 1 file changed, 123 insertions(+), 40 deletions(-) diff --git a/kgtk/join/kgtkvalue.py b/kgtk/join/kgtkvalue.py index b89615ee1..9bdb3331f 100644 --- a/kgtk/join/kgtkvalue.py +++ b/kgtk/join/kgtkvalue.py @@ -30,15 +30,25 @@ class KgtkValue(KgtkFormat): # Note: Please do not access this list directly. Use get_list_items(). list_items: typing.Optional[typing.List['KgtkValue']] = None + # The following members offer access to the fields of a KgtkValue. + # They are accessible immediately after validating the contents + # of the KgtkValue object: + # obj.is_valid() return True + # obj.validate() returns True + # obj.revalidate() returns True + # obj.is_language_qualified_string(validate=True) returns True + #... etc. + # Offer the components of a string or language-qualified string, after validating the item. contents: typing.Optional[str] = None # String contents without the enclosing quotes - lang: typing.Optional[str] = None - suffix: typing.Optional[str] = None # Includes the leading dash. + lang: typing.Optional[str] = None # 2- or 3-character code without suffix. + suffix: typing.Optional[str] = None # Language code suffix, including the leading dash. # Offer the components of a number or quantity, after validating the item. 
-    number: typing.Optional[str] = None # Note: not converted to int or float
-    low_tolerance: typing.Optional[str] = None # Note: not converted to int or float
-    high_tolerance: typing.Optional[str] = None # Note: not converted to int or float
+    numberstr: typing.Optional[str] = None # Note: not converted to int or float
+    number: typing.Optional[typing.Union[int, float]] = None
+    low_tolerancestr: typing.Optional[str] = None # Note: not converted to int or float
+    high_tolerancestr: typing.Optional[str] = None # Note: not converted to int or float
     si_units: typing.Optional[str] = None
     wikidata_node: typing.Optional[str] = None
 
@@ -49,18 +59,24 @@ class KgtkValue(KgtkFormat):
     lon: typing.Optional[float] = None
 
     # Offer the components of a date and times, after validating the item:
-    yearstr: typing.Optional[str] = None # Note: not converted to int
+    yearstr: typing.Optional[str] = None # Note: before conversion to int
     year: typing.Optional[int] = None
-    monthstr: typing.Optional[str] = None # Note: not converted to int
+    monthstr: typing.Optional[str] = None # Note: before conversion to int
     month: typing.Optional[int] = None
-    daystr: typing.Optional[str] = None # Note: not converted to int
+    daystr: typing.Optional[str] = None # Note: before conversion to int
     day: typing.Optional[int] = None
-    hourstr: typing.Optional[str] = None # Note: not converted to int or float
-    minutesstr: typing.Optional[str] = None # Note: not converted to int or float
-    secondsstr: typing.Optional[str] = None # Note: not converted to int or float
-    zonestr: typing.Optional[str] = None
+    hourstr: typing.Optional[str] = None # Note: before conversion to int or float
+    hour: typing.Optional[int] = None
+    minutesstr: typing.Optional[str] = None # Note: before conversion to int or float
+    minutes: typing.Optional[int] = None
+    secondsstr: typing.Optional[str] = None # Note: before conversion to int or float
+    seconds: typing.Optional[int] = None
+    zonestr: typing.Optional[str] = None # Z or [-+]HH or [-+]HHMM or [-+]HH:MM
     precisionstr: typing.Optional[str] = None
-    iso8601extended: typing.Optional[bool] = None # True when hyphens/colons present.
+    iso8601extended: typing.Optional[bool] = None # True when hyphens/colons are present.
+
+    # Offer the contents of a boolean, after validating the item:
+    truth: typing.Optional[bool] = None
 
     def is_valid(self)->bool:
         # Is this a valid whatever it is?
@@ -179,9 +195,9 @@ def _is_number_or_quantity(self)->bool:
                                                                            long_suffix=long_suffix_pat)
 
         integer_pat: str = r'(?:{decinteger}|{bininteger}|{octinteger}|{hexinteger})'.format(decinteger=decinteger_pat,
-                                                                                             bininteger=bininteger_pat,
-                                                                                             octinteger=octinteger_pat,
-                                                                                             hexinteger=hexinteger_pat)
+                                                                                            bininteger=bininteger_pat,
+                                                                                            octinteger=octinteger_pat,
+                                                                                            hexinteger=hexinteger_pat)
 
         # Floating point literals.
         digitpart_pat: str = r'(?:{digit}(?:_?{digit})*)'.format(digit=digit_pat)
@@ -256,17 +272,17 @@ def is_number_or_quantity(self, validate: bool=False)->bool:
                 return self.is_quantity(validate=validate)
             else:
                 # Clear the number or quantity components:
-                self.number = None
-                self.low_tolerance = None
-                self.high_tolerance = None
+                self.numberstr = None
+                self.low_tolerancestr = None
+                self.high_tolerancestr = None
                 self.si_units = None
                 self.wikidata_node = None
                 return False # Not a number or quantity.
# Clear the number or quantity components: - self.number = None - self.low_tolerance = None - self.high_tolerance = None + self.numberstr = None + self.low_tolerancestr = None + self.high_tolerancestr = None self.si_units = None self.wikidata_node = None @@ -284,13 +300,24 @@ def is_number_or_quantity(self, validate: bool=False)->bool: return False # Extract the number or quantity components: - self.number = m.group("number") - self.low_tolerance = m.group("low_tolerance") - self.high_tolerance = m.group("high_tolerance") + self.numberstr = m.group("number") + self.low_tolerancestr = m.group("low_tolerance") + self.high_tolerancestr = m.group("high_tolerance") self.si_units = m.group("si_units") self.wikidata_node = m.group("wikidata_node") - if self.low_tolerance is not None or self.high_tolerance is not None or self.si_units is not None or self.wikidata_node is not None: + # For convenience, convert the numeric part to int or float: + # + # TODO: go to this extra work only when requested? + if self.numberstr is None: + raise ValueError("Missing numeric part") + n: str = self.numberstr.lower() + if "." in n or ("e" in n and not n.startswith("0x")): + self.number = float(n) + else: + self.number = int(n) + + if self.low_tolerancestr is not None or self.high_tolerancestr is not None or self.si_units is not None or self.wikidata_node is not None: # We can be certain that this is a quantity. self.data_type = KgtkFormat.DataType.QUANTITY else: @@ -322,7 +349,7 @@ def is_number(self, validate: bool=False)->bool: if self.data_type is not None: if self.data_type != KgtkFormat.DataType.NUMBER: # Clear the number components: - self.number = None + self.numberstr = None return False if not validate: @@ -331,7 +358,7 @@ def is_number(self, validate: bool=False)->bool: return self.valid # Clear the number components: - self.number = None + self.numberstr = None if not self._is_number_or_quantity(): return False @@ -342,7 +369,18 @@ def is_number(self, validate: bool=False)->bool: return False # Extract the number components: - self.number = m.group("number") + self.numberstr = m.group("number") + + # For convenience, convert the numeric part to int or float: + # + # TODO: go to this extra work only when requested? + if self.numberstr is None: + raise ValueError("Missing numeric part") + n: str = self.numberstr.lower() + if "." in n or ("e" in n and not n.startswith("0x")): + self.number = float(n) + else: + self.number = int(n) # Now we can be certain that this is a number. 
self.data_type = KgtkFormat.DataType.NUMBER @@ -358,9 +396,9 @@ def is_quantity(self, validate: bool=False)->bool: if self.data_type is not None: if self.data_type != KgtkFormat.DataType.QUANTITY: # Clear the quantity components: - self.number = None - self.low_tolerance = None - self.high_tolerance = None + self.numberstr = None + self.low_tolerancestr = None + self.high_tolerancestr = None self.si_units = None self.wikidata_node = None return False @@ -371,9 +409,9 @@ def is_quantity(self, validate: bool=False)->bool: return self.valid # Clear the quantity components: - self.number = None - self.low_tolerance = None - self.high_tolerance = None + self.numberstr = None + self.low_tolerancestr = None + self.high_tolerancestr = None self.si_units = None self.wikidata_node = None @@ -386,13 +424,24 @@ def is_quantity(self, validate: bool=False)->bool: return False # Extract the quantity components: - self.number = m.group("number") - self.low_tolerance = m.group("low_tolerance") - self.high_tolerance = m.group("high_tolerance") + self.numberstr = m.group("number") + self.low_tolerancestr = m.group("low_tolerance") + self.high_tolerancestr = m.group("high_tolerance") self.si_units = m.group("si_units") self.wikidata_node = m.group("wikidata_node") - if self.low_tolerance is None and self.high_tolerance is None and self.si_units is None and self.wikidata_node is None: + # For convenience, convert the numeric part to int or float: + # + # TODO: go to this extra work only when requested? + if self.numberstr is None: + raise ValueError("Missing numeric part") + n: str = self.numberstr.lower() + if "." in n or ("e" in n and not n.startswith("0x")): + self.number = float(n) + else: + self.number = int(n) + + if self.low_tolerancestr is None and self.high_tolerancestr is None and self.si_units is None and self.wikidata_node is None: # This is a number, not a quantity self.data_type = KgtkFormat.DataType.NUMBER self.valid = True @@ -483,15 +532,21 @@ def is_boolean(self, validate: bool = False)->bool: The validate parameter is ignored. """ if self.data_type is not None: - return self.data_type == KgtkFormat.DataType.BOOLEAN + if self.data_type != KgtkFormat.DataType.BOOLEAN: + self.truth = None + return False + self.truth = self.value == KgtkFormat.TRUE_SYMBOL + return True # Is this a boolean? if self.value != KgtkFormat.TRUE_SYMBOL and self.value != KgtkFormat.FALSE_SYMBOL: + self.truth = None return False # We are certain this is a valid boolean. self.data_type = KgtkFormat.DataType.BOOLEAN self.valid = True + self.truth = self.value == KgtkFormat.TRUE_SYMBOL return True # Support two or three character language codes. 
Supports hyphenated codes
@@ -731,6 +786,9 @@ def is_date_and_times(self, validate: bool=False)->bool:
             self.year = None
             self.month = None
             self.day = None
+            self.hour = None
+            self.minutes = None
+            self.seconds = None
             self.zonestr = None
             self.precisionstr = None
             self.iso8601extended = None
@@ -749,6 +807,9 @@ def is_date_and_times(self, validate: bool=False)->bool:
             self.year = None
             self.month = None
             self.day = None
+            self.hour = None
+            self.minutes = None
+            self.seconds = None
             self.zonestr = None
             self.precisionstr = None
             self.iso8601extended = None
@@ -769,6 +830,9 @@ def is_date_and_times(self, validate: bool=False)->bool:
         self.year = None
         self.month = None
         self.day = None
+        self.hour = None
+        self.minutes = None
+        self.seconds = None
         self.zonestr = None
         self.precisionstr = None
         self.iso8601extended = None
@@ -828,6 +892,25 @@ def is_date_and_times(self, validate: bool=False)->bool:
             elif not self.options.allow_month_or_day_zero:
                 return False # day 0 was disallowed.
 
+        # Convert the time fields to ints:
+        if self.hourstr is not None:
+            try:
+                self.hour: int = int(self.hourstr)
+            except ValueError:
+                return False # shouldn't happen
+
+        if self.minutesstr is not None:
+            try:
+                self.minutes: int = int(self.minutesstr)
+            except ValueError:
+                return False # shouldn't happen
+
+        if self.secondsstr is not None:
+            try:
+                self.seconds: int = int(self.secondsstr)
+            except ValueError:
+                return False # shouldn't happen
+
         if fixup_needed:
             # Repair a month or day zero problem.  If this value is the child
             # of a list, repair the list parent value, too.

From ec21ee1f1fa3fce502c076ea06687e3641afc440 Mon Sep 17 00:00:00 2001
From: Craig Milo Rogers
Date: Wed, 6 May 2020 15:27:37 -0700
Subject: [PATCH 092/278] Provide a map of parsed fields. Add more debugging.

---
 kgtk/join/kgtkvalue.py | 89 ++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 85 insertions(+), 4 deletions(-)

diff --git a/kgtk/join/kgtkvalue.py b/kgtk/join/kgtkvalue.py
index 9bdb3331f..583767f3c 100644
--- a/kgtk/join/kgtkvalue.py
+++ b/kgtk/join/kgtkvalue.py
@@ -38,6 +38,9 @@ class KgtkValue(KgtkFormat):
     #    obj.revalidate() returns True
     #    obj.is_language_qualified_string(validate=True) returns True
     #... etc.
+    #
+    # The fields may be accessed directly, or they may be obtained in
+    # a map via obj.get_fields()
 
     # Offer the components of a string or language-qualified string, after validating the item.
     contents: typing.Optional[str] = None # String contents without the enclosing quotes
@@ -1098,6 +1101,73 @@ def describe(self)->str:
             return "Symbol" if self.is_symbol(validate=True) else "Invalid Symbol"
         else:
             return "Unknown"
+
+    def get_fields(self)->typing.Mapping[str, typing.Union[str, int, float, bool]]:
+        results: typing.MutableMapping[str, typing.Union[str, int, float, bool]] = { }
+        if self.data_type is not None:
+            results["data_type"] = str(self.data_type)
+        if self.valid is not None:
+            results["valid"] = self.valid
+        if self.contents is not None:
+            results["contents"] = self.contents
+        if self.lang is not None:
+            results["lang"] = self.lang
+        if self.suffix is not None:
+            results["suffix"] = self.suffix
+        if self.numberstr is not None:
+            results["numberstr"] = self.numberstr
+        if self.number is not None:
+            results["number"] = self.number
+        if self.low_tolerancestr is not None:
+            results["low_tolerancestr"] = self.low_tolerancestr
+        if self.high_tolerancestr is not None:
+            results["high_tolerancestr"] = self.high_tolerancestr
+        if self.si_units is not None:
+            results["si_units"] = self.si_units
+        if self.wikidata_node is not None:
+            results["wikidata_node"] = self.wikidata_node
+        if self.latstr is not None:
+            results["latstr"] = self.latstr
+        if self.lat is not None:
+            results["lat"] = self.lat
+        if self.lonstr is not None:
+            results["lonstr"] = self.lonstr
+        if self.lon is not None:
+            results["lon"] = self.lon
+        if self.yearstr is not None:
+            results["yearstr"] = self.yearstr
+        if self.year is not None:
+            results["year"] = self.year
+        if self.monthstr is not None:
+            results["monthstr"] = self.monthstr
+        if self.month is not None:
+            results["month"] = self.month
+        if self.daystr is not None:
+            results["daystr"] = self.daystr
+        if self.day is not None:
+            results["day"] = self.day
+        if self.hourstr is not None:
+            results["hourstr"] = self.hourstr
+        if self.hour is not None:
+            results["hour"] = self.hour
+        if self.minutesstr is not None:
+            results["minutesstr"] = self.minutesstr
+        if self.minutes is not None:
+            results["minutes"] = self.minutes
+        if self.secondsstr is not None:
+            results["secondsstr"] = self.secondsstr
+        if self.seconds is not None:
+            results["seconds"] = self.seconds
+        if self.zonestr is not None:
+            results["zonestr"] = self.zonestr
+        if self.precisionstr is not None:
+            results["precisionstr"] = self.precisionstr
+        if self.iso8601extended is not None:
+            results["iso8601extended"] = self.iso8601extended
+        list_items: typing.List[KgtkValue] = self.get_list_items()
+        if len(list_items) > 0:
+            results["list_len"] = len(list_items)
+        return results
 
 def main():
     """
@@ -1117,12 +1187,23 @@ def main():
     for value in args.values:
         kv: KgtkValue = KgtkValue(value, options=value_options)
         kv.validate()
-        nv: str = kv.value
-        if value == nv:
+        if value == kv.value:
             print("%s: %s" % (value, kv.describe()), flush=True)
         else:
-            print("%s => %s: %s" % (value, nv, kv.describe()), flush=True)
-
+            print("%s => %s: %s" % (value, kv.value, kv.describe()), flush=True)
+
+        if args.verbose:
+            fields = kv.get_fields()
+            for key in sorted(fields.keys()):
+                print("%s: %s" % (key, str(fields[key])))
+            list_items: typing.List[KgtkValue] = kv.get_list_items()
+            item: KgtkValue
+            for item in list_items:
+                print("...")
+                fields = item.get_fields()
+                for key in sorted(fields.keys()):
+                    print("... %s: %s" % (key, str(fields[key])))
+
 
 if __name__ == "__main__":
     main()

From 49be20069a5ed571f20868b777efbad700cf2766 Mon Sep 17 00:00:00 2001
From: Craig Milo Rogers
Date: Wed, 6 May 2020 15:29:56 -0700
Subject: [PATCH 093/278] Better documentation.

---
 kgtk/join/kgtkvalue.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/kgtk/join/kgtkvalue.py b/kgtk/join/kgtkvalue.py
index 583767f3c..415e5be03 100644
--- a/kgtk/join/kgtkvalue.py
+++ b/kgtk/join/kgtkvalue.py
@@ -30,17 +30,18 @@ class KgtkValue(KgtkFormat):
     # Note: Please do not access this list directly. Use get_list_items().
     list_items: typing.Optional[typing.List['KgtkValue']] = None
 
-    # The following members offer access to the fields of a KgtkValue.
-    # They are accessible immediately after validating the
-    # contents of the KgtkValue object:
+    # The following members offer access to the components (fields) of a
+    # KgtkValue.  They are accessible immediately after validating the
+    # contents of the KgtkValue object:
+    #
     #    obj.is_valid() return True
     #    obj.validate() returns True
     #    obj.revalidate() returns True
     #    obj.is_language_qualified_string(validate=True) returns True
     #... etc.
     #
-    # The fields may be accessed directly, or they may be obtained in
-    # a map via obj.get_fields()
+    # The fields may be accessed directly from this object or they may be
+    # obtained as a map via obj.get_fields()
 
     # Offer the components of a string or language-qualified string, after validating the item.

From 7a79093bc5b6a14d4699228a59400919b05bef65 Mon Sep 17 00:00:00 2001
From: Craig Milo Rogers
Date: Wed, 6 May 2020 15:39:38 -0700
Subject: [PATCH 094/278] Document node or edge file.

---
 kgtk/join/kgtkreader.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kgtk/join/kgtkreader.py b/kgtk/join/kgtkreader.py
index 9b338eb49..2c64ef9d6 100644
--- a/kgtk/join/kgtkreader.py
+++ b/kgtk/join/kgtkreader.py
@@ -1,5 +1,5 @@
 """
-Read a KGTK edge file in TSV format.
+Read a KGTK node or edge file in TSV format.
 
 TODO: Add support for alternative envelope formats, such as JSON.
 """

From f174a1728b86f5630b8306efec3c82f64e6639eb Mon Sep 17 00:00:00 2001
From: Craig Milo Rogers
Date: Wed, 6 May 2020 16:28:49 -0700
Subject: [PATCH 095/278] Change how value_options are passed. Pass
 value_options to the edge joiner. Fix bug in lang_suffix parsing.
--- kgtk/join/edgejoiner.py | 20 ++++++++++++++++++-- kgtk/join/edgereader.py | 4 ++-- kgtk/join/kgtkreader.py | 7 ++++--- kgtk/join/kgtkvalue.py | 2 +- kgtk/join/nodereader.py | 4 ++-- 5 files changed, 27 insertions(+), 10 deletions(-) diff --git a/kgtk/join/edgejoiner.py b/kgtk/join/edgejoiner.py index 83d592a61..dc93907b4 100644 --- a/kgtk/join/edgejoiner.py +++ b/kgtk/join/edgejoiner.py @@ -18,6 +18,7 @@ from kgtk.join.edgereader import EdgeReader from kgtk.join.kgtkformat import KgtkFormat from kgtk.join.kgtkwriter import KgtkWriter +from kgtk.join.kgtkvalueoptions import KgtkValueOptions, DEFAULT_KGTK_VALUE_OPTIONS from kgtk.join.validationaction import ValidationAction @attr.s(slots=True, frozen=True) @@ -51,6 +52,10 @@ class EdgeJoiner(KgtkFormat): fill_short_lines: bool = attr.ib(validator=attr.validators.instance_of(bool), default=False) truncate_long_lines: bool = attr.ib(validator=attr.validators.instance_of(bool), default=False) + # TODO: find a working validator + # value_options: typing.Optional[KgtkValueOptions] = attr.ib(attr.validators.optional(attr.validators.instance_of(KgtkValueOptions)), default=None) + value_options: typing.Optional[KgtkValueOptions] = attr.ib(default=None) + gzip_in_parallel: bool = attr.ib(validator=attr.validators.instance_of(bool), default=False) verbose: bool = attr.ib(validator=attr.validators.instance_of(bool), default=False) @@ -94,6 +99,7 @@ def extract_join_key_set(self, file_path: Path, who: str)->typing.Set[str]: long_line_action=self.long_line_action, fill_short_lines=self.fill_short_lines, truncate_long_lines=self.truncate_long_lines, + value_options = self.value_options, gzip_in_parallel=self.gzip_in_parallel, verbose=self.verbose, very_verbose=self.very_verbose) @@ -181,13 +187,16 @@ def process(self): short_line_action=self.short_line_action, long_line_action=self.long_line_action, fill_short_lines=self.fill_short_lines, - truncate_long_lines=self.truncate_long_lines) + truncate_long_lines=self.truncate_long_lines, + value_options = self.value_options) + right_kr: EdgeReader = EdgeReader.open_edge_file(self.right_file_path, short_line_action=self.short_line_action, long_line_action=self.long_line_action, fill_short_lines=self.fill_short_lines, - truncate_long_lines=self.truncate_long_lines) + truncate_long_lines=self.truncate_long_lines, + value_options = self.value_options) # Map the right column names for the join: joined_column_names: typing.List[str] @@ -256,8 +265,14 @@ def main(): help="Remove excess trailing columns in long lines.", action='store_true') parser.add_argument("-v", "--verbose", dest="verbose", help="Print additional progress messages.", action='store_true') parser.add_argument( "--very-verbose", dest="very_verbose", help="Print additional progress messages.", action='store_true') + + KgtkValueOptions.add_arguments(parser) + args = parser.parse_args() + # Build the value parsing option structure. 
+ value_options: KgtkValueOptions = KgtkValueOptions.from_args(args) + ej: EdgeJoiner = EdgeJoiner(left_file_path=args.left_file_path, right_file_path=args.right_file_path, output_path=args.output_file_path, @@ -271,6 +286,7 @@ def main(): long_line_action=args.long_line_action, fill_short_lines=args.fill_short_lines, truncate_long_lines=args.truncate_long_lines, + value_options=value_options, gzip_in_parallel=args.gzip_in_parallel, verbose=args.verbose, very_verbose=args.very_verbose) diff --git a/kgtk/join/edgereader.py b/kgtk/join/edgereader.py index 0d687988e..d4d343148 100644 --- a/kgtk/join/edgereader.py +++ b/kgtk/join/edgereader.py @@ -13,7 +13,7 @@ from kgtk.join.closableiter import ClosableIter from kgtk.join.enumnameaction import EnumNameAction from kgtk.join.kgtkreader import KgtkReader -from kgtk.join.kgtkvalueoptions import KgtkValueOptions, DEFAULT_KGTK_VALUE_OPTIONS +from kgtk.join.kgtkvalueoptions import KgtkValueOptions from kgtk.join.validationaction import ValidationAction @attr.s(slots=True, frozen=False) @@ -38,7 +38,7 @@ def open_edge_file(cls, invalid_value_action: ValidationAction = ValidationAction.REPORT, header_error_action: ValidationAction = ValidationAction.EXIT, unsafe_column_name_action: ValidationAction = ValidationAction.REPORT, - value_options: KgtkValueOptions = DEFAULT_KGTK_VALUE_OPTIONS, + value_options: typing.Optional[KgtkValueOptions] = None, compression_type: typing.Optional[str] = None, gzip_in_parallel: bool = False, gzip_queue_size: int = KgtkReader.GZIP_QUEUE_SIZE_DEFAULT, diff --git a/kgtk/join/kgtkreader.py b/kgtk/join/kgtkreader.py index 2c64ef9d6..6b73c8f0e 100644 --- a/kgtk/join/kgtkreader.py +++ b/kgtk/join/kgtkreader.py @@ -84,7 +84,7 @@ class KgtkReader(KgtkBase, ClosableIter[typing.List[str]]): # Validate data cell values? invalid_value_action: ValidationAction = attr.ib(validator=attr.validators.instance_of(ValidationAction), default=ValidationAction.REPORT) - value_options: KgtkValueOptions = attr.ib(validator=attr.validators.instance_of(KgtkValueOptions), default=DEFAULT_KGTK_VALUE_OPTIONS) + value_options: typing.Optional[KgtkValueOptions] = attr.ib(validator=attr.validators.optional(attr.validators.instance_of(KgtkValueOptions)), default=None) # Repair records with too many or too few fields? fill_short_lines: bool = attr.ib(validator=attr.validators.instance_of(bool), default=False) @@ -132,7 +132,7 @@ def open(cls, invalid_value_action: ValidationAction = ValidationAction.REPORT, header_error_action: ValidationAction = ValidationAction.EXIT, unsafe_column_name_action: ValidationAction = ValidationAction.REPORT, - value_options: KgtkValueOptions = DEFAULT_KGTK_VALUE_OPTIONS, + value_options: typing.Optional[KgtkValueOptions] = None, compression_type: typing.Optional[str] = None, gzip_in_parallel: bool = False, gzip_queue_size: int = GZIP_QUEUE_SIZE_DEFAULT, @@ -594,12 +594,13 @@ def _ignore_invalid_values(self, values: typing.List[str], line: str)->bool: Returns True to indicate that the row should be ignored (skipped). """ + options: KgtkValueOptions = self.value_options if self.value_options is not None else DEFAULT_KGTK_VALUE_OPTIONS problems: typing.List[str] = [ ] # Build a list of problems. idx: int value: str for idx, value in enumerate(values): if len(value) > 0: # Optimize the common case of empty columns. 
-            kv: KgtkValue = KgtkValue(value, options=self.value_options)
+            kv: KgtkValue = KgtkValue(value, options=options)
                 if not kv.is_valid():
                     problems.append("col %d (%s) value '%s' is an %s" % (idx, self.column_names[idx], value, kv.describe()))
 
diff --git a/kgtk/join/kgtkvalue.py b/kgtk/join/kgtkvalue.py
index 415e5be03..9bec2605e 100644
--- a/kgtk/join/kgtkvalue.py
+++ b/kgtk/join/kgtkvalue.py
@@ -555,7 +555,7 @@ def is_boolean(self, validate: bool = False)->bool:
     # Support two or three character language codes.  Supports hyphenated codes
     # with a country code or dialect name suffix after the language code.
-    lax_language_qualified_string_re: typing.Pattern = re.compile(r"^'(?P<contents>.*)'@(?P<lang>[a-zA-Z]{2,3}(?P<suffix>-[a-zA-Z]+)?)$")
+    lax_language_qualified_string_re: typing.Pattern = re.compile(r"^'(?P<contents>.*)'@(?P<lang_suffix>(?P<lang>[a-zA-Z]{2,3})(?P<suffix>-[a-zA-Z]+)?)$")
     strict_language_qualified_string_re: typing.Pattern = re.compile(r"^'(?P<contents>(?:[^'\\]|\\.)*)'@(?P<lang_suffix>(?P<lang>[a-zA-Z]{2,3})(?P<suffix>-[a-zA-Z]+)?)$")
 
     def is_language_qualified_string(self, validate: bool=False)->bool:
 
diff --git a/kgtk/join/nodereader.py b/kgtk/join/nodereader.py
index 0f83d8b8a..4f4189a6f 100644
--- a/kgtk/join/nodereader.py
+++ b/kgtk/join/nodereader.py
@@ -13,7 +13,7 @@
 from kgtk.join.closableiter import ClosableIter
 from kgtk.join.enumnameaction import EnumNameAction
 from kgtk.join.kgtkreader import KgtkReader
-from kgtk.join.kgtkvalueoptions import KgtkValueOptions, DEFAULT_KGTK_VALUE_OPTIONS
+from kgtk.join.kgtkvalueoptions import KgtkValueOptions
 from kgtk.join.validationaction import ValidationAction
 
 @attr.s(slots=True, frozen=False)
@@ -37,7 +37,7 @@ def open_node_file(cls,
                        invalid_value_action: ValidationAction = ValidationAction.REPORT,
                        header_error_action: ValidationAction = ValidationAction.EXIT,
                        unsafe_column_name_action: ValidationAction = ValidationAction.REPORT,
-                       value_options: KgtkValueOptions = DEFAULT_KGTK_VALUE_OPTIONS,
+                       value_options: typing.Optional[KgtkValueOptions] = None,
                        compression_type: typing.Optional[str] = None,
                        gzip_in_parallel: bool = False,
                        gzip_queue_size: int = KgtkReader.GZIP_QUEUE_SIZE_DEFAULT,

From b969c6625b82761d1aacc9b1063eed17ea2751a8 Mon Sep 17 00:00:00 2001
From: Craig Milo Rogers
Date: Wed, 6 May 2020 17:37:55 -0700
Subject: [PATCH 096/278] Give better verbose feedback. Optimize the join key
 processing. Accept an error limit.
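
The key-set optimization boils down to not building key sets that can never
filter anything. A rough sketch of the dispatch (simplified; the real logic is
in EdgeJoiner.join_key_sets below):

    import typing

    def join_keys(left: typing.Callable[[], typing.Set[str]],
                  right: typing.Callable[[], typing.Set[str]],
                  left_join: bool, right_join: bool) -> typing.Optional[typing.Set[str]]:
        if left_join and right_join:
            return None                      # outer join: keep every row, skip both scans
        if left_join:
            return left()                    # left join: only the left keys matter
        if right_join:
            return right()                   # right join: only the right keys matter
        return left().intersection(right())  # inner join: keys must appear on both sides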
--- kgtk/join/edgejoiner.py | 94 ++++++++++++++++++++++++++++++++--------- 1 file changed, 75 insertions(+), 19 deletions(-) diff --git a/kgtk/join/edgejoiner.py b/kgtk/join/edgejoiner.py index dc93907b4..2a7044283 100644 --- a/kgtk/join/edgejoiner.py +++ b/kgtk/join/edgejoiner.py @@ -58,6 +58,8 @@ class EdgeJoiner(KgtkFormat): gzip_in_parallel: bool = attr.ib(validator=attr.validators.instance_of(bool), default=False) + error_limit: int = attr.ib(validator=attr.validators.instance_of(int), default=EdgeReader.ERROR_LIMIT_DEFAULT) + verbose: bool = attr.ib(validator=attr.validators.instance_of(bool), default=False) very_verbose: bool = attr.ib(validator=attr.validators.instance_of(bool), default=False) @@ -94,6 +96,8 @@ def single_column_key_set(self, kr: EdgeReader, join_idx: int)->typing.Set[str]: return result def extract_join_key_set(self, file_path: Path, who: str)->typing.Set[str]: + if self.verbose: + print("Extracting the %s join key set" % who) kr: EdgeReader = EdgeReader.open_edge_file(file_path, short_line_action=self.short_line_action, long_line_action=self.long_line_action, @@ -101,6 +105,7 @@ def extract_join_key_set(self, file_path: Path, who: str)->typing.Set[str]: truncate_long_lines=self.truncate_long_lines, value_options = self.value_options, gzip_in_parallel=self.gzip_in_parallel, + error_limit=self.error_limit, verbose=self.verbose, very_verbose=self.very_verbose) @@ -123,24 +128,30 @@ def extract_join_key_set(self, file_path: Path, who: str)->typing.Set[str]: return self.single_column_key_set(kr, join_idx) # closes er file - def join_key_sets(self)->typing.Set[str]: + def join_key_sets(self)->typing.Optional[typing.Set[str]]: """ Read the input edge files the first time, building the sets of left and right join values. """ - left_join_key_set: typing.Set[str] = self.extract_join_key_set(self.left_file_path, "left") - right_join_key_set: typing.Set[str] = self.extract_join_key_set(self.right_file_path, "right") - - joined_key_set: typing.Set[str] if self.left_join and self.right_join: - # TODO: This joins everything! We can shortut computing these sets. 
- joined_key_set = left_join_key_set.union(right_join_key_set) + if self.verbose: + print("Outer join, no need to compute join keys.") + return None elif self.left_join and not self.right_join: - joined_key_set = left_join_key_set.copy() + if self.verbose: + print("Computing the left join key set") + return self.extract_join_key_set(self.left_file_path, "left").copy() + elif self.right_join and not self.left_join: - joined_key_set = right_join_key_set.copy() + if self.verbose: + print("Computing the right join key set") + return self.extract_join_key_set(self.right_file_path, "right").copy() + else: - joined_key_set = left_join_key_set.intersection(right_join_key_set) - return joined_key_set + if self.verbose: + print("Computing the inner join key set") + left_join_key_set: typing.Set[str] = self.extract_join_key_set(self.left_file_path, "left") + right_join_key_set: typing.Set[str] = self.extract_join_key_set(self.right_file_path, "right") + return left_join_key_set.intersection(right_join_key_set) def merge_columns(self, left_kr: EdgeReader, right_kr: EdgeReader)->typing.Tuple[typing.List[str], typing.List[str]]: joined_column_names: typing.List[str] = [ ] @@ -180,25 +191,33 @@ def merge_columns(self, left_kr: EdgeReader, right_kr: EdgeReader)->typing.Tuple return (joined_column_names, right_column_names) def process(self): - joined_key_set: typing.Set[str] = self.join_key_sets() + joined_key_set: typing.Optional[typing.Set[str]] = self.join_key_sets() + if self.verbose: + print("Opening the left edge file: %s" % str(self.left_file_path)) # Open the input files for the second time. This won't work with stdin. left_kr: EdgeReader = EdgeReader.open_edge_file(self.left_file_path, short_line_action=self.short_line_action, long_line_action=self.long_line_action, fill_short_lines=self.fill_short_lines, truncate_long_lines=self.truncate_long_lines, - value_options = self.value_options) + value_options = self.value_options, + error_limit=self.error_limit) + if self.verbose: + print("Opening the right edge file: %s" % str(self.right_file_path)) right_kr: EdgeReader = EdgeReader.open_edge_file(self.right_file_path, short_line_action=self.short_line_action, long_line_action=self.long_line_action, fill_short_lines=self.fill_short_lines, truncate_long_lines=self.truncate_long_lines, - value_options = self.value_options) + value_options = self.value_options, + error_limit=self.error_limit) - # Map the right column names for the join: + + if self.verbose: + print("Mapping the column names for the join.") joined_column_names: typing.List[str] right_column_names: typing.List[str] (joined_column_names, right_column_names) = self.merge_columns(left_kr, right_kr) @@ -209,6 +228,8 @@ def process(self): print("mapped right columns: %s" % " ".join(right_column_names)) print(" joined columns: %s" % " ".join(joined_column_names)) + if self.verbose: + print("Opening the output edge file: %s" % str(self.output_path)) ew: KgtkWriter = KgtkWriter.open(joined_column_names, self.output_path, require_all_columns=False, @@ -218,21 +239,52 @@ def process(self): verbose=self.verbose, very_verbose=self.very_verbose) + output_data_lines: int = 0 + left_data_lines_read: int = 0 + left_data_lines_kept: int = 0 + right_data_lines_read: int = 0 + right_data_lines_kept: int = 0 + + if self.verbose: + print("Processing the left input file") row: typing.list[str] left_node1_idx: int = self.node1_column_idx(left_kr, who="left") for row in left_kr: - left_key: str = self.build_join_key(left_kr, left_node1_idx, row) - if left_key in 
joined_key_set: + left_data_lines_read += 1 + if joined_key_set is None: ew.write(row) + output_data_lines += 1 + left_data_lines_kept += 1 + else: + left_key: str = self.build_join_key(left_kr, left_node1_idx, row) + if left_key in joined_key_set: + ew.write(row) + output_data_lines += 1 + left_data_lines_kept += 1 + if self.verbose: + print("Processing the right input file") right_shuffle_list: typing.List[int] = ew.build_shuffle_list(right_column_names) right_node1_idx: int = self.node1_column_idx(right_kr, who="right") for row in right_kr: - right_key: str = self.build_join_key(right_kr, right_node1_idx, row) - if right_key in joined_key_set: + right_data_lines_read += 1 + if joined_key_set is None: ew.write(row, shuffle_list=right_shuffle_list) + output_data_lines += 1 + right_data_lines_kept += 1 + else: + right_key: str = self.build_join_key(right_kr, right_node1_idx, row) + if right_key in joined_key_set: + ew.write(row, shuffle_list=right_shuffle_list) + output_data_lines += 1 + right_data_lines_kept += 1 ew.close() + if self.verbose: + print("The join is complete") + print("%d left input data lines read, %d kept" % (left_data_lines_read, left_data_lines_kept)) + print("%d right input data lines read, %d kept" % (right_data_lines_read, right_data_lines_kept)) + print("%d data lines written." % output_data_lines) def main(): """ @@ -241,6 +293,9 @@ def main(): parser = ArgumentParser() parser.add_argument(dest="left_file_path", help="The left KGTK file to join", type=Path) parser.add_argument(dest="right_file_path", help="The right KGTK file to join", type=Path) + parser.add_argument( "--error-limit", dest="error_limit", + help="The maximum number of errors to report before failing", type=int, default=EdgeReader.ERROR_LIMIT_DEFAULT) + parser.add_argument( "--field-separator", dest="field_separator", help="Separator for multifield keys", default=EdgeJoiner.FIELD_SEPARATOR_DEFAULT) parser.add_argument( "--fill-short-lines", dest="fill_short_lines", help="Fill missing trailing columns in short lines with empty values.", action='store_true') @@ -288,6 +343,7 @@ def main(): truncate_long_lines=args.truncate_long_lines, value_options=value_options, gzip_in_parallel=args.gzip_in_parallel, + error_limit=args.error_limit, verbose=args.verbose, very_verbose=args.very_verbose) From aede6c1eb396078bcdb08253bfe878013e495c86 Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Wed, 6 May 2020 17:45:08 -0700 Subject: [PATCH 097/278] Flush progress reports to ensure a timely appearance. Provide key file names. 
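
flush=True matters because stdout is block buffered when redirected, so
progress messages can otherwise lag far behind the work they describe. A
minimal illustration (not from this patch):

    import time

    print("starting a slow step...", flush=True)  # visible immediately, even when piped to a file
    time.sleep(60)                                # the slow work runs here
    print("slow step done", flush=True)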
--- kgtk/join/edgejoiner.py | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/kgtk/join/edgejoiner.py b/kgtk/join/edgejoiner.py index 2a7044283..fcee3ee94 100644 --- a/kgtk/join/edgejoiner.py +++ b/kgtk/join/edgejoiner.py @@ -97,7 +97,7 @@ def single_column_key_set(self, kr: EdgeReader, join_idx: int)->typing.Set[str]: def extract_join_key_set(self, file_path: Path, who: str)->typing.Set[str]: if self.verbose: - print("Extracting the %s join key set" % who) + print("Extracting the %s join key set from %s" % (who, str(file_path)), flush=True) kr: EdgeReader = EdgeReader.open_edge_file(file_path, short_line_action=self.short_line_action, long_line_action=self.long_line_action, @@ -134,21 +134,21 @@ def join_key_sets(self)->typing.Optional[typing.Set[str]]: """ if self.left_join and self.right_join: if self.verbose: - print("Outer join, no need to compute join keys.") + print("Outer join, no need to compute join keys.", flush=True) return None elif self.left_join and not self.right_join: if self.verbose: - print("Computing the left join key set") + print("Computing the left join key set", flush=True) return self.extract_join_key_set(self.left_file_path, "left").copy() elif self.right_join and not self.left_join: if self.verbose: - print("Computing the right join key set") + print("Computing the right join key set", flush=True) return self.extract_join_key_set(self.right_file_path, "right").copy() else: if self.verbose: - print("Computing the inner join key set") + print("Computing the inner join key set", flush=True) left_join_key_set: typing.Set[str] = self.extract_join_key_set(self.left_file_path, "left") right_join_key_set: typing.Set[str] = self.extract_join_key_set(self.right_file_path, "right") return left_join_key_set.intersection(right_join_key_set) @@ -194,7 +194,7 @@ def process(self): joined_key_set: typing.Optional[typing.Set[str]] = self.join_key_sets() if self.verbose: - print("Opening the left edge file: %s" % str(self.left_file_path)) + print("Opening the left edge file: %s" % str(self.left_file_path), flush=True) # Open the input files for the second time. This won't work with stdin. 
left_kr: EdgeReader = EdgeReader.open_edge_file(self.left_file_path, short_line_action=self.short_line_action, @@ -206,7 +206,7 @@ def process(self): if self.verbose: - print("Opening the right edge file: %s" % str(self.right_file_path)) + print("Opening the right edge file: %s" % str(self.right_file_path), flush=True) right_kr: EdgeReader = EdgeReader.open_edge_file(self.right_file_path, short_line_action=self.short_line_action, long_line_action=self.long_line_action, @@ -217,19 +217,19 @@ def process(self): if self.verbose: - print("Mapping the column names for the join.") + print("Mapping the column names for the join.", flush=True) joined_column_names: typing.List[str] right_column_names: typing.List[str] (joined_column_names, right_column_names) = self.merge_columns(left_kr, right_kr) if self.verbose: - print(" left columns: %s" % " ".join(left_kr.column_names)) - print(" right columns: %s" % " ".join(right_kr.column_names)) - print("mapped right columns: %s" % " ".join(right_column_names)) - print(" joined columns: %s" % " ".join(joined_column_names)) + print(" left columns: %s" % " ".join(left_kr.column_names), flush=True) + print(" right columns: %s" % " ".join(right_kr.column_names), flush=True) + print("mapped right columns: %s" % " ".join(right_column_names), flush=True) + print(" joined columns: %s" % " ".join(joined_column_names), flush=True) if self.verbose: - print("Opening the output edge file: %s" % str(self.output_path)) + print("Opening the output edge file: %s" % str(self.output_path), flush=True) ew: KgtkWriter = KgtkWriter.open(joined_column_names, self.output_path, require_all_columns=False, @@ -246,7 +246,7 @@ def process(self): right_data_lines_kept: int = 0 if self.verbose: - print("Processing the left input file") + print("Processing the left input file: %s" % str(self.left_file_path), flush=True) row: typing.list[str] left_node1_idx: int = self.node1_column_idx(left_kr, who="left") for row in left_kr: @@ -263,7 +263,7 @@ def process(self): left_data_lines_kept += 1 if self.verbose: - print("Processing the right input file") + print("Processing the right input file: %s" % str(self.right_file_path), flush=True) right_shuffle_list: typing.List[int] = ew.build_shuffle_list(right_column_names) right_node1_idx: int = self.node1_column_idx(right_kr, who="right") for row in right_kr: @@ -281,10 +281,10 @@ def process(self): ew.close() if self.verbose: - print("The join is complete") - print("%d left input data lines read, %d kept" % (left_data_lines_read, left_data_lines_kept)) - print("%d right input data lines read, %d kept" % (right_data_lines_read, right_data_lines_kept)) - print("%d data lines written." % output_data_lines) + print("The join is complete", flush=True) + print("%d left input data lines read, %d kept" % (left_data_lines_read, left_data_lines_kept), flush=True) + print("%d right input data lines read, %d kept" % (right_data_lines_read, right_data_lines_kept), flush=True) + print("%d data lines written." % output_data_lines, flush=True) def main(): """ From c2381984f046f8d813943705e737b3137e1e37a4 Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Wed, 6 May 2020 18:06:06 -0700 Subject: [PATCH 098/278] Give more feedback on the join columns. Flush the output file when done with the left input file. 
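
The new KgtkWriter.flush() deliberately skips the flush when a gzip thread
owns the output stream. A sketch of the guard pattern (hypothetical Writer
class, not the real KgtkWriter):

    class Writer:
        def __init__(self, file_out, gzip_thread=None):
            self.file_out = file_out
            self.gzip_thread = gzip_thread  # background compressor when gzip is in use

        def flush(self):
            # The gzip thread owns the stream's buffering; flushing here could
            # interleave with the compressor, so flush only when writing directly.
            if self.gzip_thread is None:
                self.file_out.flush()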
--- kgtk/join/edgejoiner.py | 10 +++++++++- kgtk/join/kgtkwriter.py | 4 ++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/kgtk/join/edgejoiner.py b/kgtk/join/edgejoiner.py index fcee3ee94..2b5e35555 100644 --- a/kgtk/join/edgejoiner.py +++ b/kgtk/join/edgejoiner.py @@ -97,7 +97,7 @@ def single_column_key_set(self, kr: EdgeReader, join_idx: int)->typing.Set[str]: def extract_join_key_set(self, file_path: Path, who: str)->typing.Set[str]: if self.verbose: - print("Extracting the %s join key set from %s" % (who, str(file_path)), flush=True) + print("Extracting the join key set from the %s input file: %s" % (who, str(file_path)), flush=True) kr: EdgeReader = EdgeReader.open_edge_file(file_path, short_line_action=self.short_line_action, long_line_action=self.long_line_action, @@ -113,15 +113,21 @@ def extract_join_key_set(self, file_path: Path, who: str)->typing.Set[str]: raise ValueError("The %s file is not an edge file" % who) join_idx: int = self.node1_column_idx(kr, who) + if self.verbose: + print("Joining on node1 (index %s in the %s input file)" % (join_idx, who)) # join_on_label and join_on_node2 may be specified if self.join_on_label or self.join_on_node2: if self.join_on_label: if kr.label_column_idx < 0: raise ValueError("join_on_label may not be used because the %s input file does not have a label column." % who) + if self.verbose: + print("Joining on label (index %s in the %s input file)" % (kr.label_column_idx, who)) if self.join_on_node2: if kr.node2_column_idx < 0: raise ValueError("join_on_node2 may not be used because the %s input file does not have a node2 column." % who) + if self.verbose: + print("Joining on node2 (index %s in the %s input file)" % (kr.node2_column_idx, who)) return self.multi_column_key_set(kr, join_idx) # closes er file else: # This uses optimized code: @@ -261,6 +267,8 @@ def process(self): ew.write(row) output_data_lines += 1 left_data_lines_kept += 1 + # Flush the output file so far: + ew.flush() if self.verbose: print("Processing the right input file: %s" % str(self.right_file_path), flush=True) diff --git a/kgtk/join/kgtkwriter.py b/kgtk/join/kgtkwriter.py index 2aac2f7e1..367b6f88f 100644 --- a/kgtk/join/kgtkwriter.py +++ b/kgtk/join/kgtkwriter.py @@ -289,6 +289,10 @@ def write(self, values: typing.List[str], sys.stdout.write(".") sys.stdout.flush() + def flush(self): + if self.gzip_thread is None: + self.file_out.flush() + def close(self): if self.gzip_thread is not None: self.gzip_thread.close() From 41378a836020592932588c205b867c53ddf79d7c Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Wed, 6 May 2020 18:10:49 -0700 Subject: [PATCH 099/278] Give feedback on the number of join keys. --- kgtk/join/edgejoiner.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/kgtk/join/edgejoiner.py b/kgtk/join/edgejoiner.py index 2b5e35555..b70101422 100644 --- a/kgtk/join/edgejoiner.py +++ b/kgtk/join/edgejoiner.py @@ -138,6 +138,7 @@ def join_key_sets(self)->typing.Optional[typing.Set[str]]: """ Read the input edge files the first time, building the sets of left and right join values. 
""" + join_key_set: typing.Set[str] if self.left_join and self.right_join: if self.verbose: print("Outer join, no need to compute join keys.", flush=True) @@ -145,19 +146,32 @@ def join_key_sets(self)->typing.Optional[typing.Set[str]]: elif self.left_join and not self.right_join: if self.verbose: print("Computing the left join key set", flush=True) - return self.extract_join_key_set(self.left_file_path, "left").copy() + join_key_set = self.extract_join_key_set(self.left_file_path, "left").copy() + if self.verbose: + print("There are %d keys in the left join key set." % len(join_key_set)) + return join_key_set elif self.right_join and not self.left_join: if self.verbose: print("Computing the right join key set", flush=True) - return self.extract_join_key_set(self.right_file_path, "right").copy() + join_key_set = self.extract_join_key_set(self.right_file_path, "right").copy() + if self.verbose: + print("There are %d keys in the right join key set." % len(join_key_set)) + return join_key_set else: if self.verbose: print("Computing the inner join key set", flush=True) left_join_key_set: typing.Set[str] = self.extract_join_key_set(self.left_file_path, "left") + if self.verbose: + print("There are %d keys in the left file key set." % len(left_join_key_set)) right_join_key_set: typing.Set[str] = self.extract_join_key_set(self.right_file_path, "right") - return left_join_key_set.intersection(right_join_key_set) + if self.verbose: + print("There are %d keys in the right file key set." % len(right_join_key_set)) + join_key_set = left_join_key_set.intersection(right_join_key_set) + if self.verbose: + print("There are %d keys in the inner join key set." % len(join_key_set)) + return joiin_key_set def merge_columns(self, left_kr: EdgeReader, right_kr: EdgeReader)->typing.Tuple[typing.List[str], typing.List[str]]: joined_column_names: typing.List[str] = [ ] From 5dfb02a3df62493be336a4ff1d203a342324cbd0 Mon Sep 17 00:00:00 2001 From: ckxz105 Date: Wed, 6 May 2020 19:00:59 -0700 Subject: [PATCH 100/278] vector embedding: add support for dimension value set, pca, property-value setting. Please refer to readme update for details. 
--- kgtk/cli/text_embedding.py | 104 ++++++----- kgtk/cli/text_embedding_README.md | 42 +++-- kgtk/gt/embedding_utils.py | 296 +++++++++++++++++------------- 3 files changed, 255 insertions(+), 187 deletions(-) diff --git a/kgtk/cli/text_embedding.py b/kgtk/cli/text_embedding.py index 6083da388..ed73266dd 100644 --- a/kgtk/cli/text_embedding.py +++ b/kgtk/cli/text_embedding.py @@ -130,36 +130,36 @@ def main(**kwargs): import re import argparse import pickle + from collections import defaultdict from kgtk.gt.embedding_utils import EmbeddingVector # get input parameters from kwargs output_uri = kwargs.get("output_uri", "") parallel_count = kwargs.get("parallel_count", "1") - black_list_files = kwargs.get("black_list_files", "") + black_list_files = kwargs.get("black_list_files", []) all_models_names = kwargs.get("all_models_names", ['bert-base-wikipedia-sections-mean-tokens']) input_format = kwargs.get("input_format", "kgtk_format") input_uris = kwargs.get("input_uris", []) output_format = kwargs.get("output_format", "kgtk_format") property_labels_files = kwargs.get("property_labels_file_uri", []) query_server = kwargs.get("query_server") - properties = dict() - all_property_relate_inputs = [kwargs.get("label_properties", ["label"]), - kwargs.get("description_properties", ["description"]), - kwargs.get("isa_properties", ["P31"]), - kwargs.get("has_properties", ["all"]), - ] - all_required_properties = ["label_properties", "description_properties", - "isa_properties", "has_properties"] - cache_config = {"use_cache": kwargs.get("use_cache", False), - "host": kwargs.get("cache_host", "dsbox01.isi.edu"), - "port": kwargs.get("cache_port", 6379) - } - for each_property, each_input in zip(all_required_properties, all_property_relate_inputs): - for each in each_input: - properties[each] = each_property + + cache_config = { + "use_cache": kwargs.get("use_cache", False), + "host": kwargs.get("cache_host", "dsbox01.isi.edu"), + "port": kwargs.get("cache_port", 6379) + } + + sentence_properties = { + "label_properties": kwargs.get("label_properties", ["label"]), + "description_properties": kwargs.get("description_properties", ["description"]), + "isa_properties": kwargs.get("isa_properties", ["P31"]), + "has_properties": kwargs.get("has_properties", ["all"]), + "property_values": kwargs.get("property_values", []) + } output_properties = { - "metatada_properties": kwargs.get("metatada_properties", []), + "metadata_properties": kwargs.get("metadata_properties", []), "output_properties": kwargs.get("output_properties", "text_embedding") } @@ -173,8 +173,8 @@ def main(**kwargs): raise KGTKException("No input file path given!") if output_uri == "": - output_uri = os.getenv("HOME") # os.getcwd() - if black_list_files != "": + output_uri = os.getenv("HOME") + if black_list_files: black_list_set = load_black_list_files(black_list_files) else: black_list_set = set() @@ -184,7 +184,8 @@ def main(**kwargs): else: property_labels_dict = {} - run_TSNE = kwargs.get("run_TSNE", True) + dimensional_reduction = kwargs.get("dimensional_reduction", "none") + dimension_val = kwargs.get("dimension_val", 2) for each_model_name in all_models_names: for each_input_file in input_uris: @@ -192,13 +193,14 @@ def main(**kwargs): process = EmbeddingVector(each_model_name, query_server=query_server, cache_config=cache_config, parallel_count=parallel_count) process.read_input(file_path=each_input_file, skip_nodes_set=black_list_set, - input_format=input_format, target_properties=properties, + input_format=input_format, 
                               target_properties=sentence_properties,
                               property_labels_dict=property_labels_dict)
             process.get_vectors()
             process.plot_result(output_properties=output_properties,
                                 input_format=input_format, output_uri=output_uri,
-                                run_TSNE=run_TSNE, output_format=output_format)
-            process.evaluate_result()
+                                dimensional_reduction=dimensional_reduction, dimension_val=dimension_val,
+                                output_format=output_format)
+            # process.evaluate_result()
             _logger.info("*" * 20 + "finished" + "*" * 20)
         except Exception as e:
             _logger.debug(e, exc_info=True)
@@ -212,17 +214,7 @@ def parser():
 
 def add_arguments(parser):
-    import argparse
-    def str2bool(v):
-        if isinstance(v, bool):
-            return v
-        if v.lower() in ('yes', 'true', 't', 'y', '1'):
-            return True
-        elif v.lower() in ('no', 'false', 'f', 'n', '0'):
-            return False
-        else:
-            raise argparse.ArgumentTypeError('Boolean value expected.')
-
+    from kgtk.gt.embedding_utils import str2bool
     parser.accept_shared_argument('_debug')
     # logging level, no longer need as there is a global --debug choice for it
     # parser.add_argument('-l', '--logging-level', action='store', dest='logging_level',
@@ -243,27 +235,32 @@ def str2bool(v):
                         help="the input file format, could either be `test_format` or `kgtk_format`, default is `kgtk_format`", )
     parser.add_argument('-p', '--property-labels-file', action='store', nargs='+', dest='property_labels_file_uri',
                         help="the path to the property labels file.", )
+
     # properties (only valid for kgtk format input/output data)
     parser.add_argument('--label-properties', action='store', nargs='+',
                         dest='label_properties', default=["label"],
-                        help="""The names of the eges for label properties, Default is ["label"]. \n
+                        help="""The names of the edges for label properties, Default is ["label"]. \n
                         This argument is only valid for input in kgtk format.""")
     parser.add_argument('--description-properties', action='store', nargs='+',
                         dest='description_properties', default=["description"],
-                        help="""The names of the eges for description properties, Default is ["description"].\n
+                        help="""The names of the edges for description properties, Default is ["description"].\n
                         This argument is only valid for input in kgtk format.""")
     parser.add_argument('--isa-properties', action='store', nargs='+',
                         dest='isa_properties', default=["P31"],
-                        help="""The names of the eges for `isa` properties, Default is ["P31"] (the `instance of` node in
-                        wikidata).\n This argument is only valid for input in kgtk format.""")
+                        help="""The names of the edges for `isa` properties, Default is ["P31"] (the `instance of` node in
+                        wikidata).""")
     parser.add_argument('--has-properties', action='store', nargs='+',
                         dest='has_properties', default=["all"],
-                        help="""The names of the eges for `has` properties, Default is ["all"] (will automatically append all
-                        properties found for each node).\n This argument is only valid for input in kgtk format.""")
+                        help="""The names of the edges for `has` properties, Default is ["all"] (will automatically append all
+                        properties found for each node).""")
+    parser.add_argument('--property-value', action='store', nargs='+',
+                        dest='property_values', default=[],
+                        help="""For those edges found in `has` properties, the nodes specified here will be displayed with the
+                        corresponding edge (property) values instead of the edge name. """)
     parser.add_argument('--output-property', action='store',
                         dest='output_properties', default="text_embedding",
-                        help="""The output property name used to record the embedding. Default is `output_properties`. \nThis
-                        argument is only valid for output in kgtk format.""")
+                        help="""The output property name used to record the embedding. Default is `text_embedding`. \n
+                        This argument is only valid for output in kgtk format.""")
     # output
     parser.add_argument('-o', '--embedding-projector-metadata-path', action='store', dest='output_uri', default="",
                         help="output path for the metadata file, default will be current user's home directory")
@@ -272,18 +269,27 @@ def str2bool(v):
                         help="output format, can either be `tsv_format` or `kgtk_format`. \nIf choosing `tsv_format`, the output "
                              "will be a tsv file, with each row containing only the vector representation of a node. Each "
                              "dimension is separated by a tab")
-    parser.add_argument('--embedding-projector-metatada', action='store', nargs='+',
-                        dest='metatada_properties', default=[],
+    parser.add_argument('--embedding-projector-metadata', action='store', nargs='+',
+                        dest='metadata_properties', default=[],
                         help="""list of properties used to construct a metadata file for use in the Google Embedding Projector:
                         http://projector.tensorflow.org. \n Default: the label and description of each node.""")
+
     # black list file
     parser.add_argument('-b', '--black-list', nargs='+', action='store', dest='black_list_files',
-                        default="",
+                        default=[],
                         help="the black list file, contains the Q nodes which should not be considered as candidates.")
-    # run tsne or not
-    parser.add_argument("--run-TSNE", type=str2bool, nargs='?', action='store',
-                        default=True, dest="run_TSNE",
-                        help="whether to run TSNE or not after the embedding, default is true.")
+
+    # dimensional reduction relate
+    parser.add_argument("--dimensional-reduction", nargs='?', action='store',
+                        default="none", dest="dimensional_reduction", choices=("pca", "tsne", "none"),
+                        help='whether to run a dimensional reduction algorithm or not after the embedding, default is none (not '
+                             'run). '
+                        )
+    parser.add_argument("--dimension", type=int, nargs='?', action='store',
+                        default=2, dest="dimension_val",
+                        help='How many dimensions should remain after reduction; only valid when dimensional '
+                             'reduction is enabled, default value is 2 '
+                        )
     parser.add_argument("--parallel", nargs='?', action='store',
                         default="1", dest="parallel_count",

diff --git a/kgtk/cli/text_embedding_README.md b/kgtk/cli/text_embedding_README.md
index a05bf3ba7..17753bd8c 100644
--- a/kgtk/cli/text_embedding_README.md
+++ b/kgtk/cli/text_embedding_README.md
@@ -22,22 +22,22 @@ kgtk text_embedding \
     --embedding-projector-path/ -o # optional, default is the home directory of current user
     --black-list/ -b # optional, default is None
     --logging-level/ -l \ # optional, default is `info`
-    --run-TSNE False # optional, default is True
+    --dimensional-reduction pca \ # optional, default is none
+    --dimension 5 \ # optional, default is 2
     --parallel 4 # optional, default is 1
```
##### Example 1:
For easiest running, just give the input file as `kgtk text_embedding -i input_file.csv`

##### Example 2:
-Running with more specific parameters and not run TSNE (output original embedding vectors):
+Running with more specific parameters and running TSNE afterwards to reduce the output dimension:
```
-kgtk text_embedding \
+kgtk text_embedding --debug \
    --input test_edges_file.tsv \
    --model bert-base-wikipedia-sections-mean-tokens bert-base-nli-cls-token \
    --label-properties P1449 P1559 \
    --description-properties P94 \
-   --logging-level debug \
-   --run-TSNE false
+   --dimensional-reduction tsne
```
##### Example 3:
Running with test format input and tsv output (for visualization at google embedding projector)
@@ -116,20 +116,40 @@ an ordered list of properties. When a property contains multiple values, the fir
 If not given, the program will try to use the default edge(property) name as `P279`. Those words in properties will be for vector embedding later.

##### --has-properties
an ordered list of properties. The output consists of a comma-separated text with the labels of the properties, using "and" for the last item, e.g., “country, place of birth, religion and canonization status”. If not given, the program will use all of the properties found for the node. Those words in properties will be for vector embedding later.

+##### --property-value
+If an edge listed in `has-properties` should be expanded into its values, specify the edge name here and the system will go further to get the property values of this node instead of using the name of the edge. Default is empty `[]`.
+For example: for wikidata node `Q41421` (Michael Jordan) and `P54` (member of sports team), if specified here, the generated sentence will be "Michael Jordan has Chicago Bulls" instead of "Michael Jordan has member of sports team".
+
##### --out-properties
the property used to record the embedding. If not given, the program will use the edge(property) name as `text_embedding`.
This option is only available when output format is set to `kgtk_format`.

+##### --property-labels-file
+This parameter only works for KGTK format input. In some conditions, a KGTK value is just a reference to another P node; in that case, the user needs to specify a separate label file for KGTK to read.
+For example, if run without the labels file on the wikidata dump file, we will get generated sentences like:
+`WALS genus code is a Q19847637, Q20824104, and has P1466 and P1468` (sentence generated for P1467). After adding the labels file, we will get the correct sentence: `WALS genus code is a Wikidata property for an identifier, Wikidata property for items about languages, and has WALS family code and WALS lect code`.
+This property labels file should also be a KGTK format file. One example file is [here](https://drive.google.com/open?id=1F7pb4LEx5MT1YTqycUCQcs8H2OWmBbB6 "here") (access available only to KGTK developers).
+
+
+#### Dimensional Reduction Algorithm
+
+##### --dimensional-reduction
+Users can choose whether to run a dimensional reduction algorithm to reduce the output vector dimensions. The default is not to run one.
+Currently 3 choices can be made:
+- `none`: do not run a dimensional reduction algorithm
+- `tsne`: run the TSNE algorithm; note that TSNE only works for certain dimension values
+- `pca`: run the PCA algorithm
+
+##### --dimension
+If a dimensional reduction algorithm is specified, the user can use this option to specify how many dimensions to keep for the final vector output.
+
### Output
There will be 2 kinds of files:

-##### --run-TSNE
-User can choose to whether run TSNE to reduce the dimension of the output vectors after getting the embeding vectors or not. The default is True.
-
##### Logger file
-User can set up the different logging level to records different infomation. Default is `warning` level. Available options are: `debug / info / warning / error / none`. If set to `none`, no logging file will generate.
+If the global parameter `--debug` is passed, an extra debugging logger file will be stored in the user's home directory.

##### Metadata File
User can specify where to store the metadata file for the vectors. If not given, the default is to save the metadata file at the user's home directory. If set to `none`, no metadata file will generate.
@@ -152,7 +172,7 @@ Third column is the embeded vecotrs.
You can also set up the parallel count to some number larger than 1 to run in multiprocess mode. Currently only supported for kgtk format input data. For example: `--parallel 4`

##### Reduced Embedding Vectors
This will have the embedded vector values after the dimensional reduction algorithm has reduced the dimensions for each Q node. This is used for visualization. (for example, you can view it at Google's online tools here: http://projector.tensorflow.org/)

3. Metadata for the generated vectors:
This will contain the metadata information for the Q nodes generated from the 2 files mentioned above. It will contain the Q node value of each vector, the type (it is a `candidate` or a `ground truth` node), the given label of the Q node and corresponding fetched description information from wikidata.
#### Query / cache related diff --git a/kgtk/gt/embedding_utils.py b/kgtk/gt/embedding_utils.py index f4fb45f5f..1e57491f2 100644 --- a/kgtk/gt/embedding_utils.py +++ b/kgtk/gt/embedding_utils.py @@ -9,13 +9,13 @@ import pickle import os import time +import argparse from pyrallel import ParallelProcessor -from sklearn.manifold import TSNE # type: ignore +from collections import defaultdict, OrderedDict from tqdm import tqdm # type: ignore from ast import literal_eval from sentence_transformers import SentenceTransformer, SentencesDataset, LoggingHandler, losses, models # type: ignore -from collections import defaultdict from SPARQLWrapper import SPARQLWrapper, JSON, POST, URLENCODED # type: ignore from kgtk.exceptions import KGTKException @@ -23,7 +23,7 @@ class EmbeddingVector: def __init__(self, model_name=None, query_server=None, cache_config: dict = None, parallel_count=1): self._logger = logging.getLogger(__name__) - if model_name is None: + if not model_name: self.model_name = 'bert-base-nli-mean-tokens' # xlnet need to be trained before using, we can't use this for now # elif model_name == "xlnet-base-cased": @@ -50,22 +50,20 @@ def __init__(self, model_name=None, query_server=None, cache_config: dict = None try: _ = self.redis_server.get("foo") self._logger.debug("Cache server {}:{} connected!".format(host, port)) - except: + except Exception as e: self._logger.error("Cache server {}:{} is not able to be connected! Will not use cache!".format(host, port)) + self._logger.debug(e, exc_info=True) self.redis_server = None else: self.redis_server = None self._parallel_count = int(parallel_count) self._logger.debug("Running with {} processes.".format(parallel_count)) - self.qnodes_descriptions = dict() self.vectors_map = dict() - self.property_labels_dict = dict() - self.q_node_to_label = dict() - self.node_labels = dict() + self.node_labels = dict() # this is used to store {node:label} pairs + self.candidates = defaultdict(dict) # this is used to store all node {node:dict()} information self.vectors_2D = None self.vector_dump_file = None self.gt_nodes = set() - self.candidates = defaultdict(dict) self.metadata = [] self.gt_indexes = set() self.input_format = "" @@ -115,20 +113,30 @@ def send_sparql_query(self, query_body: str): raise KGTKException(error_message) def _get_labels(self, nodes: typing.List[str]): - query_nodes = " ".join(["wd:{}".format(each) for each in nodes]) - query = """ - select ?item ?nodeLabel - where { - values ?item {""" + query_nodes + """} - ?item rdfs:label ?nodeLabel. - FILTER(LANG(?nodeLabel) = "en"). - } - """ - results2 = self.send_sparql_query(query) - for each_res in results2: - node_id = each_res['item']['value'].split("/")[-1] - value = each_res['nodeLabel']['value'] - self.node_labels[node_id] = value + nodes_need_query = set() + for each in nodes: + if each not in self.node_labels: + nodes_need_query.add(each) + if nodes_need_query: + query_nodes = " ".join(["wd:{}".format(each) for each in nodes_need_query]) + query = """ + select ?item ?nodeLabel + where { + values ?item {""" + query_nodes + """} + ?item rdfs:label ?nodeLabel. + FILTER(LANG(?nodeLabel) = "en"). 
+ } + """ + results2 = self.send_sparql_query(query) + for each_res in results2: + node_id = each_res['item']['value'].split("/")[-1] + nodes_need_query.remove(node_id) + value = each_res['nodeLabel']['value'] + self.node_labels[node_id] = value + + # for those nodes we can't find label, just add this to dict to prevent query again + for each_node in nodes_need_query: + self.node_labels[each_node] = each_node def _get_labels_and_descriptions(self, query_qnodes: str, need_find_label: bool, need_find_description: bool): query_body = """ @@ -154,83 +162,89 @@ def _get_labels_and_descriptions(self, query_qnodes: str, need_find_label: bool, if need_find_description: self.candidates[each_node]["description_properties"] = [description] - def _get_property_values(self, query_qnodes, query_part_names, query_part_properties): + def _get_property_values(self, query_qnodes: str, properties: dict, properties_reversed: dict): + """ + run sparql query to get corresponding property values of given q nodes + """ used_p_node_ids = set() - for part_name, part in zip(query_part_names, query_part_properties): + all_needed_properties = "" + for part_name, part in properties.items(): if part_name == "isa_properties": self._get_labels(part) - for i, each in enumerate(part): - if each not in {"label", "description", "all"}: - query_body2 = """ - select ?item ?eachPropertyLabel - where {{ - values ?item {{{all_nodes}}} - ?item wdt:{qnode} ?eachProperty. - SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }} - }} - """.format(all_nodes=query_qnodes, qnode=each) - results2 = self.send_sparql_query(query_body2) - - for each_res in results2: - node_id = each_res['item']['value'].split("/")[-1] - value = each_res['eachPropertyLabel']['value'] - if part_name == "isa_properties" and self.node_labels[each].endswith("of"): - value = self.node_labels[each] + "||" + value - used_p_node_ids.add(node_id) - if part_name in self.candidates[node_id]: - self.candidates[node_id][part_name].add(value) - else: - self.candidates[node_id][part_name] = {value} + + for each_node, role in properties_reversed.items(): + if role != {"has_properties"} and each_node not in {"label", "description", "all"}: + all_needed_properties += "wdt:{} ".format(each_node) + + query_body = """ + select ?item ?properties ?eachPropertyValueLabel + where {{ + values ?item {{{all_nodes}}} + values ?properties {{{properties}}} + ?item ?properties ?eachPropertyValue. + SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". 
}} + }} + """.format(all_nodes=query_qnodes, properties=all_needed_properties) + results = self.send_sparql_query(query_body) + + for each_res in results: + node_id = each_res['item']['value'].split("/")[-1] + node_property = each_res['properties']['value'].split("/")[-1] + roles = properties_reversed[node_property] + value = each_res['eachPropertyValueLabel']['value'] + if node_property in properties["isa_properties"] and self.node_labels[node_property].endswith("of"): + value = self.node_labels[node_property] + "||" + value + used_p_node_ids.add(node_property) + for each_role in roles: + if each_role != "property_values": + if each_role in self.candidates[node_id]: + self.candidates[node_id][each_role].add(value) + else: + self.candidates[node_id][each_role] = {value} return used_p_node_ids - def _get_all_properties(self, query_qnodes, used_p_node_ids, properties_list): - has_properties_set = set(properties_list[3]) + def _get_all_properties(self, query_qnodes: str, used_p_node_ids: set, properties: dict): + """ + run sparql query to get all properties of given q nodes + """ + has_properties_set = set(properties["has_properties"]) query_body3 = """ - select DISTINCT ?item ?p_entity ?p_entityLabel - where { - values ?item {""" + query_qnodes + """} - ?item ?p ?o. - FILTER regex(str(?p), "^http://www.wikidata.org/prop/P", "i") - BIND (IRI(REPLACE(STR(?p), "http://www.wikidata.org/prop", "http://www.wikidata.org/entity")) AS ?p_entity) . - SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". } - } - """ + select DISTINCT ?item ?p_entity ?p_entityLabel + where { + values ?item {""" + query_qnodes + """} + ?item ?p ?o. + FILTER regex(str(?p), "^http://www.wikidata.org/prop/P", "i") + BIND (IRI(REPLACE(STR(?p), "http://www.wikidata.org/prop", "http://www.wikidata.org/entity")) AS ?p_entity) . + SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". 
} + } + """ results3 = self.send_sparql_query(query_body3) for each in results3: node_name = each['item']['value'].split("/")[-1] p_node_id = each['p_entity']['value'].split("/")[-1] p_node_label = each['p_entityLabel']['value'] if p_node_id not in used_p_node_ids: - if properties_list[3] == ["all"] or p_node_id in has_properties_set: + if has_properties_set == {"all"} or p_node_id in has_properties_set: if "has_properties" in self.candidates[node_name]: self.candidates[node_name]["has_properties"].add(p_node_label) else: self.candidates[node_name]["has_properties"] = {p_node_label} - def get_item_description(self, qnodes: typing.List[str] = None, target_properties: dict = {}): + def get_item_description(self, target_properties: dict, properties_reversed: dict, + qnodes: typing.Union[set, typing.List[str]]): """ use sparql query to get the descriptions of given Q nodes """ - if qnodes is None: - qnodes = self.candidates - if "all" in target_properties: - find_all_properties = True - else: - find_all_properties = False - properties_list = [[] for _ in range(4)] - names = ["labels", "descriptions", "isa_properties", "has_properties"] - for k, v in target_properties.items(): - if v == "label_properties": - properties_list[0].append(k) - elif v == "description_properties": - properties_list[1].append(k) - elif v == "isa_properties": - properties_list[2].append(k) - elif v == "has_properties": - properties_list[3].append(k) + # find_all_properties = False + if "all" in properties_reversed: + # find_all_properties = True + _ = properties_reversed.pop("all") + self._logger.info("Need to find all properties.") hash_generator = hashlib.md5() - hash_generator.update(str(properties_list).encode('utf-8')) + # sort to ensure the hash key same + target_properties = OrderedDict(sorted(target_properties.items())) + hash_generator.update(str(target_properties).encode('utf-8')) properties_list_hash = "||" + str(hash_generator.hexdigest()) sentences_cache_dict = {} @@ -240,7 +254,7 @@ def get_item_description(self, qnodes: typing.List[str] = None, target_propertie cache_res = self.redis_server.get(cache_key) self._logger.debug("Cached key is: {}".format(cache_key)) if cache_res is not None: - self._logger.debug("Cache hitted {}".format(cache_key)) + self._logger.debug("Cache hit {}".format(cache_key)) sentences_cache_dict[each_node] = cache_res.decode("utf-8") self._logger.debug("Cached for those nodes {} / {}".format(len(sentences_cache_dict), len(qnodes))) @@ -254,31 +268,22 @@ def get_item_description(self, qnodes: typing.List[str] = None, target_propertie # only need to do query when we still have remained nodes if len(qnodes) > 0: - need_find_label = "label" in properties_list[0] - need_find_description = "description" in properties_list[1] query_qnodes = "" for each in qnodes: query_qnodes += "wd:{} ".format(each) + need_find_label = "label" in target_properties["label_properties"] + need_find_description = "description" in target_properties["description_properties"] # this is used to get corresponding labels / descriptions if need_find_label or need_find_description: self._get_labels_and_descriptions(query_qnodes, need_find_label, need_find_description) - if len(properties_list[3]) > len(qnodes): - # in this condition, we have too many properties need to be queried, it will waste time - # query to get all properties then filtering would save more times - find_all_properties = True - query_part2_names = names[:3] - query_part2_properties = properties_list[:3] - else: - query_part2_names = names - 
query_part2_properties = properties_list # this is used to get corresponding labels of properties values - used_p_node_ids = self._get_property_values(query_qnodes, query_part2_names, query_part2_properties) + used_p_node_ids = self._get_property_values(query_qnodes, target_properties, properties_reversed) # if need get all properties, we need to run extra query - if find_all_properties: - self._get_all_properties(query_qnodes, used_p_node_ids, properties_list) + # if find_all_properties: + self._get_all_properties(query_qnodes, used_p_node_ids, target_properties) for each_node_id in qnodes: each_sentence = self.attribute_to_sentence(self.candidates[each_node_id], each_node_id) @@ -294,14 +299,14 @@ def get_item_description(self, qnodes: typing.List[str] = None, target_propertie def _process_one(self, args): """ - one process for multiprocess calling - :param args: - :return: + one process for multiprocess calling, should not be used for any other function + :param args: args to receive from main process + :return: corresponding node vector and attribute """ node_id = args["node_id"] each_node_attributes = args["attribute"] - concated_sentence = self.attribute_to_sentence(each_node_attributes, node_id) - vectors = self.get_sentences_embedding([concated_sentence], [node_id])[0] + concat_sentence = self.attribute_to_sentence(each_node_attributes, node_id) + vectors = self.get_sentences_embedding([concat_sentence], [node_id])[0] return {"v_" + node_id: vectors, "c_" + node_id: each_node_attributes} def _multiprocess_collector(self, data): @@ -313,14 +318,19 @@ def _multiprocess_collector(self, data): k = k.replace("c_", "") self.candidates[k] = v - def read_input(self, file_path: str, skip_nodes_set: set = None, - input_format: str = "kgtk_format", target_properties: dict = {}, - property_labels_dict: dict = {}, black_list_set: set = set() + def read_input(self, file_path: str, target_properties: dict, property_labels_dict: dict, + skip_nodes_set: set = None, input_format: str = "kgtk_format", + black_list_set: typing.Optional[set] = None ): """ load the input candidates files """ - self.property_labels_dict = property_labels_dict + self.node_labels.update(property_labels_dict) + # reverse sentence property to be {property : role) + properties_reversed = defaultdict(set) + for k, v in target_properties.items(): + for each_property in v: + properties_reversed[each_property].add(k) if input_format == "test_format": self.input_format = input_format @@ -335,6 +345,7 @@ def read_input(self, file_path: str, skip_nodes_set: set = None, raise KGTKException("Can't find ground truth id column! 
It should either named as `GT_kg_id` or `kg_id`") for _, each in input_df.iterrows(): + temp = [] if isinstance(each["candidates"], str): temp = str(each['candidates']).split("|") elif each['candidates'] is np.nan or math.isnan(each['candidates']): @@ -355,20 +366,20 @@ def read_input(self, file_path: str, skip_nodes_set: set = None, temp.extend(gt_nodes) for each_q in temp: - self.q_node_to_label[each_q] = label + self.node_labels[each_q] = label if skip_nodes_set is not None and each_q in skip_nodes_set: to_remove_q.add(each_q) temp = set(temp) - to_remove_q count += len(temp) self.gt_nodes.add(each[gt_column_id]) - self.get_item_description(temp, target_properties) + self.get_item_description(target_properties, properties_reversed, temp) self._logger.info("Totally {} rows with {} candidates loaded.".format(str(len(gt)), str(count))) elif input_format == "kgtk_format": # assume the input edge file is sorted - if "all" in target_properties: - _ = target_properties.pop("all") + if "all" in properties_reversed: + _ = properties_reversed.pop("all") add_all_properties = True else: add_all_properties = False @@ -406,11 +417,14 @@ def read_input(self, file_path: str, skip_nodes_set: set = None, for each_line in f: each_line = each_line.replace("\n", "").split("\t") node_id = each_line[column_references["node"]] + # skip nodes id in black list + if black_list_set and node_id in black_list_set: + continue + node_property = each_line[column_references["property"]] node_value = each_line[column_references["value"]] # remove @ mark if "@" in node_value and node_value[0] != "@": - node_value_org = node_value node_value = node_value[:node_value.index("@")] # remove extra double quote " and single quote ' @@ -423,7 +437,7 @@ def read_input(self, file_path: str, skip_nodes_set: set = None, if current_process_node_id is None: current_process_node_id = node_id else: - # if we get to next id, concate all properties into one sentence to represent the Q node + # if we get to next id, concat all properties into one sentence to represent the Q node # for multi process if self._parallel_count > 1: @@ -431,35 +445,40 @@ def read_input(self, file_path: str, skip_nodes_set: set = None, pp.add_task(each_arg) # for single process else: - concated_sentence = self.attribute_to_sentence(each_node_attributes, current_process_node_id) - each_node_attributes["sentence"] = concated_sentence + concat_sentence = self.attribute_to_sentence(each_node_attributes, current_process_node_id) + each_node_attributes["sentence"] = concat_sentence self.candidates[current_process_node_id] = each_node_attributes - # after write down finish, we can cleaer and start parsing next one + # after write down finish, we can clear and start parsing next one each_node_attributes = {"has_properties": [], "isa_properties": [], "label_properties": [], "description_properties": []} # update to new id current_process_node_id = node_id - if node_property in target_properties: - each_node_attributes[target_properties[node_property]].append(node_value) + if node_property in properties_reversed: + roles = properties_reversed[node_property] + if "property_values" in roles: + node_value = self.get_real_label_name(node_value) + for each_role in roles: + if each_role != "property_values": + each_node_attributes[each_role].append(node_value) if add_all_properties and each_line[column_references["value"]][0] == "P": - each_node_attributes["has_properties"].append(node_value) + each_node_attributes["has_properties"].append(self.get_real_label_name(node_value)) # 
close multiprocess pool if self._parallel_count > 1: pp.task_done() pp.join() else: - raise KGTKException("Unkonwn input format {}".format(input_format)) + raise KGTKException("Unknown input format {}".format(input_format)) self._logger.info("Totally {} Q nodes loaded.".format(len(self.candidates))) self.vector_dump_file = "dump_vectors_{}_{}.pkl".format(file_path[:file_path.rfind(".")], self.model_name) # self._logger.debug("The cache file name will be {}".format(self.vector_dump_file)) def get_real_label_name(self, node): - if node in self.property_labels_dict: - return self.property_labels_dict[node] + if node in self.node_labels: + return self.node_labels[node] else: return node @@ -489,6 +508,7 @@ def attribute_to_sentence(self, attribute_dict: dict, node_id=None): concated_sentence += " is a " elif concated_sentence == "": concated_sentence += "It is a " + # remove last ", " concated_sentence += temp[:-2] if "has_properties" in attribute_dict and len(attribute_dict["has_properties"]) > 0: temp = [self.get_real_label_name(each) for each in attribute_dict["has_properties"]] @@ -580,9 +600,9 @@ def print_vector(self, vectors, output_properties: str = "text_embedding", outpu else: print(str(each_dimension) + "\n", end="") - def plot_result(self, output_properties={}, input_format="kgtk_format", + def plot_result(self, output_properties: dict, input_format="kgtk_format", output_uri: str = "", output_format="kgtk_format", - run_TSNE=True + dimensional_reduction="none", dimension_val=2 ): """ transfer the vectors to lower dimension so that we can plot @@ -590,12 +610,23 @@ def plot_result(self, output_properties={}, input_format="kgtk_format", """ self.vectors_map = {k: v for k, v in sorted(self.vectors_map.items(), key=lambda item: item[0], reverse=True)} vectors = list(self.vectors_map.values()) - # use TSNE to reduce dimension - if run_TSNE: - self._logger.warning("Start running TSNE to reduce dimension. It will take a long time.") + # reduce dimension if needed + if dimensional_reduction.lower() == "tsne": + self._logger.warning("Start running TSNE to reduce dimension. It will take some time.") + start = time.time() + from sklearn.manifold import TSNE # type: ignore + self.vectors_2D = TSNE(n_components=int(dimension_val), random_state=0).fit_transform(vectors) + self._logger.info("Totally used {} seconds.".format(time.time() - start)) + elif dimensional_reduction.lower() == "pca": + self._logger.warning("Start running PCA to reduce dimension. 
It will take some time.") start = time.time() - self.vectors_2D = TSNE(n_components=2, random_state=0).fit_transform(vectors) + from sklearn.decomposition import PCA # type: ignore + self.vectors_2D = PCA(n_components=int(dimension_val)).fit_transform(vectors) self._logger.info("Totally used {} seconds.".format(time.time() - start)) + elif dimensional_reduction.lower() == "none": + self._logger.info("Not run dimensional reduction algorithm.") + else: + raise KGTKException("Unknown or unsupport dimensional reduction type: {}".format(dimensional_reduction)) if input_format == "test_format": gt_indexes = set() @@ -605,7 +636,7 @@ def plot_result(self, output_properties={}, input_format="kgtk_format", self.metadata.append("Q_nodes\tType\tLabel\tDescription") for i, each in enumerate(self.vectors_map.keys()): - label = self.q_node_to_label[each] + label = self.node_labels[each] description = self.candidates[each]["sentence"] if i in gt_indexes: self.metadata.append("{}\tground_truth_node\t{}\t{}".format(each, label, description)) @@ -614,7 +645,7 @@ def plot_result(self, output_properties={}, input_format="kgtk_format", self.gt_indexes = gt_indexes elif input_format == "kgtk_format": - if len(output_properties.get("metatada_properties", [])) == 0: + if len(output_properties.get("metadata_properties", [])) == 0: for k, v in self.candidates.items(): label = v.get("label_properties", "") if len(label) > 0 and isinstance(label, list): @@ -624,7 +655,7 @@ def plot_result(self, output_properties={}, input_format="kgtk_format", description = description[0] self.metadata.append("{}\t\t{}\t{}".format(k, label, description)) else: - required_properties = output_properties["metatada_properties"] + required_properties = output_properties["metadata_properties"] self.metadata.append("node\t" + "\t".join(required_properties)) for k, v in self.candidates.items(): each_metadata = k + "\t" @@ -633,7 +664,7 @@ def plot_result(self, output_properties={}, input_format="kgtk_format", self.metadata.append(each_metadata) metadata_output_path = os.path.join(output_uri, self.vector_dump_file.split("/")[-1]) - if run_TSNE: + if self.vectors_2D is not None: self.print_vector(self.vectors_2D, output_properties.get("output_properties"), output_format) else: self.print_vector(vectors, output_properties.get("output_properties"), output_format) @@ -674,3 +705,14 @@ def calculate_distance(a, b): dist += (v1 - v2) ** 2 dist = dist ** 0.5 return dist + + +def str2bool(v): + if isinstance(v, bool): + return v + if v.lower() in ('yes', 'true', 't', 'y', '1'): + return True + elif v.lower() in ('no', 'false', 'f', 'n', '0'): + return False + else: + raise argparse.ArgumentTypeError('Boolean value expected.') From fd176e9e71923f1608ccc114c1a597cefbd00ff8 Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Wed, 6 May 2020 19:01:24 -0700 Subject: [PATCH 101/278] Add arbitrary join column names. 
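In outline, this patch replaces the fixed node1 join key with a key built from an arbitrary list of column indexes; a minimal sketch with illustrative names (not the exact patch code):
```
import typing

FIELD_SEPARATOR = "|"  # stands in for KgtkFormat.LIST_SEPARATOR

def build_join_key(row: typing.List[str], join_idx_list: typing.List[int]) -> str:
    # Concatenate the selected columns into a single hashable key.
    return FIELD_SEPARATOR.join(row[idx] for idx in join_idx_list)

# e.g. joining on node1 and label (column indexes 0 and 1):
assert build_join_key(["Q42", "P31", "Q5"], [0, 1]) == "Q42|P31"
```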
--- kgtk/join/edgejoiner.py | 120 +++++++++++++++++++++++++++------------- 1 file changed, 83 insertions(+), 37 deletions(-) diff --git a/kgtk/join/edgejoiner.py b/kgtk/join/edgejoiner.py index b70101422..e9924eb83 100644 --- a/kgtk/join/edgejoiner.py +++ b/kgtk/join/edgejoiner.py @@ -38,6 +38,10 @@ class EdgeJoiner(KgtkFormat): join_on_label: bool = attr.ib(validator=attr.validators.instance_of(bool), default=False) join_on_node2: bool = attr.ib(validator=attr.validators.instance_of(bool), default=False) + # TODO: Write fuill validators + left_join_columns: typing.Optional[typing.List[str]] = attr.ib(default=None) + right_join_columns: typing.Optional[typing.List[str]] = attr.ib(default=None) + # The prefix applied to right file column names in the output file: prefix: typing.Optional[str] = attr.ib(validator=attr.validators.optional(attr.validators.instance_of(str)), default=None) @@ -72,19 +76,24 @@ def node1_column_idx(self, kr: EdgeReader, who: str)->int: raise ValueError("EdgeJoiner: unknown node1 column index in KGTK %s edge type." % who) return idx - def build_join_key(self, kr: EdgeReader, join_idx: int, row: typing.List[str])->str: - key: str = row[join_idx] - if self.join_on_label: - key += self.field_separator+ row[kr.label_column_idx] - if self.join_on_node2: - key += self.field_separator+ row[kr.node2_column_idx] + def build_join_key(self, kr: EdgeReader, join_idx_list: typing.List[int], row: typing.List[str])->str: + key: str = "" + join_idx: int + first: bool = True + for join_idx in join_idx_list: + if first: + first = False + else: + key += self.field_separator + + key += row[join_idx] return key - def multi_column_key_set(self, kr: EdgeReader, join_idx: int)->typing.Set[str]: + def multi_column_key_set(self, kr: EdgeReader, join_idx_list: typing.List[int])->typing.Set[str]: result: typing.Set[str] = set() row: typing.List[str] for row in kr: - result.add(self.build_join_key(kr, join_idx, row)) + result.add(self.build_join_key(kr, join_idx_list, row)) return result # Optimized for a single join column: @@ -95,26 +104,25 @@ def single_column_key_set(self, kr: EdgeReader, join_idx: int)->typing.Set[str]: result.add(row[join_idx]) return result - def extract_join_key_set(self, file_path: Path, who: str)->typing.Set[str]: - if self.verbose: - print("Extracting the join key set from the %s input file: %s" % (who, str(file_path)), flush=True) - kr: EdgeReader = EdgeReader.open_edge_file(file_path, - short_line_action=self.short_line_action, - long_line_action=self.long_line_action, - fill_short_lines=self.fill_short_lines, - truncate_long_lines=self.truncate_long_lines, - value_options = self.value_options, - gzip_in_parallel=self.gzip_in_parallel, - error_limit=self.error_limit, - verbose=self.verbose, - very_verbose=self.very_verbose) + def build_join_idx_list(self, kr: EdgeReader, who: str, join_columns: typing.Optional[typing.List[str]])->typing.List[int]: + join_idx: int + join_idx_list: typing.List[int] = [ ] + col_num: int = 1 + if join_columns is not None and len(join_columns) > 0: + join_column:str + for join_column in join_columns: + if join_column not in kr.column_name_map: + raise ValueError("Join column %s not found in in the %s input file" % (join_column, who)) + join_idx = kr.column_name_map[join_column] + if self.verbose: + print("Join column %d: %s (index %d in the %s input file)" % (col_num, join_column, join_idx, who)) + join_idx_list.append(join_idx) + return join_idx_list - if not kr.is_edge_file: - raise ValueError("The %s file is not an edge file" % who) 
- - join_idx: int = self.node1_column_idx(kr, who) + join_idx = self.node1_column_idx(kr, who) if self.verbose: print("Joining on node1 (index %s in the %s input file)" % (join_idx, who)) + join_idx_list.append(join_idx) # join_on_label and join_on_node2 may be specified if self.join_on_label or self.join_on_node2: @@ -123,15 +131,42 @@ def extract_join_key_set(self, file_path: Path, who: str)->typing.Set[str]: raise ValueError("join_on_label may not be used because the %s input file does not have a label column." % who) if self.verbose: print("Joining on label (index %s in the %s input file)" % (kr.label_column_idx, who)) + join_idx_list.append(kr.label_column_idx) + if self.join_on_node2: if kr.node2_column_idx < 0: raise ValueError("join_on_node2 may not be used because the %s input file does not have a node2 column." % who) if self.verbose: print("Joining on node2 (index %s in the %s input file)" % (kr.node2_column_idx, who)) - return self.multi_column_key_set(kr, join_idx) # closes er file - else: + join_idx_list.append(kr.node2_column_idx) + return join_idx_list + + + def extract_join_key_set(self, file_path: Path, who: str, join_columns: typing.Optional[typing.List[str]])->typing.Set[str]: + if self.verbose: + print("Extracting the join key set from the %s input file: %s" % (who, str(file_path)), flush=True) + if join_columns is not None: + print("Using join columns: %s" % " ".join(join_columns)) + kr: EdgeReader = EdgeReader.open_edge_file(file_path, + short_line_action=self.short_line_action, + long_line_action=self.long_line_action, + fill_short_lines=self.fill_short_lines, + truncate_long_lines=self.truncate_long_lines, + value_options = self.value_options, + gzip_in_parallel=self.gzip_in_parallel, + error_limit=self.error_limit, + verbose=self.verbose, + very_verbose=self.very_verbose) + + if not kr.is_edge_file: + raise ValueError("The %s file is not an edge file" % who) + + join_idx_list: typing.List[int] = self.build_join_idx_list(kr, who, join_columns) + if len(join_idx_list) == 1: # This uses optimized code: - return self.single_column_key_set(kr, join_idx) # closes er file + return self.single_column_key_set(kr, join_idx_list[0]) # closes er file + else: + return self.multi_column_key_set(kr, join_idx_list) # closes er file def join_key_sets(self)->typing.Optional[typing.Set[str]]: @@ -146,7 +181,7 @@ def join_key_sets(self)->typing.Optional[typing.Set[str]]: elif self.left_join and not self.right_join: if self.verbose: print("Computing the left join key set", flush=True) - join_key_set = self.extract_join_key_set(self.left_file_path, "left").copy() + join_key_set = self.extract_join_key_set(self.left_file_path, "left", self.left_join_columns).copy() if self.verbose: print("There are %d keys in the left join key set." % len(join_key_set)) return join_key_set @@ -154,7 +189,7 @@ def join_key_sets(self)->typing.Optional[typing.Set[str]]: elif self.right_join and not self.left_join: if self.verbose: print("Computing the right join key set", flush=True) - join_key_set = self.extract_join_key_set(self.right_file_path, "right").copy() + join_key_set = self.extract_join_key_set(self.right_file_path, "right", self.right_join_columns).copy() if self.verbose: print("There are %d keys in the right join key set." 
% len(join_key_set)) return join_key_set @@ -162,16 +197,16 @@ def join_key_sets(self)->typing.Optional[typing.Set[str]]: else: if self.verbose: print("Computing the inner join key set", flush=True) - left_join_key_set: typing.Set[str] = self.extract_join_key_set(self.left_file_path, "left") + left_join_key_set: typing.Set[str] = self.extract_join_key_set(self.left_file_path, "left", self.left_join_columns) if self.verbose: print("There are %d keys in the left file key set." % len(left_join_key_set)) - right_join_key_set: typing.Set[str] = self.extract_join_key_set(self.right_file_path, "right") + right_join_key_set: typing.Set[str] = self.extract_join_key_set(self.right_file_path, "right", self.right_join_columns) if self.verbose: print("There are %d keys in the right file key set." % len(right_join_key_set)) join_key_set = left_join_key_set.intersection(right_join_key_set) if self.verbose: print("There are %d keys in the inner join key set." % len(join_key_set)) - return joiin_key_set + return join_key_set def merge_columns(self, left_kr: EdgeReader, right_kr: EdgeReader)->typing.Tuple[typing.List[str], typing.List[str]]: joined_column_names: typing.List[str] = [ ] @@ -236,6 +271,15 @@ def process(self): error_limit=self.error_limit) + # TODO: We ought to do this test sooner. + left_join_idx_list: typing.List[int] = self.build_join_idx_list(left_kr, "left", self.left_join_columns) + right_join_idx_list: typing.List[int] = self.build_join_idx_list(right_kr, "right", self.right_join_columns) + if len(left_join_idx_list) != len(right_join_idx_list): + print("the left join key has %d components, the right join key has %d columns. Exiting." % (len(left_join_idx_list), len(right_join_idx_list))) + left_kr.close() + right_kr.close() + return + if self.verbose: print("Mapping the column names for the join.", flush=True) joined_column_names: typing.List[str] @@ -268,7 +312,6 @@ def process(self): if self.verbose: print("Processing the left input file: %s" % str(self.left_file_path), flush=True) row: typing.list[str] - left_node1_idx: int = self.node1_column_idx(left_kr, who="left") for row in left_kr: left_data_lines_read += 1 if joined_key_set is None: @@ -276,7 +319,7 @@ def process(self): output_data_lines += 1 left_data_lines_kept += 1 else: - left_key: str = self.build_join_key(left_kr, left_node1_idx, row) + left_key: str = self.build_join_key(left_kr, left_join_idx_list, row) if left_key in joined_key_set: ew.write(row) output_data_lines += 1 @@ -287,7 +330,6 @@ def process(self): if self.verbose: print("Processing the right input file: %s" % str(self.right_file_path), flush=True) right_shuffle_list: typing.List[int] = ew.build_shuffle_list(right_column_names) - right_node1_idx: int = self.node1_column_idx(right_kr, who="right") for row in right_kr: right_data_lines_read += 1 if joined_key_set is None: @@ -295,7 +337,7 @@ def process(self): output_data_lines += 1 right_data_lines_kept += 1 else: - right_key: str = self.build_join_key(right_kr, right_node1_idx, row) + right_key: str = self.build_join_key(right_kr, right_join_idx_list, row) if right_key in joined_key_set: ew.write(row, shuffle_list=right_shuffle_list) output_data_lines += 1 @@ -325,6 +367,7 @@ def main(): parser.add_argument( "--join-on-node2", dest="join_on_node2", help="If both input files are edge files, include the node2 column in the join.", action='store_true') parser.add_argument( "--gzip-in-parallel", dest="gzip_in_parallel", help="Execute gzip in parallel.", action='store_true') parser.add_argument( "--left-join", 
dest="left_join", help="Perform a left outer join.", action='store_true') + parser.add_argument( "--left-join-columns", dest="left_join_columns", help="Left file join columns.", nargs='+') parser.add_argument( "--long-line-action", dest="long_line_action", help="The action to take when a long line is detected.", @@ -333,6 +376,7 @@ def main(): parser.add_argument("-o", "--output-file", dest="output_file_path", help="The KGTK file to read", type=Path, default=None) parser.add_argument( "--prefix", dest="prefix", help="The prefix applied to right file column names in the output file.") parser.add_argument( "--right-join", dest="right_join", help="Perform a right outer join.", action='store_true') + parser.add_argument( "--right-join-columns", dest="right_join_columns", help="Right file join columns.", nargs='+') parser.add_argument( "--short-line-action", dest="short_line_action", help="The action to take whe a short line is detected.", @@ -357,6 +401,8 @@ def main(): right_join=args.right_join, join_on_label=args.join_on_label, join_on_node2=args.join_on_node2, + left_join_columns=args.left_join_columns, + right_join_columns=args.right_join_columns, prefix=args.prefix, field_separator=args.field_separator, short_line_action=args.short_line_action, From 4b32b054144a4ba5e2fe082748bc10130f02f61d Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Wed, 6 May 2020 19:04:17 -0700 Subject: [PATCH 102/278] Rename the file join column arguments. --- kgtk/join/edgejoiner.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kgtk/join/edgejoiner.py b/kgtk/join/edgejoiner.py index e9924eb83..0d08725b7 100644 --- a/kgtk/join/edgejoiner.py +++ b/kgtk/join/edgejoiner.py @@ -366,8 +366,8 @@ def main(): parser.add_argument( "--join-on-label", dest="join_on_label", help="If both input files are edge files, include the label column in the join.", action='store_true') parser.add_argument( "--join-on-node2", dest="join_on_node2", help="If both input files are edge files, include the node2 column in the join.", action='store_true') parser.add_argument( "--gzip-in-parallel", dest="gzip_in_parallel", help="Execute gzip in parallel.", action='store_true') + parser.add_argument( "--left-file-join-columns", dest="left_join_columns", help="Left file join columns.", nargs='+') parser.add_argument( "--left-join", dest="left_join", help="Perform a left outer join.", action='store_true') - parser.add_argument( "--left-join-columns", dest="left_join_columns", help="Left file join columns.", nargs='+') parser.add_argument( "--long-line-action", dest="long_line_action", help="The action to take when a long line is detected.", @@ -375,8 +375,8 @@ def main(): parser.add_argument("-o", "--output-file", dest="output_file_path", help="The KGTK file to read", type=Path, default=None) parser.add_argument( "--prefix", dest="prefix", help="The prefix applied to right file column names in the output file.") + parser.add_argument( "--right-file-join-columns", dest="right_join_columns", help="Right file join columns.", nargs='+') parser.add_argument( "--right-join", dest="right_join", help="Perform a right outer join.", action='store_true') - parser.add_argument( "--right-join-columns", dest="right_join_columns", help="Right file join columns.", nargs='+') parser.add_argument( "--short-line-action", dest="short_line_action", help="The action to take whe a short line is detected.", From 165e42fd2d842a70953654ec57fc9a300bbbda37 Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Wed, 6 May 2020 19:26:25 -0700 
Subject: [PATCH 103/278] Join node files as well as edge files.

---
 kgtk/join/edgejoiner.py | 153 +++++++++++++++++++++++-----------------
 1 file changed, 90 insertions(+), 63 deletions(-)

diff --git a/kgtk/join/edgejoiner.py b/kgtk/join/edgejoiner.py
index 0d08725b7..4d9923c8d 100644
--- a/kgtk/join/edgejoiner.py
+++ b/kgtk/join/edgejoiner.py
@@ -15,7 +15,7 @@
 import typing
 
 from kgtk.join.enumnameaction import EnumNameAction
-from kgtk.join.edgereader import EdgeReader
+from kgtk.join.kgtkreader import KgtkReader
 from kgtk.join.kgtkformat import KgtkFormat
 from kgtk.join.kgtkwriter import KgtkWriter
 from kgtk.join.kgtkvalueoptions import KgtkValueOptions, DEFAULT_KGTK_VALUE_OPTIONS
@@ -62,21 +62,28 @@ class EdgeJoiner(KgtkFormat):
 
     gzip_in_parallel: bool = attr.ib(validator=attr.validators.instance_of(bool), default=False)
 
-    error_limit: int = attr.ib(validator=attr.validators.instance_of(int), default=EdgeReader.ERROR_LIMIT_DEFAULT)
+    error_limit: int = attr.ib(validator=attr.validators.instance_of(int), default=KgtkReader.ERROR_LIMIT_DEFAULT)
 
     verbose: bool = attr.ib(validator=attr.validators.instance_of(bool), default=False)
     very_verbose: bool = attr.ib(validator=attr.validators.instance_of(bool), default=False)
 
     FIELD_SEPARATOR_DEFAULT: str = KgtkFormat.LIST_SEPARATOR
 
-    def node1_column_idx(self, kr: EdgeReader, who: str)->int:
+    def node1_column_idx(self, kr: KgtkReader, who: str)->int:
         idx: int = kr.node1_column_idx
         if idx < 0:
             # TODO: throw a better exception
-            raise ValueError("EdgeJoiner: unknown node1 column index in KGTK %s edge type." % who)
+            raise ValueError("EdgeJoiner: unknown node1 column index in KGTK %s edge file." % who)
         return idx
 
-    def build_join_key(self, kr: EdgeReader, join_idx_list: typing.List[int], row: typing.List[str])->str:
+    def id_column_idx(self, kr: KgtkReader, who: str)->int:
+        idx: int = kr.id_column_idx
+        if idx < 0:
+            # TODO: throw a better exception
+            raise ValueError("EdgeJoiner: unknown id column index in KGTK %s node file."
% who) + return idx + + def build_join_key(self, kr: KgtkReader, join_idx_list: typing.List[int], row: typing.List[str])->str: key: str = "" join_idx: int first: bool = True @@ -89,7 +96,7 @@ def build_join_key(self, kr: EdgeReader, join_idx_list: typing.List[int], row: t key += row[join_idx] return key - def multi_column_key_set(self, kr: EdgeReader, join_idx_list: typing.List[int])->typing.Set[str]: + def multi_column_key_set(self, kr: KgtkReader, join_idx_list: typing.List[int])->typing.Set[str]: result: typing.Set[str] = set() row: typing.List[str] for row in kr: @@ -97,32 +104,42 @@ def multi_column_key_set(self, kr: EdgeReader, join_idx_list: typing.List[int])- return result # Optimized for a single join column: - def single_column_key_set(self, kr: EdgeReader, join_idx: int)->typing.Set[str]: + def single_column_key_set(self, kr: KgtkReader, join_idx: int)->typing.Set[str]: result: typing.Set[str] = set() row: typing.List[str] for row in kr: result.add(row[join_idx]) return result - def build_join_idx_list(self, kr: EdgeReader, who: str, join_columns: typing.Optional[typing.List[str]])->typing.List[int]: + def build_join_idx_list(self, kr: KgtkReader, who: str, join_columns: typing.Optional[typing.List[str]])->typing.List[int]: join_idx: int join_idx_list: typing.List[int] = [ ] col_num: int = 1 if join_columns is not None and len(join_columns) > 0: + if self.verbose: + print("Using %s file join columns: %s" % (who, " ".join(join_columns)), flush=True) join_column:str for join_column in join_columns: if join_column not in kr.column_name_map: raise ValueError("Join column %s not found in in the %s input file" % (join_column, who)) join_idx = kr.column_name_map[join_column] if self.verbose: - print("Join column %d: %s (index %d in the %s input file)" % (col_num, join_column, join_idx, who)) + print("Join column %d: %s (index %d in the %s input file)" % (col_num, join_column, join_idx, who), flush=True) join_idx_list.append(join_idx) return join_idx_list - join_idx = self.node1_column_idx(kr, who) - if self.verbose: - print("Joining on node1 (index %s in the %s input file)" % (join_idx, who)) - join_idx_list.append(join_idx) + if kr.is_edge_file: + join_idx = self.node1_column_idx(kr, who) + if self.verbose: + print("Joining on node1 (index %s in the %s input file)" % (join_idx, who), flush=True) + join_idx_list.append(join_idx) + elif kr.is_node_file: + join_idx = self.id_column_idx(kr, who) + if self.verbose: + print("Joining on id (index %s in the %s input file)" % (join_idx, who), flush=True) + join_idx_list.append(join_idx) + else: + raise ValueError("Unknown file type in build_join_idx_list(...)") # join_on_label and join_on_node2 may be specified if self.join_on_label or self.join_on_node2: @@ -130,38 +147,35 @@ def build_join_idx_list(self, kr: EdgeReader, who: str, join_columns: typing.Opt if kr.label_column_idx < 0: raise ValueError("join_on_label may not be used because the %s input file does not have a label column." % who) if self.verbose: - print("Joining on label (index %s in the %s input file)" % (kr.label_column_idx, who)) + print("Joining on label (index %s in the %s input file)" % (kr.label_column_idx, who), flush=True) join_idx_list.append(kr.label_column_idx) if self.join_on_node2: if kr.node2_column_idx < 0: raise ValueError("join_on_node2 may not be used because the %s input file does not have a node2 column." 
% who) if self.verbose: - print("Joining on node2 (index %s in the %s input file)" % (kr.node2_column_idx, who)) + print("Joining on node2 (index %s in the %s input file)" % (kr.node2_column_idx, who), flush=True) join_idx_list.append(kr.node2_column_idx) return join_idx_list - def extract_join_key_set(self, file_path: Path, who: str, join_columns: typing.Optional[typing.List[str]])->typing.Set[str]: + def extract_join_key_set(self, file_path: Path, who: str, join_idx_list: typing.List[int])->typing.Set[str]: if self.verbose: print("Extracting the join key set from the %s input file: %s" % (who, str(file_path)), flush=True) - if join_columns is not None: - print("Using join columns: %s" % " ".join(join_columns)) - kr: EdgeReader = EdgeReader.open_edge_file(file_path, - short_line_action=self.short_line_action, - long_line_action=self.long_line_action, - fill_short_lines=self.fill_short_lines, - truncate_long_lines=self.truncate_long_lines, - value_options = self.value_options, - gzip_in_parallel=self.gzip_in_parallel, - error_limit=self.error_limit, - verbose=self.verbose, - very_verbose=self.very_verbose) + kr: KgtkReader = KgtkReader.open(file_path, + short_line_action=self.short_line_action, + long_line_action=self.long_line_action, + fill_short_lines=self.fill_short_lines, + truncate_long_lines=self.truncate_long_lines, + value_options = self.value_options, + gzip_in_parallel=self.gzip_in_parallel, + error_limit=self.error_limit, + verbose=self.verbose, + very_verbose=self.very_verbose) if not kr.is_edge_file: raise ValueError("The %s file is not an edge file" % who) - - join_idx_list: typing.List[int] = self.build_join_idx_list(kr, who, join_columns) + if len(join_idx_list) == 1: # This uses optimized code: return self.single_column_key_set(kr, join_idx_list[0]) # closes er file @@ -169,7 +183,7 @@ def extract_join_key_set(self, file_path: Path, who: str, join_columns: typing.O return self.multi_column_key_set(kr, join_idx_list) # closes er file - def join_key_sets(self)->typing.Optional[typing.Set[str]]: + def join_key_sets(self, left_join_idx_list: typing.List[int], right_join_idx_list: typing.List[int])->typing.Optional[typing.Set[str]]: """ Read the input edge files the first time, building the sets of left and right join values. """ @@ -181,34 +195,34 @@ def join_key_sets(self)->typing.Optional[typing.Set[str]]: elif self.left_join and not self.right_join: if self.verbose: print("Computing the left join key set", flush=True) - join_key_set = self.extract_join_key_set(self.left_file_path, "left", self.left_join_columns).copy() + join_key_set = self.extract_join_key_set(self.left_file_path, "left", left_join_idx_list).copy() if self.verbose: - print("There are %d keys in the left join key set." % len(join_key_set)) + print("There are %d keys in the left join key set." % len(join_key_set), flush=True) return join_key_set elif self.right_join and not self.left_join: if self.verbose: print("Computing the right join key set", flush=True) - join_key_set = self.extract_join_key_set(self.right_file_path, "right", self.right_join_columns).copy() + join_key_set = self.extract_join_key_set(self.right_file_path, "right", right_join_idx_list).copy() if self.verbose: - print("There are %d keys in the right join key set." % len(join_key_set)) + print("There are %d keys in the right join key set." 
% len(join_key_set), flush=True) return join_key_set else: if self.verbose: print("Computing the inner join key set", flush=True) - left_join_key_set: typing.Set[str] = self.extract_join_key_set(self.left_file_path, "left", self.left_join_columns) + left_join_key_set: typing.Set[str] = self.extract_join_key_set(self.left_file_path, "left", left_join_idx_list) if self.verbose: - print("There are %d keys in the left file key set." % len(left_join_key_set)) - right_join_key_set: typing.Set[str] = self.extract_join_key_set(self.right_file_path, "right", self.right_join_columns) + print("There are %d keys in the left file key set." % len(left_join_key_set), flush=True) + right_join_key_set: typing.Set[str] = self.extract_join_key_set(self.right_file_path, "right", right_join_idx_list) if self.verbose: - print("There are %d keys in the right file key set." % len(right_join_key_set)) + print("There are %d keys in the right file key set." % len(right_join_key_set), flush=True) join_key_set = left_join_key_set.intersection(right_join_key_set) if self.verbose: - print("There are %d keys in the inner join key set." % len(join_key_set)) + print("There are %d keys in the inner join key set." % len(join_key_set), flush=True) return join_key_set - def merge_columns(self, left_kr: EdgeReader, right_kr: EdgeReader)->typing.Tuple[typing.List[str], typing.List[str]]: + def merge_columns(self, left_kr: KgtkReader, right_kr: KgtkReader)->typing.Tuple[typing.List[str], typing.List[str]]: joined_column_names: typing.List[str] = [ ] right_column_names: typing.List[str] = [ ] @@ -246,40 +260,48 @@ def merge_columns(self, left_kr: EdgeReader, right_kr: EdgeReader)->typing.Tuple return (joined_column_names, right_column_names) def process(self): - joined_key_set: typing.Optional[typing.Set[str]] = self.join_key_sets() - if self.verbose: print("Opening the left edge file: %s" % str(self.left_file_path), flush=True) - # Open the input files for the second time. This won't work with stdin. 
- left_kr: EdgeReader = EdgeReader.open_edge_file(self.left_file_path, - short_line_action=self.short_line_action, - long_line_action=self.long_line_action, - fill_short_lines=self.fill_short_lines, - truncate_long_lines=self.truncate_long_lines, - value_options = self.value_options, - error_limit=self.error_limit) + left_kr: KgtkReader = KgtkReader.open(self.left_file_path, + short_line_action=self.short_line_action, + long_line_action=self.long_line_action, + fill_short_lines=self.fill_short_lines, + truncate_long_lines=self.truncate_long_lines, + value_options = self.value_options, + error_limit=self.error_limit) if self.verbose: print("Opening the right edge file: %s" % str(self.right_file_path), flush=True) - right_kr: EdgeReader = EdgeReader.open_edge_file(self.right_file_path, - short_line_action=self.short_line_action, - long_line_action=self.long_line_action, - fill_short_lines=self.fill_short_lines, - truncate_long_lines=self.truncate_long_lines, - value_options = self.value_options, - error_limit=self.error_limit) - + right_kr: KgtkReader = KgtkReader.open(self.right_file_path, + short_line_action=self.short_line_action, + long_line_action=self.long_line_action, + fill_short_lines=self.fill_short_lines, + truncate_long_lines=self.truncate_long_lines, + value_options = self.value_options, + error_limit=self.error_limit) + + if left_kr.is_edge_file and right_kr.is_edge_file: + if self.verbose: + print("Both input files are edge files.", flush=True) + elif left_kr.is_node_file and right_kr.is_node_file: + if self.verbose: + print("Both input files are node files.", flush=True) + else: + print("Cannot join edge and node files.", flush=True) + return - # TODO: We ought to do this test sooner. left_join_idx_list: typing.List[int] = self.build_join_idx_list(left_kr, "left", self.left_join_columns) right_join_idx_list: typing.List[int] = self.build_join_idx_list(right_kr, "right", self.right_join_columns) if len(left_join_idx_list) != len(right_join_idx_list): - print("the left join key has %d components, the right join key has %d columns. Exiting." % (len(left_join_idx_list), len(right_join_idx_list))) + print("the left join key has %d components, the right join key has %d columns. Exiting." % (len(left_join_idx_list), len(right_join_idx_list)), flush=True) left_kr.close() right_kr.close() return + # This might open the input files for a second time. This won't work with stdin. + joined_key_set: typing.Optional[typing.Set[str]] = self.join_key_sets(left_join_idx_list, right_join_idx_list) + if self.verbose: print("Mapping the column names for the join.", flush=True) joined_column_names: typing.List[str] @@ -353,12 +375,17 @@ def process(self): def main(): """ Test the KGTK file joiner. + + Edge files can be joined to edge files. + Node files can also be joined to node files. + + TODO: Add more KgtkReader parameters, especially mode. 
""" parser = ArgumentParser() parser.add_argument(dest="left_file_path", help="The left KGTK file to join", type=Path) parser.add_argument(dest="right_file_path", help="The right KGTK file to join", type=Path) parser.add_argument( "--error-limit", dest="error_limit", - help="The maximum number of errors to report before failing", type=int, default=EdgeReader.ERROR_LIMIT_DEFAULT) + help="The maximum number of errors to report before failing", type=int, default=KgtkReader.ERROR_LIMIT_DEFAULT) parser.add_argument( "--field-separator", dest="field_separator", help="Separator for multifield keys", default=EdgeJoiner.FIELD_SEPARATOR_DEFAULT) parser.add_argument( "--fill-short-lines", dest="fill_short_lines", @@ -419,4 +446,4 @@ def main(): if __name__ == "__main__": main() - +a From f9dcf4b93ad17f7248f25476a3b7ad26bb6ddaef Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Wed, 6 May 2020 19:58:28 -0700 Subject: [PATCH 104/278] Add feedback. Add value filter controls. --- kgtk/join/edgejoiner.py | 7 +++++-- kgtk/join/ifexists.py | 43 +++++++++++++++++++++++++++++++++++++++-- 2 files changed, 46 insertions(+), 4 deletions(-) diff --git a/kgtk/join/edgejoiner.py b/kgtk/join/edgejoiner.py index 4d9923c8d..45a508827 100644 --- a/kgtk/join/edgejoiner.py +++ b/kgtk/join/edgejoiner.py @@ -1,5 +1,7 @@ """ -Join two KTKG edge files. The output file is an edge file. +Join two KTKG edge files or two KGTK node files. The output file is an edge file or a node file. + +TODO: rename this to KgtkJoiner. Note: This implementation builds im-memory sets of all the key values in each input file. @@ -46,6 +48,7 @@ class EdgeJoiner(KgtkFormat): prefix: typing.Optional[str] = attr.ib(validator=attr.validators.optional(attr.validators.instance_of(str)), default=None) # The field separator used in multifield joins. The KGHT list character should be safe. + # TODO: USE THE COLUMN SEPARATOR !!!!! field_separator: str = attr.ib(validator=attr.validators.instance_of(str), default=KgtkFormat.LIST_SEPARATOR) # Ignore records with too many or too few fields? @@ -446,4 +449,4 @@ def main(): if __name__ == "__main__": main() -a + diff --git a/kgtk/join/ifexists.py b/kgtk/join/ifexists.py index 206cb52a9..88b772ffc 100644 --- a/kgtk/join/ifexists.py +++ b/kgtk/join/ifexists.py @@ -54,6 +54,10 @@ class IfExists(KgtkFormat): fill_short_lines: bool = attr.ib(validator=attr.validators.instance_of(bool), default=False) truncate_long_lines: bool = attr.ib(validator=attr.validators.instance_of(bool), default=False) + # TODO: find a working validator + # value_options: typing.Optional[KgtkValueOptions] = attr.ib(attr.validators.optional(attr.validators.instance_of(KgtkValueOptions)), default=None) + value_options: typing.Optional[KgtkValueOptions] = attr.ib(default=None) + gzip_in_parallel: bool = attr.ib(validator=attr.validators.instance_of(bool), default=False) verbose: bool = attr.ib(validator=attr.validators.instance_of(bool), default=False) @@ -119,23 +123,39 @@ def extract_key_set(self, kr: KgtkReader, who: str, key_columns: typing.List[int def process(self): # Open the input files once. 
+ if self.verbose: + print("Opening the left input file: %s" % self.left_file_path, flush=True) left_kr: KgtkReader = KgtkReader.open(self.left_file_path, short_line_action=self.short_line_action, long_line_action=self.long_line_action, fill_short_lines=self.fill_short_lines, - truncate_long_lines=self.truncate_long_lines) + truncate_long_lines=self.truncate_long_lines, + value_options = self.value_options) + if self.verbose: + print("Opening the right input file: %s" % self.right_file_path, flush=True) right_kr: KgtkReader = KgtkReader.open(self.right_file_path, short_line_action=self.short_line_action, long_line_action=self.long_line_action, fill_short_lines=self.fill_short_lines, - truncate_long_lines=self.truncate_long_lines) + truncate_long_lines=self.truncate_long_lines, + value_options = self.value_options) left_key_columns: typing.List[int] = self.get_key_columns(self.left_keys, left_kr, right_kr, "left") right_key_columns: typing.List[int] = self.get_key_columns(self.right_keys, right_kr, left_kr, "right") + if len(left_key_columns) != len(right_key_columns): + print("There are %d left key columns but %d right key columns. Exiting." % (len(left_key_columns), len(right_key_columns)), flush=True) + return + + if self.verbose: + print("Building the input key set from %s" % self.right_file_path, flush=True) key_set: typint.Set[str] = self.extract_key_set(right_kr, "right", right_key_columns) + if self.verbose: + print("There are %d entries in the key set." % len(key_set)) + if self.verbose: + print("Opening the output file: %s" % self.output_path, flush=True) ew: KgtkWriter = KgtkWriter.open(left_kr.column_names, self.output_path, require_all_columns=False, @@ -145,12 +165,22 @@ def process(self): verbose=self.verbose, very_verbose=self.very_verbose) + if self.verbose: + print("Filtering records from %s" % self.left_file_path, flush=True) + input_line_count: int = 0 + output_line_count: int = 0; + row: typing.list[str] for row in left_kr: + input_line_count += 1 left_key: str = self.build_key(row, left_key_columns) if left_key in key_set: ew.write(line) + output_line_count += 1 ew.close() + + if self.verbose: + print("Read %d records, write %d records." % (input_line_count, output_line_count)) def main(): """ @@ -162,6 +192,9 @@ def main(): parser.add_argument(dest="right_file_path", help="The right KGTK file to join", type=Path) + parser.add_argument( "--error-limit", dest="error_limit", + help="The maximum number of errors to report before failing", type=int, default=KgtkReader.ERROR_LIMIT_DEFAULT) + parser.add_argument( "--field-separator", dest="field_separator", help="Separator for multifield keys", default=IfExists.FIELD_SEPARATOR_DEFAULT) parser.add_argument( "--fill-short-lines", dest="fill_short_lines", @@ -190,8 +223,13 @@ def main(): parser.add_argument( "--very-verbose", dest="very_verbose", help="Print additional progress messages.", action='store_true') + KgtkValueOptions.add_arguments(parser) + args = parser.parse_args() + # Build the value parsing option structure. 
+ value_options: KgtkValueOptions = KgtkValueOptions.from_args(args) + ie: IfExists = IfExists(left_file_path=args.left_file_path, right_file_path=args.right_file_path, output_path=args.output_file_path, @@ -202,6 +240,7 @@ def main(): long_line_action=args.long_line_action, fill_short_lines=args.fill_short_lines, truncate_long_lines=args.truncate_long_lines, + value_options=value_options, gzip_in_parallel=args.gzip_in_parallel, verbose=args.verbose, very_verbose=args.very_verbose) From ed54b635e4503bdebe26ff07f0a01eb853eac1a9 Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Wed, 6 May 2020 20:02:58 -0700 Subject: [PATCH 105/278] Add missing import. Correct output name. --- kgtk/join/edgejoiner.py | 2 +- kgtk/join/ifexists.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/kgtk/join/edgejoiner.py b/kgtk/join/edgejoiner.py index 45a508827..b2484a547 100644 --- a/kgtk/join/edgejoiner.py +++ b/kgtk/join/edgejoiner.py @@ -20,7 +20,7 @@ from kgtk.join.kgtkreader import KgtkReader from kgtk.join.kgtkformat import KgtkFormat from kgtk.join.kgtkwriter import KgtkWriter -from kgtk.join.kgtkvalueoptions import KgtkValueOptions, DEFAULT_KGTK_VALUE_OPTIONS +from kgtk.join.kgtkvalueoptions import KgtkValueOptions from kgtk.join.validationaction import ValidationAction @attr.s(slots=True, frozen=True) diff --git a/kgtk/join/ifexists.py b/kgtk/join/ifexists.py index 88b772ffc..90cf2d1c9 100644 --- a/kgtk/join/ifexists.py +++ b/kgtk/join/ifexists.py @@ -28,6 +28,7 @@ from kgtk.join.kgtkformat import KgtkFormat from kgtk.join.kgtkreader import KgtkReader from kgtk.join.kgtkwriter import KgtkWriter +from kgtk.join.kgtkvalueoptions import KgtkValueOptions from kgtk.join.validationaction import ValidationAction @attr.s(slots=True, frozen=True) @@ -175,7 +176,7 @@ def process(self): input_line_count += 1 left_key: str = self.build_key(row, left_key_columns) if left_key in key_set: - ew.write(line) + ew.write(row) output_line_count += 1 ew.close() From 9cddcc3bac8b23a19f98e8ade258f1c90f885633 Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Wed, 6 May 2020 20:07:55 -0700 Subject: [PATCH 106/278] Pass the error_limit through. 
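The pattern, sketched with illustrative names rather than the actual KGTK classes: the limit becomes a validated attrs attribute with a default and is forwarded to each reader that gets opened:
```
import attr

ERROR_LIMIT_DEFAULT = 1000  # stand-in for KgtkReader.ERROR_LIMIT_DEFAULT

@attr.s(slots=True, frozen=True)
class FilterSketch:
    error_limit: int = attr.ib(validator=attr.validators.instance_of(int),
                               default=ERROR_LIMIT_DEFAULT)

    def open_reader(self, open_fn, path):
        # Forward the limit, as the patch does for each KgtkReader.open call.
        return open_fn(path, error_limit=self.error_limit)
```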
--- kgtk/join/ifexists.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/kgtk/join/ifexists.py b/kgtk/join/ifexists.py index 90cf2d1c9..c5ce50257 100644 --- a/kgtk/join/ifexists.py +++ b/kgtk/join/ifexists.py @@ -61,6 +61,8 @@ class IfExists(KgtkFormat): gzip_in_parallel: bool = attr.ib(validator=attr.validators.instance_of(bool), default=False) + error_limit: int = attr.ib(validator=attr.validators.instance_of(int), default=KgtkReader.ERROR_LIMIT_DEFAULT) + verbose: bool = attr.ib(validator=attr.validators.instance_of(bool), default=False) very_verbose: bool = attr.ib(validator=attr.validators.instance_of(bool), default=False) @@ -131,7 +133,11 @@ def process(self): long_line_action=self.long_line_action, fill_short_lines=self.fill_short_lines, truncate_long_lines=self.truncate_long_lines, - value_options = self.value_options) + value_options = self.value_options, + error_limit=self.error_limit, + verbose=self.verbose, + very_verbose=self.very_verbose, + ) if self.verbose: print("Opening the right input file: %s" % self.right_file_path, flush=True) @@ -140,7 +146,11 @@ def process(self): long_line_action=self.long_line_action, fill_short_lines=self.fill_short_lines, truncate_long_lines=self.truncate_long_lines, - value_options = self.value_options) + value_options = self.value_options, + error_limit=self.error_limit, + verbose=self.verbose, + very_verbose=self.very_verbose, + ) left_key_columns: typing.List[int] = self.get_key_columns(self.left_keys, left_kr, right_kr, "left") right_key_columns: typing.List[int] = self.get_key_columns(self.right_keys, right_kr, left_kr, "right") @@ -243,6 +253,7 @@ def main(): truncate_long_lines=args.truncate_long_lines, value_options=value_options, gzip_in_parallel=args.gzip_in_parallel, + error_limit=args.error_limit, verbose=args.verbose, very_verbose=args.very_verbose) From 719bbfe12eb14cab937df9a91ab771e6aee950c2 Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Wed, 6 May 2020 20:20:42 -0700 Subject: [PATCH 107/278] Unify edgejoiner and nodejoiner. --- kgtk/join/{edgejoiner.py => kgtkjoiner.py} | 12 +- kgtk/join/nodejoiner.py | 245 --------------------- 2 files changed, 5 insertions(+), 252 deletions(-) rename kgtk/join/{edgejoiner.py => kgtkjoiner.py} (98%) delete mode 100644 kgtk/join/nodejoiner.py diff --git a/kgtk/join/edgejoiner.py b/kgtk/join/kgtkjoiner.py similarity index 98% rename from kgtk/join/edgejoiner.py rename to kgtk/join/kgtkjoiner.py index b2484a547..d512d7a0b 100644 --- a/kgtk/join/edgejoiner.py +++ b/kgtk/join/kgtkjoiner.py @@ -1,8 +1,6 @@ """ Join two KTKG edge files or two KGTK node files. The output file is an edge file or a node file. -TODO: rename this to KgtkJoiner. - Note: This implementation builds im-memory sets of all the key values in each input file. @@ -24,7 +22,7 @@ from kgtk.join.validationaction import ValidationAction @attr.s(slots=True, frozen=True) -class EdgeJoiner(KgtkFormat): +class KgtkJoiner(KgtkFormat): left_file_path: Path = attr.ib(validator=attr.validators.instance_of(Path)) right_file_path: Path = attr.ib(validator=attr.validators.instance_of(Path)) output_path: typing.Optional[Path] = attr.ib(validator=attr.validators.optional(attr.validators.instance_of(Path))) @@ -76,14 +74,14 @@ def node1_column_idx(self, kr: KgtkReader, who: str)->int: idx: int = kr.node1_column_idx if idx < 0: # TODO: throw a better exception - raise ValueError("EdgeJoiner: unknown node1 column index in KGTK %s edge file." 
% who) + raise ValueError("KgtkJoiner: unknown node1 column index in KGTK %s edge file." % who) return idx def id_column_idx(self, kr: KgtkReader, who: str)->int: idx: int = kr.id_column_idx if idx < 0: # TODO: throw a better exception - raise ValueError("EdgeJoiner: unknown id column index in KGTK %s node file." % who) + raise ValueError("KgtkJoiner: unknown id column index in KGTK %s node file." % who) return idx def build_join_key(self, kr: KgtkReader, join_idx_list: typing.List[int], row: typing.List[str])->str: @@ -390,7 +388,7 @@ def main(): parser.add_argument( "--error-limit", dest="error_limit", help="The maximum number of errors to report before failing", type=int, default=KgtkReader.ERROR_LIMIT_DEFAULT) - parser.add_argument( "--field-separator", dest="field_separator", help="Separator for multifield keys", default=EdgeJoiner.FIELD_SEPARATOR_DEFAULT) + parser.add_argument( "--field-separator", dest="field_separator", help="Separator for multifield keys", default=KgtkJoiner.FIELD_SEPARATOR_DEFAULT) parser.add_argument( "--fill-short-lines", dest="fill_short_lines", help="Fill missing trailing columns in short lines with empty values.", action='store_true') parser.add_argument( "--join-on-label", dest="join_on_label", help="If both input files are edge files, include the label column in the join.", action='store_true') @@ -424,7 +422,7 @@ def main(): # Build the value parsing option structure. value_options: KgtkValueOptions = KgtkValueOptions.from_args(args) - ej: EdgeJoiner = EdgeJoiner(left_file_path=args.left_file_path, + ej: KgtkJoiner = KgtkJoiner(left_file_path=args.left_file_path, right_file_path=args.right_file_path, output_path=args.output_file_path, left_join=args.left_join, diff --git a/kgtk/join/nodejoiner.py b/kgtk/join/nodejoiner.py deleted file mode 100644 index 97ce6029d..000000000 --- a/kgtk/join/nodejoiner.py +++ /dev/null @@ -1,245 +0,0 @@ -""" -Join two KTKG edge files. The output file is an edge file. - -Note: This implementation builds im-memory sets of all the key values in -each input file. - -""" - -from argparse import ArgumentParser -import attr -import gzip -from pathlib import Path -from multiprocessing import Queue -import sys -import typing - -from kgtk.join.enumnameaction import EnumNameAction -from kgtk.join.kgtkformat import KgtkFormat -from kgtk.join.kgtkwriter import KgtkWriter -from kgtk.join.nodereader import NodeReader -from kgtk.join.validationaction import ValidationAction - -@attr.s(slots=True, frozen=True) -class NodeJoiner(KgtkFormat): - left_file_path: Path = attr.ib(validator=attr.validators.instance_of(Path)) - right_file_path: Path = attr.ib(validator=attr.validators.instance_of(Path)) - output_path: typing.Optional[Path] = attr.ib(validator=attr.validators.optional(attr.validators.instance_of(Path))) - - # left_join == False and right_join == False: inner join - # left_join == True and right_join == False: left join - # left_join == False and right_join == True: right join - # left_join = True and right_join == True: outer join - left_join: bool = attr.ib(validator=attr.validators.instance_of(bool), default=False) - right_join: bool = attr.ib(validator=attr.validators.instance_of(bool), default=False) - - # The prefix applied to right file column names in the output file: - prefix: typing.Optional[str] = attr.ib(validator=attr.validators.optional(attr.validators.instance_of(str)), default=None) - - # The field separator used in multifield joins. The KGHT list character should be safe. 
- field_separator: str = attr.ib(validator=attr.validators.instance_of(str), default=KgtkFormat.LIST_SEPARATOR) - - # Ignore records with too many or too few fields? - short_line_action: ValidationAction = attr.ib(validator=attr.validators.instance_of(ValidationAction), default=ValidationAction.EXCLUDE) - long_line_action: ValidationAction = attr.ib(validator=attr.validators.instance_of(ValidationAction), default=ValidationAction.EXCLUDE) - - # Require or fill trailing fields? - fill_short_lines: bool = attr.ib(validator=attr.validators.instance_of(bool), default=False) - truncate_long_lines: bool = attr.ib(validator=attr.validators.instance_of(bool), default=False) - - gzip_in_parallel: bool = attr.ib(validator=attr.validators.instance_of(bool), default=False) - - verbose: bool = attr.ib(validator=attr.validators.instance_of(bool), default=False) - very_verbose: bool = attr.ib(validator=attr.validators.instance_of(bool), default=False) - - FIELD_SEPARATOR_DEFAULT: str = KgtkFormat.LIST_SEPARATOR - - def id_column_idx(self, kr: NodeReader, who: str)->int: - idx: int = kr.id_column_idx - if idx < 0: - # TODO: throw a better exception - raise ValueError("NodeJoiner: unknown node1 column index in KGTK %s edge type." % who) - return idx - - def single_column_key_set(self, kr: NodeReader, join_idx: int)->typing.Set[str]: - result: typing.Set[str] = set() - row: typing.List[str] - for row in kr: - result.add(row[join_idx]) - return result - - def extract_join_key_set(self, file_path: Path, who: str)->typing.Set[str]: - kr: NodeReader = NodeReader.open_node_file(file_path, - short_line_action=self.short_line_action, - long_line_action=self.long_line_action, - fill_short_lines=self.fill_short_lines, - truncate_long_lines=self.truncate_long_lines, - gzip_in_parallel=self.gzip_in_parallel, - verbose=self.verbose, - very_verbose=self.very_verbose) - - if not kr.is_node_file: - raise ValueError("The %s file is not a node file" % who) - - join_idx: int = self.id_column_idx(kr, who) - return self.single_column_key_set(kr, join_idx) # closes er file - - - def join_key_sets(self)->typing.Set[str]: - """ - Read the input edge files the first time, building the sets of left and right join values. - """ - left_join_key_set: typing.Set[str] = self.extract_join_key_set(self.left_file_path, "left") - right_join_key_set: typing.Set[str] = self.extract_join_key_set(self.right_file_path, "right") - - joined_key_set: typing.Set[str] - if self.left_join and self.right_join: - # TODO: This joins everything! We can shortut computing these sets. - joined_key_set = left_join_key_set.union(right_join_key_set) - elif self.left_join and not self.right_join: - joined_key_set = left_join_key_set.copy() - elif self.right_join and not self.left_join: - joined_key_set = right_join_key_set.copy() - else: - joined_key_set = left_join_key_set.intersection(right_join_key_set) - return joined_key_set - - def merge_columns(self, left_kr: NodeReader, right_kr: NodeReader)->typing.Tuple[typing.List[str], typing.List[str]]: - joined_column_names: typing.List[str] = [ ] - right_column_names: typing.List[str] = [ ] - - # First step: copy the left column names. - column_name: str - for column_name in left_kr.column_names: - joined_column_names.append(column_name) - - idx: int = 0 - for column_name in right_kr.column_names: - if idx == right_kr.id_column_idx: - # The right file is an edge file and this is its node1 column index. - if left_kr.id_column_idx >= 0: - # The left file has a node1 column. Map to that. 
- column_name = left_kr.column_names[left_kr.id_column_idx] - else: - # Apparently we don't have a destination in the left file. Punt. - raise ValueError("Can't map right join column name to the left file #2.") - elif idx == right_kr.label_column_idx and left_kr.label_column_idx >= 0: - # Map the right file's label column to the left file's label column. - column_name = left_kr.column_names[left_kr.label_column_idx] - elif idx == right_kr.node2_column_idx and left_kr.node2_column_idx >= 0: - # Map the right file's node2 column to the left file's node2 column. - column_name = left_kr.column_names[left_kr.node2_column_idx] - else: - # Apply the prefix. - if self.prefix is not None and len(self.prefix) > 0: - column_name = self.prefix + column_name - - right_column_names.append(column_name) - if column_name not in joined_column_names: - joined_column_names.append(column_name) - idx += 1 - - return (joined_column_names, right_column_names) - - def process(self): - joined_key_set: typing.Set[str] = self.join_key_sets() - - # Open the input files for the second time. This won't work with stdin. - left_kr: NodeReader = NodeReader.open_node_file(self.left_file_path, - short_line_action=self.short_line_action, - long_line_action=self.long_line_action, - fill_short_lines=self.fill_short_lines, - truncate_long_lines=self.truncate_long_lines) - - right_kr: NodeReader = NodeReader.open_node_file(self.right_file_path, - short_line_action=self.short_line_action, - long_line_action=self.long_line_action, - fill_short_lines=self.fill_short_lines, - truncate_long_lines=self.truncate_long_lines) - - # Map the right column names for the join: - joined_column_names: typing.List[str] - right_column_names: typing.List[str] - (joined_column_names, right_column_names) = self.merge_columns(left_kr, right_kr) - - if self.verbose: - print(" left columns: %s" % " ".join(left_kr.column_names)) - print(" right columns: %s" % " ".join(right_kr.column_names)) - print("mapped right columns: %s" % " ".join(right_column_names)) - print(" joined columns: %s" % " ".join(joined_column_names)) - - ew: KgtkWriter = KgtkWriter.open(joined_column_names, - self.output_path, - require_all_columns=False, - prohibit_extra_columns=True, - fill_missing_columns=True, - gzip_in_parallel=self.gzip_in_parallel, - verbose=self.verbose, - very_verbose=self.very_verbose) - - row: typing.list[str] - left_node1_idx: int = self.id_column_idx(left_kr, who="left") - for row in left_kr: - left_key: str = row[left_node1_idx] - if left_key in joined_key_set: - ew.write(row) - - right_shuffle_list: typing.List[int] = ew.build_shuffle_list(right_column_names) - right_node1_idx: int = self.id_column_idx(right_kr, who="right") - for row in right_kr: - right_key: str = row[right_node1_idx] - if right_key in joined_key_set: - ew.write(row, shuffle_list=right_shuffle_list) - - ew.close() - -def main(): - """ - Test the KGTK file joiner. 
- """ - parser = ArgumentParser() - parser.add_argument(dest="left_file_path", help="The left KGTK file to join", type=Path) - parser.add_argument(dest="right_file_path", help="The right KGTK file to join", type=Path) - parser.add_argument( "--field-separator", dest="field_separator", help="Separator for multifield keys", default=NodeJoiner.FIELD_SEPARATOR_DEFAULT) - parser.add_argument( "--fill-short-lines", dest="fill_short_lines", - help="Fill missing trailing columns in short lines with empty values.", action='store_true') - parser.add_argument( "--gzip-in-parallel", dest="gzip_in_parallel", help="Execute gzip in parallel.", action='store_true') - parser.add_argument( "--left-join", dest="left_join", help="Perform a left outer join.", action='store_true') - - parser.add_argument( "--long-line-action", dest="long_line_action", - help="The action to take when a long line is detected.", - type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXCLUDE) - - parser.add_argument( "--short-line-action", dest="short_line_action", - help="The action to take whe a short line is detected.", - type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXCLUDE) - - parser.add_argument("-o", "--output-file", dest="output_file_path", help="The KGTK file to read", type=Path, default=None) - parser.add_argument( "--prefix", dest="prefix", help="The prefix applied to right file column names in the output file.") - parser.add_argument( "--right-join", dest="right_join", help="Perform a right outer join.", action='store_true') - parser.add_argument( "--truncate-long-lines", dest="truncate_long_lines", - help="Remove excess trailing columns in long lines.", action='store_true') - parser.add_argument("-v", "--verbose", dest="verbose", help="Print additional progress messages.", action='store_true') - parser.add_argument( "--very-verbose", dest="very_verbose", help="Print additional progress messages.", action='store_true') - args = parser.parse_args() - - nj: NodeJoiner = NodeJoiner(left_file_path=args.left_file_path, - right_file_path=args.right_file_path, - output_path=args.output_file_path, - left_join=args.left_join, - right_join=args.right_join, - prefix=args.prefix, - field_separator=args.field_separator, - short_line_action=args.short_line_action, - long_line_action=args.long_line_action, - fill_short_lines=args.fill_short_lines, - truncate_long_lines=args.truncate_long_lines, - gzip_in_parallel=args.gzip_in_parallel, - verbose=args.verbose, - very_verbose=args.very_verbose) - - nj.process() - -if __name__ == "__main__": - main() - From af869671b4bf59ee6feb626e7b436da890272590 Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Thu, 7 May 2020 11:44:37 -0700 Subject: [PATCH 108/278] Offer convenience iterators. --- kgtk/join/kgtkreader.py | 212 +++++++++++++++++++++++++++++++++------- 1 file changed, 178 insertions(+), 34 deletions(-) diff --git a/kgtk/join/kgtkreader.py b/kgtk/join/kgtkreader.py index 6b73c8f0e..546974d57 100644 --- a/kgtk/join/kgtkreader.py +++ b/kgtk/join/kgtkreader.py @@ -495,14 +495,9 @@ def exclude_line(self, action: ValidationAction, msg: str, line: str)->bool: raise ValueError("Too many data errors, exiting.") return result - # This is both and iterable and an iterator object. - def __iter__(self)->typing.Iterator[typing.List[str]]: - return self - # Get the next edge values as a list of strings. - # TODO: Convert integers, coordinates, etc. 
to Python types - def __next__(self)-> typing.List[str]: - values: typing.List[str] + def nextrow(self)-> typing.List[str]: + row: typing.List[str] # This loop accomodates lines that are ignored. while (True): @@ -541,42 +536,45 @@ def __next__(self)-> typing.List[str]: if self.exclude_line(self.whitespace_line_action, "saw a whitespace line", line): continue - values = line.split(self.column_separator) + row = line.split(self.column_separator) - # Optionally fill missing trailing columns with empty values: - if self.fill_short_lines and len(values) < self.column_count: - while len(values) < self.column_count: - values.append("") + # Optionally fill missing trailing columns with empty row: + if self.fill_short_lines and len(row) < self.column_count: + while len(row) < self.column_count: + row.append("") # Optionally remove extra trailing columns: - if self.truncate_long_lines and len(values) > self.column_count: - values = values[:self.column_count] + if self.truncate_long_lines and len(row) > self.column_count: + row = row[:self.column_count] # Optionally validate that the line contained the right number of columns: # # When we report line numbers in error messages, line 1 is the first line after the header line. - if self.short_line_action != ValidationAction.PASS and len(values) < self.column_count: + if self.short_line_action != ValidationAction.PASS and len(row) < self.column_count: if self.exclude_line(self.short_line_action, "Required %d columns, saw %d: '%s'" % (self.column_count, - len(values), + len(row), line), line): continue - if self.long_line_action != ValidationAction.PASS and len(values) > self.column_count: + if self.long_line_action != ValidationAction.PASS and len(row) > self.column_count: if self.exclude_line(self.long_line_action, "Required %d columns, saw %d (%d extra): '%s'" % (self.column_count, - len(values), - len(values) - self.column_count, + len(row), + len(row) - self.column_count, line), line): continue - if self._ignore_if_blank_fields(values, line): + if self._ignore_if_blank_fields(row, line): continue if self.invalid_value_action != ValidationAction.PASS: - if self._ignore_invalid_values(values, line): + # TODO: find a way to optionally cache the KgtkValue objects + # so we don't have to create them a second time in the conversion + # and iterator methods below. + if self._ignore_invalid_values(row, line): continue self.data_lines_passed += 1 @@ -584,7 +582,165 @@ def __next__(self)-> typing.List[str]: sys.stdout.write(".") sys.stdout.flush() - return values + return row + + # This is both and iterable and an iterator object. + def __iter__(self)->typing.Iterator[typing.List[str]]: + return self + + # Get the next row values as a list of strings. + # TODO: Convert integers, coordinates, etc. to Python types + def __next__(self)-> typing.List[str]: + return self.nextrow() + + def concise(self)->typing.Iterator[typing.List[typing.Optional[str]]]: + """ + Using a generator function, create an iterator that returns rows of fields + as strings. Empty fields will be returned as None. + + """ + while True: + # self.nextrow() will throw StopIteration when done. 
+ row: typing.List[str] = self.nextrow() + + # Copy the row, converting empty fields into None: + results: typing.List[typing.Optional[str]] = [ ] + field: str + for field in row: + if len(field) == 0: + results.append(None) + else: + results.append(field) + yield results + + + def to_kgtk_values(self, row: typing.List[str], validate: bool = False)->typing.List[KgtkValue]: + """ + Convert an input row into a list of KgtkValue instances. + + When validate is True, validate each KgtkValue object. + """ + options: KgtkValueOptions = self.value_options if self.value_options is not None else DEFAULT_KGTK_VALUE_OPTIONS + results: typing.List[KgtkValue] = [ ] + field: str + for field in row: + kv = KgtkValue(field, options=options) + if validate: + kv.validate() + results.append(kv) + return results + + def kgtk_values(self, validate: bool = False)->typing.Iterator[typing.List[KgtkValue]]: + """ + Using a generator function, create an iterator that returns rows of fields + as KgtkValue objects. + + When validate is True, validate each KgtkValue object. + """ + while True: + # self.nextrow() will throw StopIteration when done. + yield self.to_kgtk_values(self.nextrow(), validate=validate) + + def to_concise_kgtk_values(self, row: typing.List[str], validate: bool = False)->typing.List[typing.Optional[KgtkValue]]: + """ + Convert an input row into a list of KgtkValue instances. Empty fields will be returned as None. + + When validate is True, validate each KgtkValue object. + """ + options: KgtkValueOptions = self.value_options if self.value_options is not None else DEFAULT_KGTK_VALUE_OPTIONS + results: typing.List[typing.Optional[KgtkValue]] = [ ] + field: str + for field in row: + if len(field) == 0: + results.append(None) + else: + kv = KgtkValue(field, options=options) + if validate: + kv.validate() + results.append(kv) + return results + + def concise_kgtk_values(self, validate: bool = False)->typing.Iterator[typing.List[typing.Optional[KgtkValue]]]: + """ + Using a generator function, create an iterator that returns rows of fields + as KgtkValue objects, with empty fields returned as None. + + When validate is True, validate each KgtkValue object. + """ + while True: + # self.nextrow() will throw StopIteration when done. + yield self.to_concise_kgtk_values(self.nextrow(), validate=validate) + + def to_dict(self, row: typing.List[str], concise: bool=False)->typing.Mapping[str, str]: + """ + Convert an input row into a dict of named fields. + + If concise is True, then empty fields will be skipped. + """ + results: typing.MutableMapping[str, str] = { } + field: str + idx: int = 0 + + # We'll use two seperate loops in anticipation of a modest + # efficiency gain. + if concise: + for field in row: + if len(field) > 0: + results[self.column_names[idx]] = field + idx += 1 + else: + for field in row: + results[self.column_names[idx]] = field + idx += 1 + return results + + def dicts(self, concise: bool=False)->typing.Iterator[typing.Mapping[str, str]]: + """ + Using a generator function, create an iterator that returns each row as a dict of named fields. + + If concise is True, then empty fields will be skipped. + + """ + while True: + # self.nextrow() will throw StopIteration when done. + yield self.to_dict(self.nextrow(), concise=concise) + + def to_kgtk_value_dict(self, row: typing.List[str], validate: bool=False, concise: bool=False)->typing.Mapping[str, KgtkValue]: + """ + Convert an input row into a dict of named fields. + + If concise is True, then empty fields will be skipped. 
+ + When validate is True, validate each KgtkValue object. + """ + options: KgtkValueOptions = self.value_options if self.value_options is not None else DEFAULT_KGTK_VALUE_OPTIONS + results: typing.MutableMapping[str, KgtkValue] = { } + idx: int = 0 + field: str + for field in row: + if concise and len(field) == 0: + pass # Skip the empty field. + else: + kv = KgtkValue(field, options=options) + if validate: + kv.validate() + results[self.column_names[idx]] = kv + idx += 1 + return results + + def kgtk_value_dicts(self, validate: bool=False, concise: bool=False)->typing.Iterator[typing.Mapping[str, KgtkValue]]: + """ + Using a generator function, create an iterator that returns each row as a + dict of named KgtkValue objects. + + If concise is True, then empty fields will be skipped. + + When validate is True, validate each KgtkValue object. + """ + while True: + # self.nextrow() will throw StopIteration when done. + yield self.to_kgtk_value_dict(self.nextrow(), validate=validate, concise=concise) + def _ignore_invalid_values(self, values: typing.List[str], line: str)->bool: """Give a row of values, validate each value. If we find one or more @@ -644,18 +800,6 @@ def merge_columns(self, additional_columns: typing.List[str])->typing.List[str]: return merged_columns - def to_map(self, row: typing.List[str])->typing.Mapping[str, str]: - """ - Convert an input line into a named map of fields. - """ - result: typing.MutableMapping[str, str] = { } - value: str - idx: int = 0 - for value in row: - result[self.column_names[idx]] = value - idx += 1 - return result - @classmethod def add_shared_arguments(cls, parser: ArgumentParser): parser.add_argument(dest="kgtk_file", help="The KGTK file to read", type=Path, nargs="?") From 6604bc99a140bf98af6690cae219df147135c52c Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Thu, 7 May 2020 12:19:35 -0700 Subject: [PATCH 109/278] Must catch and StopIteration and return in a generator. Add iterator tests. --- kgtk/join/kgtkreader.py | 95 +++++++++++++++++++++++++++++++++++------ 1 file changed, 81 insertions(+), 14 deletions(-) diff --git a/kgtk/join/kgtkreader.py b/kgtk/join/kgtkreader.py index 546974d57..c1dd3bbe4 100644 --- a/kgtk/join/kgtkreader.py +++ b/kgtk/join/kgtkreader.py @@ -593,15 +593,17 @@ def __iter__(self)->typing.Iterator[typing.List[str]]: def __next__(self)-> typing.List[str]: return self.nextrow() - def concise(self)->typing.Iterator[typing.List[typing.Optional[str]]]: + def concise_rows(self)->typing.Iterator[typing.List[typing.Optional[str]]]: """ Using a generator function, create an iterator that returns rows of fields as strings. Empty fields will be returned as None. """ while True: - # self.nextrow() will throw StopIteration when done. - row: typing.List[str] = self.nextrow() + try: + row: typing.List[str] = self.nextrow() + except StopIteration: + return # Copy the row, converting empty fields into None: results: typing.List[typing.Optional[str]] = [ ] @@ -638,8 +640,10 @@ def kgtk_values(self, validate: bool = False)->typing.Iterator[typing.List[KgtkV When validate is True, validate each KgtkValue object. """ while True: - # self.nextrow() will throw StopIteration when done. 
- yield self.to_kgtk_values(self.nextrow(), validate=validate) + try: + yield self.to_kgtk_values(self.nextrow(), validate=validate) + except StopIteration: + return def to_concise_kgtk_values(self, row: typing.List[str], validate: bool = False)->typing.List[typing.Optional[KgtkValue]]: """ @@ -668,8 +672,10 @@ def concise_kgtk_values(self, validate: bool = False)->typing.Iterator[typing.Li When validate is True, validate each KgtkValue object. """ while True: - # self.nextrow() will throw StopIteration when done. - yield self.to_concise_kgtk_values(self.nextrow(), validate=validate) + try: + yield self.to_concise_kgtk_values(self.nextrow(), validate=validate) + except StopIteration: + return def to_dict(self, row: typing.List[str], concise: bool=False)->typing.Mapping[str, str]: """ @@ -702,8 +708,10 @@ def dicts(self, concise: bool=False)->typing.Iterator[typing.Mapping[str, str]]: """ while True: - # self.nextrow() will throw StopIteration when done. - yield self.to_dict(self.nextrow(), concise=concise) + try: + yield self.to_dict(self.nextrow(), concise=concise) + except StopIteration: + return def to_kgtk_value_dict(self, row: typing.List[str], validate: bool=False, concise: bool=False)->typing.Mapping[str, KgtkValue]: """ @@ -738,9 +746,10 @@ def kgtk_value_dicts(self, validate: bool=False, concise: bool=False)->typing.It When validate is True, validate each KgtkValue object. """ while True: - # self.nextrow() will throw StopIteration when done. - yield self.to_kgtk_value_dict(self.nextrow(), validate=validate, concise=concise) - + try: + yield self.to_kgtk_value_dict(self.nextrow(), validate=validate, concise=concise) + except StopIteration: + return def _ignore_invalid_values(self, values: typing.List[str], line: str)->bool: """Give a row of values, validate each value. 
If we find one or more @@ -893,6 +902,14 @@ def main(): EdgeReader.add_arguments(parser) NodeReader.add_arguments(parser) KgtkValueOptions.add_arguments(parser) + + parser.add_argument( "--test", dest="test_method", help="The test to perform", + choices=["rows", "concise-rows", + "kgtk-values", "concise-kgtk-values", + "dicts", "concise-dicts", + "kgtk-value-dicts", "concise-kgtk-value-dicts"], + default="rows") + parser.add_argument( "--test-valdate", dest="test_validate", help="Validate KgtkValue objects in test.", action='store_true') args = parser.parse_args() error_file: typing.TextIO = sys.stdout if args.errors_to_stdout else sys.stderr @@ -929,8 +946,58 @@ def main(): line_count: int = 0 row: typing.List[str] - for row in kr: - line_count += 1 + kgtk_values: typing.List[KgtkValue] + concise_kgtk_values: typing.List[typing.Optional[KgtkValue]] + dict_row: typing.Mapping[str, str] + kgtk_value_dict: typing.Mapping[str, str] + if args.test_method == "rows": + if args.verbose: + print("Testing iterating over rows.", flush=True) + for row in kr: + line_count += 1 + + elif args.test_method == "concise-rows": + if args.verbose: + print("Testing iterating over concise rows.", flush=True) + for row in kr.concise_rows(): + line_count += 1 + + elif args.test_method == "kgtk-values": + if args.verbose: + print("Testing iterating over KgtkValue rows.", flush=True) + for kgtk_values in kr.kgtk_values(validate=args.test_validate): + line_count += 1 + + elif args.test_method == "concise-kgtk-values": + if args.verbose: + print("Testing iterating over concise KgtkValue rows.", flush=True) + for kgtk_values in kr.concise_kgtk_values(validate=args.test_validate): + line_count += 1 + + elif args.test_method == "dicts": + if args.verbose: + print("Testing iterating over dicts.", flush=True) + for dict_row in kr.dicts(): + line_count += 1 + + elif args.test_method == "concise-dicts": + if args.verbose: + print("Testing iterating over concise dicts.", flush=True) + for dict_row in kr.dicts(concise=True): + line_count += 1 + + elif args.test_method == "kgtk-value-dicts": + if args.verbose: + print("Testing iterating over KgtkValue dicts.", flush=True) + for kgtk_value_dict in kr.kgtk_value_dicts(validate=args.test_validate): + line_count += 1 + + elif args.test_method == "concise-kgtk-value-dicts": + if args.verbose: + print("Testing iterating over concise KgtkValue dicts.", flush=True) + for kgtk_value_dict in kr.kgtk_value_dicts(concise=True, validate=args.test_validate): + line_count += 1 + print("Read %d lines" % line_count, file=error_file, flush=True) if __name__ == "__main__": From 6bc54bb6f77f6ce8e5a953b83048752da56089bb Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Thu, 7 May 2020 12:25:16 -0700 Subject: [PATCH 110/278] Add documentation on the available iterators. --- kgtk/join/kgtkreader.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/kgtk/join/kgtkreader.py b/kgtk/join/kgtkreader.py index c1dd3bbe4..1b2c321e8 100644 --- a/kgtk/join/kgtkreader.py +++ b/kgtk/join/kgtkreader.py @@ -1,7 +1,19 @@ -""" -Read a KGTK node or edge file in TSV format. +"""Read a KGTK node or edge file in TSV format. + +Normally, results are obtained as rows of string values obtained by iteration +on the KgtkReader object. 
Alternative iterators are available to return the results +as: + + * concise_rows: lists of strings with empty fields converted to None + * kgtk_values: lists of KgtkValue objects + * concise_kgtk_values: lists of KgtkValue objects with empty fields converted to None + * dicts: dicts of strings + * dicts(concise=True): dicts of strings with empty fields omitted + * kgtk_value_dicts: dicts of KgtkValue objects + * kgtk_value_dicts(concise=True): dicts of KgtkValue objects with empty fields omitted TODO: Add support for alternative envelope formats, such as JSON. + """ from argparse import ArgumentParser From 5e9b44ebb9adef799b45756b2fc8b13408a78e80 Mon Sep 17 00:00:00 2001 From: ckxz105 Date: Thu, 7 May 2020 13:06:16 -0700 Subject: [PATCH 111/278] update property-value template --- kgtk/cli/text_embedding_README.md | 14 ++++++++------ kgtk/gt/embedding_utils.py | 18 +++++++++++++----- 2 files changed, 21 insertions(+), 11 deletions(-) diff --git a/kgtk/cli/text_embedding_README.md b/kgtk/cli/text_embedding_README.md index 17753bd8c..a2b431016 100644 --- a/kgtk/cli/text_embedding_README.md +++ b/kgtk/cli/text_embedding_README.md @@ -13,7 +13,7 @@ kgtk text_embedding \ --model/ -m \ # optional, default is `bert-base-wikipedia-sections-mean-tokens` --label-properties \ # optional, default is ["label"] --description-properties \ # optional, default is ["description"] - --isa-properties \ # optional, default is ["P279"] + --isa-properties \ # optional, default is ["P31"] --has-properties \ # optional, default is ["all"] --property-labels-file/ -p \ #optional --output-format # optional, default is `kgtk_format` @@ -113,15 +113,16 @@ If not given, the program will try to use the default edge(property) name as `de ##### --isa-properties an ordered list of properties. When a property contains multiple values, the first value will selected. When a property value is not a literal, output the label of the property value. When multiple isa-properties are present, the values are output comma-separated. -If not given, the program will try to use the default edge(property) name as `P279`. Those words in properties will be for vector embedding later. +If not given, the program will try to use the default edge(property) name as `P31`. Those words in properties will be for vector embedding later. ##### --has-properties an ordered list of properties. The output consists of a comma-separated text with the labels of the properties, using and for the last item, e.g., “country, place of birth, religion and canonization status” . If not given, the program will use all of the found properties found for the node. Those words in properties will be for vector embedding later. ##### --property-value -If the properties in `has-properties` is a property which need to check for details, specify the edge name here and the system will go further to get the property values of this node instead of use the name of this edge. Default is empty `[]` -For example: For wikidata node `Q41421` (Michael Jordan) `P544` (member of sports team), if specified here, the generated sentence will be "Michael Jordan has Chicago Bulls" instead of "Michael Jordan has member of sports team". +If the properties in `has-properties` is a property which need to check for details, specify the edge name here and the system will go further to get the property values of this node instead of use the name of this edge (using template `{property} {value}`) instead of `{property}` to represent this has-property). 
Default is empty `[]`
+
+For example: For wikidata node `Q41421` (Michael Jordan) `P544` (member of sports team), if specified here, the generated sentence will be `Michael Jordan, ..., has member of sports team Chicago Bulls` instead of `Michael Jordan,..., has member of sports team`.
 
 ##### --out-properties
 the property used to record the embedding. If not given, the program will use the edge(property) name as `text_embedding`.
@@ -129,9 +130,10 @@ This option is only available when output format is set to `kgtk_format`.
 
 ##### --property-labels-file
 This parameter only works for KGTK format input. For some condition, KGTK format's value is just a reference to another P node. In this condition, user need to specify another label file for KGTK to read.
+
 For example, if run without the labels file on the wikidata dump file, we will get some generated sentence like:
-`WALS genus code is a Q19847637, Q20824104, and has P1466 and P1468` (sentence generated for P1467). After add the labels file, we will get the correct sentence as: `WALS genus code is a Wikidata property for an identifier, Wikidata property for items about languages, and has WALS family code and WALS lect code`.
-This property labels file should also be a KGTK format file. One example file is [here](https://drive.google.com/open?id=1F7pb4LEx5MT1YTqycUCQcs8H2OWmBbB6 "here") (accessed only available for KGTK developers).
+`WALS genus code is a Q19847637, Q20824104, and has P1855 and P2302` (sentence generated for P1467). After adding the labels file, we will get the correct sentence as: `WALS genus code is a Wikidata property for an identifier, Wikidata property for items about languages, and has WALS family code and WALS lect code`.
+This property labels file should also be a KGTK format file. One example file is [here](https://drive.google.com/open?id=1F7pb4LEx5MT1YTqycUCQcs8H2OWmBbB6 "here") (accessible only to KGTK developers).
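To make the expected shape concrete, here is a minimal, hypothetical labels file (the real file linked above is far larger); each row maps a P node to its human-readable label:

```
node1 label node2
P1855 label Wikidata property example@en
P2302 label property constraint@en
```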
#### Dimensional Reduction Algorithm diff --git a/kgtk/gt/embedding_utils.py b/kgtk/gt/embedding_utils.py index 1e57491f2..429d37b47 100644 --- a/kgtk/gt/embedding_utils.py +++ b/kgtk/gt/embedding_utils.py @@ -411,6 +411,9 @@ def read_input(self, file_path: str, target_properties: dict, property_labels_di current_process_node_id = None if self._parallel_count > 1: + # need to set with spawn mode to initialize with multiple cuda in multiprocess + from multiprocessing import set_start_method + set_start_method('spawn') pp = ParallelProcessor(self._parallel_count, self._process_one, collector=self._multiprocess_collector) pp.start() @@ -428,9 +431,9 @@ def read_input(self, file_path: str, target_properties: dict, property_labels_di node_value = node_value[:node_value.index("@")] # remove extra double quote " and single quote ' - if node_value[0] == '"' and node_value[-1] == '"': + while node_value[0] == '"' and node_value[-1] == '"': node_value = node_value[1:-1] - if node_value[0] == "'" and node_value[-1] == "'": + while node_value[0] == "'" and node_value[-1] == "'": node_value = node_value[1:-1] if current_process_node_id != node_id: @@ -458,12 +461,17 @@ def read_input(self, file_path: str, target_properties: dict, property_labels_di if node_property in properties_reversed: roles = properties_reversed[node_property] if "property_values" in roles: + # for property values part, changed to be "{property} {value}" + node_value = self.get_real_label_name(node_property) + " " + self.get_real_label_name(node_value) + else: node_value = self.get_real_label_name(node_value) for each_role in roles: - if each_role != "property_values": + if each_role == "property_values" and "has_properties" not in roles: + each_node_attributes["has_properties"].append(node_value) + else: each_node_attributes[each_role].append(node_value) - if add_all_properties and each_line[column_references["value"]][0] == "P": - each_node_attributes["has_properties"].append(self.get_real_label_name(node_value)) + elif add_all_properties: # add remained properties if need all properties + each_node_attributes["has_properties"].append(self.get_real_label_name(node_property)) # close multiprocess pool if self._parallel_count > 1: From 3e87f3cb2d38b5290ef35b1ab03a898b7ab21218 Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Thu, 7 May 2020 13:27:50 -0700 Subject: [PATCH 112/278] Support kgtk ifexists and kgtk ifnotexists. --- kgtk/cli/ifexists.py | 132 +++++++++++++++++++++++++++++++++++++++ kgtk/cli/ifnotexists.py | 134 ++++++++++++++++++++++++++++++++++++++++ kgtk/join/ifexists.py | 41 ++++++++---- 3 files changed, 295 insertions(+), 12 deletions(-) create mode 100644 kgtk/cli/ifexists.py create mode 100644 kgtk/cli/ifnotexists.py diff --git a/kgtk/cli/ifexists.py b/kgtk/cli/ifexists.py new file mode 100644 index 000000000..fdff15e56 --- /dev/null +++ b/kgtk/cli/ifexists.py @@ -0,0 +1,132 @@ +"""Filter a KGTK file based on whether one or more records exist in a second +KGTK file with matching values for one or more fields. 
+""" + +from pathlib import Path +import sys +import typing + +from kgtk.join.enumnameaction import EnumNameAction +from kgtk.join.kgtkformat import KgtkFormat +from kgtk.join.ifexists import IfExists +from kgtk.join.kgtkreader import KgtkReader +from kgtk.join.kgtkwriter import KgtkWriter +from kgtk.join.kgtkvalueoptions import KgtkValueOptions +from kgtk.join.validationaction import ValidationAction + +def parser(): + return { + 'help': 'Filter a KGTK file based on whether one or more records exist in a second KGTK file with matching values for one or more fields.' + } + + +def add_arguments(parser): + """ + Parse arguments + Args: + parser (argparse.ArgumentParser) + """ + parser.add_argument( "input_kgtk_file", nargs="?", help="The KGTK file to filter ('left' file). May be omitted or '-' for stdin.", type=Path) + + parser.add_argument( "--filter-on", dest="filter_kgtk_file", help="The KGTK file to filter against ('right' file).", type=Path, required=True) + + parser.add_argument("-o", "--output-file", dest="output_kgtk_file", help="The KGTK file to write", type=Path, default=None) + + parser.add_argument( "--left-keys", dest="left_keys", help="The key columns in the file being filtered.", nargs='*') + + parser.add_argument( "--right-keys", dest="right_keys", help="The key columns in the filter-on file.", nargs='*') + + + # A subset of common arguments: + errors_to = parser.add_mutually_exclusive_group() + errors_to.add_argument( "--errors-to-stdout", dest="errors_to_stdout", + help="Send errors to stdout instead of stderr (default)", action="store_true") + errors_to.add_argument( "--errors-to-stderr", dest="errors_to_stderr", + help="Send errors to stderr instead of stdout", action="store_true") + + parser.add_argument( "--error-limit", dest="error_limit", + help="The maximum number of errors to report before failing", type=int, default=KgtkReader.ERROR_LIMIT_DEFAULT) + + parser.add_argument( "--field-separator", dest="field_separator", + help="Field separator.", type=str, default=IfExists.FIELD_SEPARATOR_DEFAULT) + + parser.add_argument("-v", "--verbose", dest="verbose", help="Print additional progress messages.", action='store_true') + + parser.add_argument( "--very-verbose", dest="very_verbose", help="Print additional progress messages.", action='store_true') + + + + # Note: Any arguments described by KgtkValueOptions.add_arguments(...) + # need to be included in the arguments to run(...), below. 
+ KgtkValueOptions.add_arguments(parser) + + +def run(input_kgtk_file: typing.Optional[Path], + filter_kgtk_file: Path, + output_kgtk_file: typing.Optional[Path], + left_keys: typing.Optional[typing.List[str]], + right_keys: typing.Optional[typing.List[str]], + + # Some common arguments: + errors_to_stdout: bool = False, + errors_to_stderr: bool = False, + error_limit: int = KgtkReader.ERROR_LIMIT_DEFAULT, + field_separator: str = IfExists.FIELD_SEPARATOR_DEFAULT, + verbose: bool = False, + very_verbose: bool = False, + + # Arguments from KgtkValueOptions: + additional_language_codes: typing.Optional[typing.List[str]] = None, + allow_language_suffixes: bool = False, + allow_lax_strings: bool = False, + allow_lax_lq_strings: bool = False, + allow_month_or_day_zero: bool = False, + repair_month_or_day_zero: bool = False, + minimum_valid_year: int = KgtkValueOptions.MINIMUM_VALID_YEAR, + maximum_valid_year: int = KgtkValueOptions.MAXIMUM_VALID_YEAR, + escape_list_separators: bool = False, + +)->int: + # import modules locally + from kgtk.exceptions import KGTKException + + + if input_kgtk_file is None: + input_kgtk_file = Path("-") + + # Select where to send error messages, defaulting to stderr. + # (Not used yet) + error_file: typing.TextIO = sys.stderr if errors_to_stderr else sys.stdout + + # Build the value parsing option structure. + value_options: KgtkValueOptions = KgtkValueOptions(allow_month_or_day_zero=allow_month_or_day_zero, + repair_month_or_day_zero=repair_month_or_day_zero, + allow_lax_strings=allow_lax_strings, + allow_lax_lq_strings=allow_lax_lq_strings, + allow_language_suffixes=allow_language_suffixes, + additional_language_codes=additional_language_codes, + minimum_valid_year=minimum_valid_year, + maximum_valid_year=maximum_valid_year, + escape_list_separators=escape_list_separators) + + try: + ie: IfExists = IfExists(left_file_path=input_kgtk_file, + right_file_path=filter_kgtk_file, + output_path=output_kgtk_file, + left_keys=left_keys, + right_keys=right_keys, + field_separator=field_separator, + value_options=value_options, + error_limit=error_limit, + verbose=verbose, + very_verbose=very_verbose) + + ie.process() + + return 0 + + except SystemExit as e: + raise KGTKException("Exit requested") + except Exception as e: + raise KGTKException(str(e)) + diff --git a/kgtk/cli/ifnotexists.py b/kgtk/cli/ifnotexists.py new file mode 100644 index 000000000..e9003a9ef --- /dev/null +++ b/kgtk/cli/ifnotexists.py @@ -0,0 +1,134 @@ +"""Filter a KGTK file based on whether one or more records do not exist in a +second KGTK file with matching values for one or more fields. + +""" + +from pathlib import Path +import sys +import typing + +from kgtk.join.enumnameaction import EnumNameAction +from kgtk.join.kgtkformat import KgtkFormat +from kgtk.join.ifexists import IfExists +from kgtk.join.kgtkreader import KgtkReader +from kgtk.join.kgtkwriter import KgtkWriter +from kgtk.join.kgtkvalueoptions import KgtkValueOptions +from kgtk.join.validationaction import ValidationAction + +def parser(): + return { + 'help': 'Filter a KGTK file based on whether one or more records do not exist in a second KGTK file with matching values for one or more fields.' + } + + +def add_arguments(parser): + """ + Parse arguments + Args: + parser (argparse.ArgumentParser) + """ + parser.add_argument( "input_kgtk_file", nargs="?", help="The KGTK file to filter ('left' file). 
May be omitted or '-' for stdin.", type=Path) + + parser.add_argument( "--filter-on", dest="filter_kgtk_file", help="The KGTK file to filter against ('right' file).", type=Path, required=True) + + parser.add_argument("-o", "--output-file", dest="output_kgtk_file", help="The KGTK file to write", type=Path, default=None) + + parser.add_argument( "--left-keys", dest="left_keys", help="The key columns in the file being filtered.", nargs='*') + + parser.add_argument( "--right-keys", dest="right_keys", help="The key columns in the filter-on file.", nargs='*') + + + # A subset of common arguments: + errors_to = parser.add_mutually_exclusive_group() + errors_to.add_argument( "--errors-to-stdout", dest="errors_to_stdout", + help="Send errors to stdout instead of stderr (default)", action="store_true") + errors_to.add_argument( "--errors-to-stderr", dest="errors_to_stderr", + help="Send errors to stderr instead of stdout", action="store_true") + + parser.add_argument( "--error-limit", dest="error_limit", + help="The maximum number of errors to report before failing", type=int, default=KgtkReader.ERROR_LIMIT_DEFAULT) + + parser.add_argument( "--field-separator", dest="field_separator", + help="Field separator.", type=str, default=IfExists.FIELD_SEPARATOR_DEFAULT) + + parser.add_argument("-v", "--verbose", dest="verbose", help="Print additional progress messages.", action='store_true') + + parser.add_argument( "--very-verbose", dest="very_verbose", help="Print additional progress messages.", action='store_true') + + + + # Note: Any arguments described by KgtkValueOptions.add_arguments(...) + # need to be included in the arguments to run(...), below. + KgtkValueOptions.add_arguments(parser) + + +def run(input_kgtk_file: typing.Optional[Path], + filter_kgtk_file: Path, + output_kgtk_file: typing.Optional[Path], + left_keys: typing.Optional[typing.List[str]], + right_keys: typing.Optional[typing.List[str]], + + # Some common arguments: + errors_to_stdout: bool = False, + errors_to_stderr: bool = False, + error_limit: int = KgtkReader.ERROR_LIMIT_DEFAULT, + field_separator: str = IfExists.FIELD_SEPARATOR_DEFAULT, + verbose: bool = False, + very_verbose: bool = False, + + # Arguments from KgtkValueOptions: + additional_language_codes: typing.Optional[typing.List[str]] = None, + allow_language_suffixes: bool = False, + allow_lax_strings: bool = False, + allow_lax_lq_strings: bool = False, + allow_month_or_day_zero: bool = False, + repair_month_or_day_zero: bool = False, + minimum_valid_year: int = KgtkValueOptions.MINIMUM_VALID_YEAR, + maximum_valid_year: int = KgtkValueOptions.MAXIMUM_VALID_YEAR, + escape_list_separators: bool = False, + +)->int: + # import modules locally + from kgtk.exceptions import KGTKException + + + if input_kgtk_file is None: + input_kgtk_file = Path("-") + + # Select where to send error messages, defaulting to stderr. + # (Not used yet) + error_file: typing.TextIO = sys.stderr if errors_to_stderr else sys.stdout + + # Build the value parsing option structure. 
+ value_options: KgtkValueOptions = KgtkValueOptions(allow_month_or_day_zero=allow_month_or_day_zero, + repair_month_or_day_zero=repair_month_or_day_zero, + allow_lax_strings=allow_lax_strings, + allow_lax_lq_strings=allow_lax_lq_strings, + allow_language_suffixes=allow_language_suffixes, + additional_language_codes=additional_language_codes, + minimum_valid_year=minimum_valid_year, + maximum_valid_year=maximum_valid_year, + escape_list_separators=escape_list_separators) + + try: + ie: IfExists = IfExists(left_file_path=input_kgtk_file, + right_file_path=filter_kgtk_file, + output_path=output_kgtk_file, + invert=True, + left_keys=left_keys, + right_keys=right_keys, + field_separator=field_separator, + value_options=value_options, + error_limit=error_limit, + verbose=verbose, + very_verbose=very_verbose) + + ie.process() + + return 0 + + except SystemExit as e: + raise KGTKException("Exit requested") + except Exception as e: + raise KGTKException(str(e)) + diff --git a/kgtk/join/ifexists.py b/kgtk/join/ifexists.py index c5ce50257..beea083b6 100644 --- a/kgtk/join/ifexists.py +++ b/kgtk/join/ifexists.py @@ -37,6 +37,8 @@ class IfExists(KgtkFormat): right_file_path: Path = attr.ib(validator=attr.validators.instance_of(Path)) output_path: typing.Optional[Path] = attr.ib(validator=attr.validators.optional(attr.validators.instance_of(Path))) + invert: bool = attr.ib(validator=attr.validators.instance_of(bool), default=False) + left_keys: typing.Optional[typing.List[str]] = attr.ib(validator=attr.validators.optional(attr.validators.deep_iterable(member_validator=attr.validators.instance_of(str), iterable_validator=attr.validators.instance_of(list))), default=None) @@ -113,8 +115,13 @@ def get_key_columns(self, supplied_keys: typing.Optional[typing.List[str]], kr: def build_key(self, row: typing.List[str], key_columns: typing.List[int])->str: key: str = "" idx: int + first: bool = True for idx in key_columns: - key += self.field_separator+ row[idx] + if first: + first = False + else: + key += self.field_separator + key += row[idx] return key def extract_key_set(self, kr: KgtkReader, who: str, key_columns: typing.List[int])->typing.Set[str]: @@ -162,8 +169,10 @@ def process(self): if self.verbose: print("Building the input key set from %s" % self.right_file_path, flush=True) key_set: typint.Set[str] = self.extract_key_set(right_kr, "right", right_key_columns) - if self.verbose: + if self.verbose or self.very_verbose: print("There are %d entries in the key set." % len(key_set)) + if self.very_verbose: + print("Keys: %s" % " ".join(key_set)) if self.verbose: print("Opening the output file: %s" % self.output_path, flush=True) @@ -185,23 +194,29 @@ def process(self): for row in left_kr: input_line_count += 1 left_key: str = self.build_key(row, left_key_columns) - if left_key in key_set: - ew.write(row) - output_line_count += 1 - ew.close() + if self.invert: + if left_key not in key_set: + ew.write(row) + output_line_count += 1 + else: + if left_key in key_set: + ew.write(row) + output_line_count += 1 if self.verbose: - print("Read %d records, write %d records." % (input_line_count, output_line_count)) + print("Read %d records, wrote %d records." % (input_line_count, output_line_count), flush=True) + ew.close() + def main(): """ Test the KGTK file joiner. 
""" parser = ArgumentParser() - parser.add_argument(dest="left_file_path", help="The left KGTK file to join", type=Path) + parser.add_argument(dest="left_kgtk_file", help="The left KGTK file to join", type=Path) - parser.add_argument(dest="right_file_path", help="The right KGTK file to join", type=Path) + parser.add_argument(dest="right_kgtk_file", help="The right KGTK file to join", type=Path) parser.add_argument( "--error-limit", dest="error_limit", help="The maximum number of errors to report before failing", type=int, default=KgtkReader.ERROR_LIMIT_DEFAULT) @@ -213,6 +228,8 @@ def main(): parser.add_argument( "--gzip-in-parallel", dest="gzip_in_parallel", help="Execute gzip in parallel.", action='store_true') + parser.add_argument( "--invert", dest="invert", help="Invert the test (if not exists).", action='store_true') + parser.add_argument( "--left-keys", dest="left_keys", help="The key columns in the left file.", nargs='*') parser.add_argument( "--long-line-action", dest="long_line_action", @@ -241,9 +258,10 @@ def main(): # Build the value parsing option structure. value_options: KgtkValueOptions = KgtkValueOptions.from_args(args) - ie: IfExists = IfExists(left_file_path=args.left_file_path, - right_file_path=args.right_file_path, + ie: IfExists = IfExists(left_file_path=args.left_kgtk_file, + right_file_path=args.right_kgtk_file, output_path=args.output_file_path, + invert=args.invert, left_keys=args.left_keys, right_keys=args.right_keys, field_separator=args.field_separator, @@ -261,4 +279,3 @@ def main(): if __name__ == "__main__": main() - From 33b2c91dc28707877702e39ea52c12baaf1e18a9 Mon Sep 17 00:00:00 2001 From: Rongpeng Date: Thu, 7 May 2020 14:33:38 -0700 Subject: [PATCH 113/278] added a readme --- kgtk/cli/generate_wikidata_triples.md | 114 ++++++++++++++++++++++++++ kgtk/cli/generate_wikidata_triples.py | 2 +- 2 files changed, 115 insertions(+), 1 deletion(-) create mode 100644 kgtk/cli/generate_wikidata_triples.md diff --git a/kgtk/cli/generate_wikidata_triples.md b/kgtk/cli/generate_wikidata_triples.md new file mode 100644 index 000000000..984db2541 --- /dev/null +++ b/kgtk/cli/generate_wikidata_triples.md @@ -0,0 +1,114 @@ +## The generate_wikidata_triples command converts a kgtk file to a ttl file that can be loaded into a wikidata Blazegraph. + +The triple generator take a tab-separated kgtk file from standard input. +``` +node1 property node2 id +Q2140726727_mag_author P6366 2140726727 id1 +Q2140726727_mag_author label Zunyou Wu@en id2 +Q2140726727_mag_author P1416 Q184490438_mag_affiliation id3 +Q184490438_mag_affiliation label Chinese Center For Disease Control And Prevention@en id4 +``` +to an rdf file like this. + +``` +rdfs:label "Zunyou Wu"@en ; +schema:name "Zunyou Wu"@en ; +skos:prefLabel "Zunyou Wu"@en ; +p:P1416 wds:Q2140726727_mag_author-abcdefg ; +p:P6366 wds:Q2140726727_mag_author-abcdefg ; +wdt:P1416 wd:Q184490438_mag_affiliation ; +wdt:P6366 "2140726727"^^xsd:string . + +``` + + +## Required Option + +- `--pf --property-types {path}`: path to the file which contains the property datatype mapping in kgtk format. + +## Optional Options + +- `-lp --label-property {str}`: property identifiers which will create labels, separated by comma','. Default to **label**. +- `-ap --alias-property {str}`: alias identifiers which will create labels, separated by comma','. Default to **aliases**. +- `-dp --description-property {str}`: description identifiers which will create labels, separated by comma','. Default to **descriptions**. 
- `-gt --generate-truthy {bool}`: whether to generate truthy triples. Default to **Yes**.
- `-ig --ignore {bool}`: if set to yes, ignore various kinds of exceptions and mistakes, logging them to a log file with the line number in the input file, rather than stopping. Default to **False**.
- `-n --output-n-lines {number}`: output triples approximately every {n} lines of reading stdin. Default to **1000**.
- `-gz --use-gz {bool}`: if set to yes, read from a compressed gz file. Default to **False**.
- `-sid --use-id {bool}`: if set to yes, the id in the edge will be used as the statement id when creating a statement or truthy statement. Default to **False**

## Shared Options

- `--debug`: run the command in debug mode.

### property-types

**--property-types** is the most important input file. It is also a kgtk file. Here is an example file `example_prop.tsv`:

```
node1 label node2
P493 property_type external-identifier
P494 property_type external-identifier
P495 property_type item
P496 property_type external-identifier
P497 property_type external-identifier
P498 property_type external-identifier
P500 property_type item
P501 property_type item
P502 property_type string
```
The header line is necessary. If property *P493* is used in the input kgtk file, then the edge `P493 property_type external-identifier` must exist in `example_prop.tsv` to tell the triple generator that the object of `P493` is an external-identifier. If `P495` is used in the input kgtk file, then the object of `P495` will be treated as an entity.

### label, aliases and descriptions

**-lp**, **-ap**, and **-dp** define how you want the triple generator to identify labels, descriptions, and aliases.

For example, if you have `-ap aliases,alias`, then when the following edges are met, both `Alice` and `Alicia` will be treated as aliases of the node `q2020`.

```
node1 property node2 id
q2020 aliases Alice@en id1
q2020 alias Alicia@sp id2
```

### truthy

If `-gt --generate-truthy` is set to `True`, the statements will be truthy. Truthy statements have an additional SPO triple with the property prefix `wdt`.

### ignore

`ignore` allows you to ignore various kinds of errors, which are written to the `ignore.log` file.

### n

`n` controls how many lines of standard input are read before triples are serialized. To achieve optimal performance, you can set `n` larger to reduce the overhead of creating knowledge graph objects and frequent serialization. However, a larger `n` also requires more memory.

### gz

Use a compressed file as input.

### use-id

If `--use-id` is set to true, the `id` column of the kgtk file will be used as the statement id if the corresponding edge is a statement edge. It is then the user's responsibility to make sure there are no duplicate statement ids across the whole knowledge graph.

## Usage

### Standard Usage

```bash
kgtk generate_wikidata_triples -pf example_prop.tsv < input_file.tsv > output_file.ttl
```

### Run in parallel

You can split the input file into several smaller pieces and run the command on them simultaneously.

Let's say you are in a directory which contains the `tsv` files. The following command will generate the `ttl` files with the same file names. 
diff --git a/kgtk/cli/generate_wikidata_triples.py b/kgtk/cli/generate_wikidata_triples.py
index cdc8b44ad..cc21f5c2a 100644
--- a/kgtk/cli/generate_wikidata_triples.py
+++ b/kgtk/cli/generate_wikidata_triples.py
@@ -89,7 +89,7 @@ def add_arguments(parser):
         type=str2bool,
         required = False,
         default="yes",
-        help="the default is to not generate truthy triples. Specify this option to generate truthy triples. NOTIMPLEMENTED",
+        help="the default is to not generate truthy triples. Specify this option to generate truthy triples.",
         dest="truthy",
     )
     parser.add_argument(

From c6348bf538bf13ca4a4e5dae049d2a914876c724 Mon Sep 17 00:00:00 2001
From: Craig Milo Rogers
Date: Thu, 7 May 2020 15:06:11 -0700
Subject: [PATCH 114/278] Put the value option arguments in one group.

---
 kgtk/join/kgtkvalueoptions.py | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/kgtk/join/kgtkvalueoptions.py b/kgtk/join/kgtkvalueoptions.py
index b061b92e3..fddda19e7 100644
--- a/kgtk/join/kgtkvalueoptions.py
+++ b/kgtk/join/kgtkvalueoptions.py
@@ -59,51 +59,52 @@ class KgtkValueOptions:

     @classmethod
     def add_arguments(cls, parser: ArgumentParser):
-        parser.add_argument( "--additional-language-codes", dest="additional_language_codes",
+        vgroup = parser.add_argument_group("KgtkValueOptions", "Options controlling the parsing and repair of KGTK data values.")
+        vgroup.add_argument( "--additional-language-codes", dest="additional_language_codes",
                              help="Additional language codes.", nargs="*", default=None)

-        lsgroup= parser.add_mutually_exclusive_group()
+        lsgroup= vgroup.add_mutually_exclusive_group()
         lsgroup.add_argument( "--allow-language-suffixes", dest="allow_language_suffixes",
                               help="Allow language identifier suffixes starting with a dash.", action='store_true', default=True)

         lsgroup.add_argument( "--disallow-language-suffixes", dest="allow_language_suffixes",
                               help="Disallow language identifier suffixes starting with a dash.", action='store_false')

-        laxgroup= parser.add_mutually_exclusive_group()
+        laxgroup= vgroup.add_mutually_exclusive_group()
         laxgroup.add_argument( "--allow-lax-strings", dest="allow_lax_strings",
                                help="Do not check if double quotes are backslashed inside strings.", action='store_true', default=False)

         laxgroup.add_argument( "--disallow-lax-strings", dest="allow_lax_strings",
                                help="Check if double quotes are backslashed inside strings.", action='store_false')

-        lqgroup= parser.add_mutually_exclusive_group()
+        lqgroup= vgroup.add_mutually_exclusive_group()
         lqgroup.add_argument( "--allow-lax-lq-strings", dest="allow_lax_lq_strings",
                               help="Do not check if single quotes are backslashed inside language qualified strings.", action='store_true', default=False)

         lqgroup.add_argument( "--disallow-lax-lq-strings", dest="allow_lax_lq_strings",
                               help="Check if single quotes are backslashed inside language qualified strings.", action='store_false')

-        amd0group= parser.add_mutually_exclusive_group()
+        amd0group= vgroup.add_mutually_exclusive_group()
         amd0group.add_argument( "--allow-month-or-day-zero", dest="allow_month_or_day_zero",
                                 help="Allow month or day zero in dates.", action='store_true', default=False)

         amd0group.add_argument( "--disallow-month-or-day-zero", dest="allow_month_or_day_zero",
                                 help="Allow month or day zero in dates.", action='store_false')

-        rmd0group= parser.add_mutually_exclusive_group()
+        rmd0group= 
vgroup.add_mutually_exclusive_group() rmd0group.add_argument( "--repair-month-or-day-zero", dest="repair_month_or_day_zero", help="Repair month or day zero in dates.", action='store_true', default=False) rmd0group.add_argument( "--no-repair-month-or-day-zero", dest="repair_month_or_day_zero", help="Do not repair month or day zero in dates.", action='store_false') - parser.add_argument( "--minimum-valid-year", dest="minimum_valid_year", + vgroup.add_argument( "--minimum-valid-year", dest="minimum_valid_year", help="The minimum valid year in dates.", type=int, default=cls.MINIMUM_VALID_YEAR) - parser.add_argument( "--maximum-valid-year", dest="maximum_valid_year", + vgroup.add_argument( "--maximum-valid-year", dest="maximum_valid_year", help="The maximum valid year in dates.", type=int, default=cls.MAXIMUM_VALID_YEAR) - elsgroup= parser.add_mutually_exclusive_group() + elsgroup= vgroup.add_mutually_exclusive_group() elsgroup.add_argument( "--escape-list-separators", dest="escape_list_separators", help="Escape all list separators instead of splitting on them.", action='store_true', default=False) From 521bb10c0bd01bfafa6c42f0e9d4b63304834b6a Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Thu, 7 May 2020 16:29:46 -0700 Subject: [PATCH 115/278] Provide a better organization for the arguments. --- kgtk/join/edgereader.py | 6 +- kgtk/join/kgtkreader.py | 106 +++++++++++++++++++--------------- kgtk/join/kgtkvalueoptions.py | 2 +- kgtk/join/nodereader.py | 6 +- 4 files changed, 67 insertions(+), 53 deletions(-) diff --git a/kgtk/join/edgereader.py b/kgtk/join/edgereader.py index d4d343148..3fe378fa0 100644 --- a/kgtk/join/edgereader.py +++ b/kgtk/join/edgereader.py @@ -149,7 +149,6 @@ def _skip_reserved_fields(self, column_name)->bool: @classmethod def add_arguments(cls, parser: ArgumentParser): - # super().add_arguments(parser) parser.add_argument( "--blank-node1-line-action", dest="blank_node1_line_action", help="The action to take when a blank node1 field is detected.", type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXCLUDE) @@ -163,8 +162,9 @@ def main(): Test the KGTK edge file reader. 
""" parser = ArgumentParser() - KgtkReader.add_shared_arguments(parser) - EdgeReader.add_arguments(parser) + KgtkReader.add_operation_arguments(parser) + (fgroup, hgroup, lgroup) = KgtkReader.add_shared_arguments(parser) + EdgeReader.add_arguments(lgroup) KgtkValueOptions.add_arguments(parser) args = parser.parse_args() diff --git a/kgtk/join/kgtkreader.py b/kgtk/join/kgtkreader.py index 1b2c321e8..b694e9e8e 100644 --- a/kgtk/join/kgtkreader.py +++ b/kgtk/join/kgtkreader.py @@ -16,7 +16,7 @@ """ -from argparse import ArgumentParser +from argparse import ArgumentParser, _ArgumentGroup import attr import bz2 from enum import Enum @@ -822,74 +822,87 @@ def merge_columns(self, additional_columns: typing.List[str])->typing.List[str]: return merged_columns @classmethod - def add_shared_arguments(cls, parser: ArgumentParser): - parser.add_argument(dest="kgtk_file", help="The KGTK file to read", type=Path, nargs="?") + def add_operation_arguments(cls, parser: ArgumentParser): + errors_to = parser.add_mutually_exclusive_group() + errors_to.add_argument( "--errors-to-stdout", dest="errors_to_stdout", + help="Send errors to stdout instead of stderr", action="store_true") + errors_to.add_argument( "--errors-to-stderr", dest="errors_to_stderr", + help="Send errors to stderr instead of stdout", action="store_true") - parser.add_argument( "--blank-required-field-line-action", dest="blank_line_action", - help="The action to take when a line with a blank node1, node2, or id field (per mode) is detected.", - type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXCLUDE) - - parser.add_argument( "--comment-line-action", dest="comment_line_action", - help="The action to take when a comment line is detected.", - type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXCLUDE) - - parser.add_argument( "--column-separator", dest="column_separator", - help="Column separator.", type=str, default=cls.COLUMN_SEPARATOR) + parser.add_argument("-v", "--verbose", dest="verbose", help="Print additional progress messages.", action='store_true') - parser.add_argument( "--compression-type", dest="compression_type", help="Specify the compression type.") + parser.add_argument( "--very-verbose", dest="very_verbose", help="Print additional progress messages.", action='store_true') + + @classmethod + def add_shared_arguments(cls, parser: ArgumentParser)->typing.Tuple[_ArgumentGroup, _ArgumentGroup, _ArgumentGroup]: + parser.add_argument(dest="kgtk_file", help="The KGTK file to read", type=Path, nargs="?") - parser.add_argument( "--empty-line-action", dest="empty_line_action", - help="The action to take when an empty line is detected.", - type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXCLUDE) + fgroup: _ArgumentGroup = parser.add_argument_group("File options", "Options affecting file processing") + fgroup.add_argument( "--column-separator", dest="column_separator", + help="Column separator.", type=str, default=cls.COLUMN_SEPARATOR) - parser.add_argument( "--errors-to-stdout", dest="errors_to_stdout", - help="Send errors to stdout instead of stderr", action="store_true") + fgroup.add_argument( "--compression-type", dest="compression_type", help="Specify the compression type.") - parser.add_argument( "--error-limit", dest="error_limit", + fgroup.add_argument( "--error-limit", dest="error_limit", help="The maximum number of errors to report before failing", type=int, default=cls.ERROR_LIMIT_DEFAULT) - parser.add_argument( "--fill-short-lines", dest="fill_short_lines", - 
help="Fill missing trailing columns in short lines with empty values.", action='store_true') + fgroup.add_argument( "--gzip-in-parallel", dest="gzip_in_parallel", help="Execute gzip in parallel.", action='store_true') - parser.add_argument( "--force-column-names", dest="force_column_names", help="Force the column names.", nargs='+') + fgroup.add_argument( "--gzip-queue-size", dest="gzip_queue_size", + help="Queue size for parallel gzip.", type=int, default=cls.GZIP_QUEUE_SIZE_DEFAULT) - parser.add_argument( "--gzip-in-parallel", dest="gzip_in_parallel", help="Execute gzip in parallel.", action='store_true') + hgroup: _ArgumentGroup = parser.add_argument_group("Header parsing", "Options affecting header parsing") - parser.add_argument( "--gzip-queue-size", dest="gzip_queue_size", - help="Queue size for parallel gzip.", type=int, default=cls.GZIP_QUEUE_SIZE_DEFAULT) + hgroup.add_argument( "--force-column-names", dest="force_column_names", help="Force the column names.", nargs='+') - parser.add_argument( "--header-error-action", dest="header_error_action", + hgroup.add_argument( "--header-error-action", dest="header_error_action", help="The action to take when a header error is detected Only ERROR or EXIT are supported.", type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXIT) - parser.add_argument( "--invalid-value-action", dest="invalid_value_action", - help="The action to take when a data cell value is invalid.", + hgroup.add_argument( "--skip-first-record", dest="skip_first_record", + help="Skip the first record when forcing column names.", action='store_true') + + hgroup.add_argument( "--unsafe-column-name-action", dest="unsafe_column_name_action", + help="The action to take when a column name is unsafe.", type=ValidationAction, action=EnumNameAction, default=ValidationAction.REPORT) - parser.add_argument( "--long-line-action", dest="long_line_action", - help="The action to take when a long line is detected.", - type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXCLUDE) + lgroup: _ArgumentGroup = parser.add_argument_group("Line parsing", "Options affecting data line parsing") - parser.add_argument( "--short-line-action", dest="short_line_action", - help="The action to take when a short line is detected.", + lgroup.add_argument( "--blank-required-field-line-action", dest="blank_line_action", + help="The action to take when a line with a blank node1, node2, or id field (per mode) is detected.", + type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXCLUDE) + + lgroup.add_argument( "--comment-line-action", dest="comment_line_action", + help="The action to take when a comment line is detected.", type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXCLUDE) - parser.add_argument( "--skip-first-record", dest="skip_first_record", help="Skip the first record when forcing column names.", action='store_true') + lgroup.add_argument( "--empty-line-action", dest="empty_line_action", + help="The action to take when an empty line is detected.", + type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXCLUDE) - parser.add_argument( "--truncate-long-lines", dest="truncate_long_lines", - help="Remove excess trailing columns in long lines.", action='store_true') + lgroup.add_argument( "--fill-short-lines", dest="fill_short_lines", + help="Fill missing trailing columns in short lines with empty values.", action='store_true') - parser.add_argument( "--unsafe-column-name-action", dest="unsafe_column_name_action", - 
help="The action to take when a column name is unsafe.", + lgroup.add_argument( "--invalid-value-action", dest="invalid_value_action", + help="The action to take when a data cell value is invalid.", type=ValidationAction, action=EnumNameAction, default=ValidationAction.REPORT) - parser.add_argument("-v", "--verbose", dest="verbose", help="Print additional progress messages.", action='store_true') + lgroup.add_argument( "--long-line-action", dest="long_line_action", + help="The action to take when a long line is detected.", + type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXCLUDE) - parser.add_argument( "--very-verbose", dest="very_verbose", help="Print additional progress messages.", action='store_true') + lgroup.add_argument( "--short-line-action", dest="short_line_action", + help="The action to take when a short line is detected.", + type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXCLUDE) - parser.add_argument( "--whitespace-line-action", dest="whitespace_line_action", + lgroup.add_argument( "--truncate-long-lines", dest="truncate_long_lines", + help="Remove excess trailing columns in long lines.", action='store_true') + + lgroup.add_argument( "--whitespace-line-action", dest="whitespace_line_action", help="The action to take when a whitespace line is detected.", type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXCLUDE) + + return (fgroup, hgroup, lgroup) # May be overridden @classmethod @@ -909,10 +922,11 @@ def main(): from kgtk.join.nodereader import NodeReader parser = ArgumentParser() - KgtkReader.add_shared_arguments(parser) - KgtkReader.add_arguments(parser) - EdgeReader.add_arguments(parser) - NodeReader.add_arguments(parser) + KgtkReader.add_operation_arguments(parser) + (fgroup, hgroup, lgroup) = KgtkReader.add_shared_arguments(parser) + KgtkReader.add_arguments(fgroup) + EdgeReader.add_arguments(lgroup) + NodeReader.add_arguments(lgroup) KgtkValueOptions.add_arguments(parser) parser.add_argument( "--test", dest="test_method", help="The test to perform", diff --git a/kgtk/join/kgtkvalueoptions.py b/kgtk/join/kgtkvalueoptions.py index fddda19e7..767f2d199 100644 --- a/kgtk/join/kgtkvalueoptions.py +++ b/kgtk/join/kgtkvalueoptions.py @@ -59,7 +59,7 @@ class KgtkValueOptions: @classmethod def add_arguments(cls, parser: ArgumentParser): - vgroup = parser.add_argument_group("KgtkValueOptions", "Options controlling the parsing and repair of KGTK data values.") + vgroup = parser.add_argument_group("Data value parsing", "Options controlling the parsing and processing of KGTK data values.") vgroup.add_argument( "--additional-language-codes", dest="additional_language_codes", help="Additional language codes.", nargs="*", default=None) diff --git a/kgtk/join/nodereader.py b/kgtk/join/nodereader.py index 4f4189a6f..9fb88d260 100644 --- a/kgtk/join/nodereader.py +++ b/kgtk/join/nodereader.py @@ -130,7 +130,6 @@ def _skip_reserved_fields(self, column_name)->bool: @classmethod def add_arguments(cls, parser: ArgumentParser): - # super().add_arguments(parser) parser.add_argument( "--blank-id-line-action", dest="blank_id_line_action", help="The action to take when a blank id field is detected.", type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXCLUDE) @@ -141,8 +140,9 @@ def main(): Test the KGTK node file reader. 
""" parser = ArgumentParser() - KgtkReader.add_shared_arguments(parser) - NodeReader.add_arguments(parser) + KgtkReader.add_operation_arguments(parser) + (fgroup, hgroup, lgroup) = KgtkReader.add_shared_arguments(parser) + NodeReader.add_arguments(lgroup) KgtkValueOptions.add_arguments(parser) args = parser.parse_args() From 80f2f94277c5749acd4005022c0e5676bdb4adfd Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Thu, 7 May 2020 16:57:54 -0700 Subject: [PATCH 116/278] Semicolons are frequently in column names, disable the check. --- kgtk/join/kgtkbase.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kgtk/join/kgtkbase.py b/kgtk/join/kgtkbase.py index 7f6ed4afd..2299725d2 100644 --- a/kgtk/join/kgtkbase.py +++ b/kgtk/join/kgtkbase.py @@ -75,7 +75,7 @@ def check_column_name(cls, # 1) except inside "" and '' quoted strings # 4) Check for commas # 5) Check for vertical bars - # 6) Check for semicolons + # 6) Check for semicolons (disabled) # # TODO: It might be possible to make some of these checks more efficient. results: typing.List[str] = [ ] @@ -90,8 +90,8 @@ def check_column_name(cls, results.append("Warning: Column name '%s' contains a comma (,)" % column_name) if "|" in column_name: results.append("Warning: Column name '%s' contains a vertical bar (|)" % column_name) - if ";" in column_name: - results.append("Warning: Column name '%s' contains a semicolon (;)" % column_name) + # if ";" in column_name: + # results.append("Warning: Column name '%s' contains a semicolon (;)" % column_name) kv: KgtkValue = KgtkValue(column_name) if not kv.is_valid(): results.append(kv.describe()) From a46eda01a01cabba1b2c4acce789f9c382107e39 Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Thu, 7 May 2020 16:58:33 -0700 Subject: [PATCH 117/278] Plumb control over value checking into ifexists. 
---
 kgtk/cli/ifexists.py  | 1 +
 kgtk/join/ifexists.py | 8 ++++++++
 2 files changed, 9 insertions(+)

diff --git a/kgtk/cli/ifexists.py b/kgtk/cli/ifexists.py
index fdff15e56..a1e288b25 100644
--- a/kgtk/cli/ifexists.py
+++ b/kgtk/cli/ifexists.py
@@ -116,6 +116,7 @@ def run(input_kgtk_file: typing.Optional[Path],
                        left_keys=left_keys,
                        right_keys=right_keys,
                        field_separator=field_separator,
+                       invalid_value_action=ValidationAction.PASS,
                        value_options=value_options,
                        error_limit=error_limit,
                        verbose=verbose,
diff --git a/kgtk/join/ifexists.py b/kgtk/join/ifexists.py
index beea083b6..0cab4735b 100644
--- a/kgtk/join/ifexists.py
+++ b/kgtk/join/ifexists.py
@@ -57,6 +57,7 @@ class IfExists(KgtkFormat):
     fill_short_lines: bool = attr.ib(validator=attr.validators.instance_of(bool), default=False)
     truncate_long_lines: bool = attr.ib(validator=attr.validators.instance_of(bool), default=False)

+    invalid_value_action: ValidationAction = attr.ib(validator=attr.validators.instance_of(ValidationAction), default=ValidationAction.PASS)
     # TODO: find a working validator
     # value_options: typing.Optional[KgtkValueOptions] = attr.ib(attr.validators.optional(attr.validators.instance_of(KgtkValueOptions)), default=None)
     value_options: typing.Optional[KgtkValueOptions] = attr.ib(default=None)
@@ -140,6 +141,7 @@ def process(self):
                                               long_line_action=self.long_line_action,
                                               fill_short_lines=self.fill_short_lines,
                                               truncate_long_lines=self.truncate_long_lines,
+                                              invalid_value_action=self.invalid_value_action,
                                               value_options = self.value_options,
                                               error_limit=self.error_limit,
                                               verbose=self.verbose,
@@ -153,6 +155,7 @@
                                               long_line_action=self.long_line_action,
                                               fill_short_lines=self.fill_short_lines,
                                               truncate_long_lines=self.truncate_long_lines,
+                                              invalid_value_action=self.invalid_value_action,
                                               value_options = self.value_options,
                                               error_limit=self.error_limit,
                                               verbose=self.verbose,
@@ -228,6 +231,10 @@ def main():
     parser.add_argument( "--gzip-in-parallel", dest="gzip_in_parallel", help="Execute gzip in parallel.", action='store_true')

+    parser.add_argument( "--invalid-value-action", dest="invalid_value_action",
+                         help="The action to take when an invalid data value is detected.",
+                         type=ValidationAction, action=EnumNameAction, default=ValidationAction.PASS)
+
     parser.add_argument( "--invert", dest="invert", help="Invert the test (if not exists).", action='store_true')

     parser.add_argument( "--left-keys", dest="left_keys", help="The key columns in the left file.", nargs='*')
@@ -269,6 +276,7 @@ def main():
                             long_line_action=args.long_line_action,
                             fill_short_lines=args.fill_short_lines,
                             truncate_long_lines=args.truncate_long_lines,
+                            invalid_value_action=args.invalid_value_action,
                             value_options=value_options,
                             gzip_in_parallel=args.gzip_in_parallel,
                             error_limit=args.error_limit,

From 8b697e46b65b46ecb22c1b3032e3d00255a6fd84 Mon Sep 17 00:00:00 2001
From: Craig Milo Rogers
Date: Thu, 7 May 2020 19:51:45 -0700
Subject: [PATCH 118/278] Add option prefixes. Share options differently.
--- kgtk/join/edgereader.py | 13 +-- kgtk/join/kgtkreader.py | 183 ++++++++++++++++++++++++---------------- kgtk/join/nodereader.py | 10 +-- 3 files changed, 112 insertions(+), 94 deletions(-) diff --git a/kgtk/join/edgereader.py b/kgtk/join/edgereader.py index 3fe378fa0..93c25799b 100644 --- a/kgtk/join/edgereader.py +++ b/kgtk/join/edgereader.py @@ -147,24 +147,13 @@ def _skip_reserved_fields(self, column_name)->bool: return True return False - @classmethod - def add_arguments(cls, parser: ArgumentParser): - parser.add_argument( "--blank-node1-line-action", dest="blank_node1_line_action", - help="The action to take when a blank node1 field is detected.", - type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXCLUDE) - - parser.add_argument( "--blank-node2-line-action", dest="blank_node2_line_action", - help="The action to take when a blank node2 field is detected.", - type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXCLUDE) - def main(): """ Test the KGTK edge file reader. """ parser = ArgumentParser() KgtkReader.add_operation_arguments(parser) - (fgroup, hgroup, lgroup) = KgtkReader.add_shared_arguments(parser) - EdgeReader.add_arguments(lgroup) + KgtkReader.add_arguments(parser, edge_options=True) KgtkValueOptions.add_arguments(parser) args = parser.parse_args() diff --git a/kgtk/join/kgtkreader.py b/kgtk/join/kgtkreader.py index b694e9e8e..d79d20c9a 100644 --- a/kgtk/join/kgtkreader.py +++ b/kgtk/join/kgtkreader.py @@ -834,83 +834,123 @@ def add_operation_arguments(cls, parser: ArgumentParser): parser.add_argument( "--very-verbose", dest="very_verbose", help="Print additional progress messages.", action='store_true') @classmethod - def add_shared_arguments(cls, parser: ArgumentParser)->typing.Tuple[_ArgumentGroup, _ArgumentGroup, _ArgumentGroup]: - parser.add_argument(dest="kgtk_file", help="The KGTK file to read", type=Path, nargs="?") - - fgroup: _ArgumentGroup = parser.add_argument_group("File options", "Options affecting file processing") - fgroup.add_argument( "--column-separator", dest="column_separator", - help="Column separator.", type=str, default=cls.COLUMN_SEPARATOR) - - fgroup.add_argument( "--compression-type", dest="compression_type", help="Specify the compression type.") - - fgroup.add_argument( "--error-limit", dest="error_limit", - help="The maximum number of errors to report before failing", type=int, default=cls.ERROR_LIMIT_DEFAULT) - - fgroup.add_argument( "--gzip-in-parallel", dest="gzip_in_parallel", help="Execute gzip in parallel.", action='store_true') - - fgroup.add_argument( "--gzip-queue-size", dest="gzip_queue_size", - help="Queue size for parallel gzip.", type=int, default=cls.GZIP_QUEUE_SIZE_DEFAULT) - - hgroup: _ArgumentGroup = parser.add_argument_group("Header parsing", "Options affecting header parsing") + def add_arguments(cls, + parser: ArgumentParser, + node_options: bool = False, + edge_options: bool = False, + mode_options: bool = False, + who: str = ""): + prefix1: str = "--" if len(who) == 0 else "--" + who + "-" + prefix2: str = "" if len(who) == 0 else who + "_" + prefix3: str = "" if len(who) == 0 else who + " " + + parser.add_argument(dest=prefix2 + "kgtk_file", help="The KGTK file to read", type=Path, nargs="?") + + fgroup: _ArgumentGroup = parser.add_argument_group(prefix3 + "File options", + "Options affecting " + prefix3 + "processing") + fgroup.add_argument(prefix1 + "column-separator", + dest=prefix2 + "column_separator", + help="Column separator.", type=str, default=cls.COLUMN_SEPARATOR) + + 
fgroup.add_argument(prefix1 + "compression-type", + dest=prefix2 + "compression_type", help="Specify the compression type.") + + fgroup.add_argument(prefix1 + "error-limit", + dest=prefix2 + "error_limit", + help="The maximum number of errors to report before failing", type=int, default=cls.ERROR_LIMIT_DEFAULT) + + fgroup.add_argument(prefix1 + "gzip-in-parallel", + dest=prefix2 + "gzip_in_parallel", help="Execute gzip in parallel.", action='store_true') + + fgroup.add_argument(prefix1 + "gzip-queue-size", + dest=prefix2 + "gzip_queue_size", + help="Queue size for parallel gzip.", type=int, default=cls.GZIP_QUEUE_SIZE_DEFAULT) + + if mode_options: + fgroup.add_argument(prefix1 + "mode", + dest=prefix2 + "mode", + help="Determine the KGTK file mode.", + type=KgtkReader.Mode, action=EnumNameAction, default=KgtkReader.Mode.AUTO) + + hgroup: _ArgumentGroup = parser.add_argument_group(prefix3 + "Header parsing", "Options affecting header parsing") - hgroup.add_argument( "--force-column-names", dest="force_column_names", help="Force the column names.", nargs='+') + hgroup.add_argument(prefix1 + "force-column-names", + dest=prefix2 + "force_column_names", help="Force the column names.", nargs='+') - hgroup.add_argument( "--header-error-action", dest="header_error_action", - help="The action to take when a header error is detected Only ERROR or EXIT are supported.", - type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXIT) + hgroup.add_argument(prefix1 + "header-error-action", + dest=prefix2 + "header_error_action", + help="The action to take when a header error is detected Only ERROR or EXIT are supported.", + type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXIT) - hgroup.add_argument( "--skip-first-record", dest="skip_first_record", - help="Skip the first record when forcing column names.", action='store_true') + hgroup.add_argument(prefix1 + "skip-first-record", + dest=prefix2 + "skip_first_record", + help="Skip the first record when forcing column names.", action='store_true') - hgroup.add_argument( "--unsafe-column-name-action", dest="unsafe_column_name_action", - help="The action to take when a column name is unsafe.", - type=ValidationAction, action=EnumNameAction, default=ValidationAction.REPORT) + hgroup.add_argument(prefix1 + "unsafe-column-name-action", + dest=prefix2 + "unsafe_column_name_action", + help="The action to take when a column name is unsafe.", + type=ValidationAction, action=EnumNameAction, default=ValidationAction.REPORT) lgroup: _ArgumentGroup = parser.add_argument_group("Line parsing", "Options affecting data line parsing") - lgroup.add_argument( "--blank-required-field-line-action", dest="blank_line_action", - help="The action to take when a line with a blank node1, node2, or id field (per mode) is detected.", - type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXCLUDE) - - lgroup.add_argument( "--comment-line-action", dest="comment_line_action", - help="The action to take when a comment line is detected.", - type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXCLUDE) - - lgroup.add_argument( "--empty-line-action", dest="empty_line_action", - help="The action to take when an empty line is detected.", - type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXCLUDE) - - lgroup.add_argument( "--fill-short-lines", dest="fill_short_lines", - help="Fill missing trailing columns in short lines with empty values.", action='store_true') - - lgroup.add_argument( 
"--invalid-value-action", dest="invalid_value_action", - help="The action to take when a data cell value is invalid.", - type=ValidationAction, action=EnumNameAction, default=ValidationAction.REPORT) - - lgroup.add_argument( "--long-line-action", dest="long_line_action", - help="The action to take when a long line is detected.", - type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXCLUDE) - - lgroup.add_argument( "--short-line-action", dest="short_line_action", - help="The action to take when a short line is detected.", - type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXCLUDE) - - lgroup.add_argument( "--truncate-long-lines", dest="truncate_long_lines", - help="Remove excess trailing columns in long lines.", action='store_true') - - lgroup.add_argument( "--whitespace-line-action", dest="whitespace_line_action", - help="The action to take when a whitespace line is detected.", - type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXCLUDE) - - return (fgroup, hgroup, lgroup) + if node_options: + lgroup.add_argument(prefix1 + "blank-id-line-action", + dest=prefix2 + "blank_id_line_action", + help="The action to take when a blank id field is detected.", + type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXCLUDE) + + if edge_options: + lgroup.add_argument(prefix1 + "blank-node1-line-action", + dest=prefix2 + "blank_node1_line_action", + help="The action to take when a blank node1 field is detected.", + type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXCLUDE) + + lgroup.add_argument(prefix1 + "blank-node2-line-action", + dest=prefix2 + "blank_node2_line_action", + help="The action to take when a blank node2 field is detected.", + type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXCLUDE) + lgroup.add_argument(prefix1 + "blank-required-field-line-action", + dest=prefix2 + "blank_line_action", + help="The action to take when a line with a blank node1, node2, or id field (per mode) is detected.", + type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXCLUDE) - # May be overridden - @classmethod - def add_arguments(cls, parser: ArgumentParser): - parser.add_argument( "--mode", dest="mode", - help="Determine the KGTK file mode.", type=KgtkReader.Mode, action=EnumNameAction, default=KgtkReader.Mode.AUTO) - - + lgroup.add_argument(prefix1 + "comment-line-action", + dest=prefix2 + "comment_line_action", + help="The action to take when a comment line is detected.", + type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXCLUDE) + + lgroup.add_argument(prefix1 + "empty-line-action", + dest=prefix2 + "empty_line_action", + help="The action to take when an empty line is detected.", + type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXCLUDE) + + lgroup.add_argument(prefix1 + "fill-short-lines", + dest=prefix2 + "fill_short_lines", + help="Fill missing trailing columns in short lines with empty values.", action='store_true') + + lgroup.add_argument(prefix1 + "invalid-value-action", + dest=prefix2 + "invalid_value_action", + help="The action to take when a data cell value is invalid.", + type=ValidationAction, action=EnumNameAction, default=ValidationAction.REPORT) + + lgroup.add_argument(prefix1 + "long-line-action", + dest=prefix2 + "long_line_action", + help="The action to take when a long line is detected.", + type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXCLUDE) + + lgroup.add_argument(prefix1 + 
"short-line-action", + dest=prefix2 + "short_line_action", + help="The action to take when a short line is detected.", + type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXCLUDE) + + lgroup.add_argument(prefix1 + "truncate-long-lines", + dest=prefix2 + "truncate_long_lines", + help="Remove excess trailing columns in long lines.", action='store_true') + + lgroup.add_argument(prefix1 + "whitespace-line-action", + dest=prefix2 + "whitespace_line_action", + help="The action to take when a whitespace line is detected.", + type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXCLUDE) def main(): """ @@ -923,10 +963,7 @@ def main(): parser = ArgumentParser() KgtkReader.add_operation_arguments(parser) - (fgroup, hgroup, lgroup) = KgtkReader.add_shared_arguments(parser) - KgtkReader.add_arguments(fgroup) - EdgeReader.add_arguments(lgroup) - NodeReader.add_arguments(lgroup) + KgtkReader.add_arguments(parser, node_options=True, edge_options=True, mode_options=True) KgtkValueOptions.add_arguments(parser) parser.add_argument( "--test", dest="test_method", help="The test to perform", diff --git a/kgtk/join/nodereader.py b/kgtk/join/nodereader.py index 9fb88d260..3a804398a 100644 --- a/kgtk/join/nodereader.py +++ b/kgtk/join/nodereader.py @@ -128,21 +128,13 @@ def _skip_reserved_fields(self, column_name)->bool: return True return False - @classmethod - def add_arguments(cls, parser: ArgumentParser): - parser.add_argument( "--blank-id-line-action", dest="blank_id_line_action", - help="The action to take when a blank id field is detected.", - type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXCLUDE) - - def main(): """ Test the KGTK node file reader. """ parser = ArgumentParser() KgtkReader.add_operation_arguments(parser) - (fgroup, hgroup, lgroup) = KgtkReader.add_shared_arguments(parser) - NodeReader.add_arguments(lgroup) + KgtkReader.add_arguments(parser, node_options=True) KgtkValueOptions.add_arguments(parser) args = parser.parse_args() From d89e32b5fe8966aa353dd65187a15bb7cfcf889e Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Thu, 7 May 2020 19:54:12 -0700 Subject: [PATCH 119/278] Make the name more descriptive. 
--- kgtk/join/edgereader.py | 2 +- kgtk/join/kgtkreader.py | 14 +++++++------- kgtk/join/nodereader.py | 2 +- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/kgtk/join/edgereader.py b/kgtk/join/edgereader.py index 93c25799b..b382c11c8 100644 --- a/kgtk/join/edgereader.py +++ b/kgtk/join/edgereader.py @@ -153,7 +153,7 @@ def main(): """ parser = ArgumentParser() KgtkReader.add_operation_arguments(parser) - KgtkReader.add_arguments(parser, edge_options=True) + KgtkReader.add_file_arguments(parser, edge_options=True) KgtkValueOptions.add_arguments(parser) args = parser.parse_args() diff --git a/kgtk/join/kgtkreader.py b/kgtk/join/kgtkreader.py index d79d20c9a..43d251ca0 100644 --- a/kgtk/join/kgtkreader.py +++ b/kgtk/join/kgtkreader.py @@ -834,12 +834,12 @@ def add_operation_arguments(cls, parser: ArgumentParser): parser.add_argument( "--very-verbose", dest="very_verbose", help="Print additional progress messages.", action='store_true') @classmethod - def add_arguments(cls, - parser: ArgumentParser, - node_options: bool = False, - edge_options: bool = False, - mode_options: bool = False, - who: str = ""): + def add_file_arguments(cls, + parser: ArgumentParser, + node_options: bool = False, + edge_options: bool = False, + mode_options: bool = False, + who: str = ""): prefix1: str = "--" if len(who) == 0 else "--" + who + "-" prefix2: str = "" if len(who) == 0 else who + "_" prefix3: str = "" if len(who) == 0 else who + " " @@ -963,7 +963,7 @@ def main(): parser = ArgumentParser() KgtkReader.add_operation_arguments(parser) - KgtkReader.add_arguments(parser, node_options=True, edge_options=True, mode_options=True) + KgtkReader.add_file_arguments(parser, node_options=True, edge_options=True, mode_options=True) KgtkValueOptions.add_arguments(parser) parser.add_argument( "--test", dest="test_method", help="The test to perform", diff --git a/kgtk/join/nodereader.py b/kgtk/join/nodereader.py index 3a804398a..7291089ee 100644 --- a/kgtk/join/nodereader.py +++ b/kgtk/join/nodereader.py @@ -134,7 +134,7 @@ def main(): """ parser = ArgumentParser() KgtkReader.add_operation_arguments(parser) - KgtkReader.add_arguments(parser, node_options=True) + KgtkReader.add_file_arguments(parser, node_options=True) KgtkValueOptions.add_arguments(parser) args = parser.parse_args() From 7b2f7b47c85cd0a750a80c897042c85ba77c15a7 Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Fri, 8 May 2020 10:36:51 -0700 Subject: [PATCH 120/278] Incomplete changes. 
--- kgtk/join/ifexists.py | 106 ++++++++++++---------------------------- kgtk/join/kgtkreader.py | 6 ++- 2 files changed, 36 insertions(+), 76 deletions(-) diff --git a/kgtk/join/ifexists.py b/kgtk/join/ifexists.py index 0cab4735b..00ebc9ce5 100644 --- a/kgtk/join/ifexists.py +++ b/kgtk/join/ifexists.py @@ -16,7 +16,7 @@ """ -from argparse import ArgumentParser +from argparse import ArgumentParser, Namespace import attr import gzip from pathlib import Path @@ -33,39 +33,25 @@ @attr.s(slots=True, frozen=True) class IfExists(KgtkFormat): - left_file_path: Path = attr.ib(validator=attr.validators.instance_of(Path)) - right_file_path: Path = attr.ib(validator=attr.validators.instance_of(Path)) - output_path: typing.Optional[Path] = attr.ib(validator=attr.validators.optional(attr.validators.instance_of(Path))) + input_reader_args: typing.Mapping[str, typing.Any] = attr.ib() + input_keys: typing.Optional[typing.List[str]] = attr.ib(validator=attr.validators.optional(attr.validators.deep_iterable(member_validator=attr.validators.instance_of(str), + iterable_validator=attr.validators.instance_of(list)))) - invert: bool = attr.ib(validator=attr.validators.instance_of(bool), default=False) + filter_reader_args: typing.Mapping[str, typing.Any] = attr.ib() + filter_keys: typing.Optional[typing.List[str]] = attr.ib(validator=attr.validators.optional(attr.validators.deep_iterable(member_validator=attr.validators.instance_of(str), + iterable_validator=attr.validators.instance_of(list)))) - left_keys: typing.Optional[typing.List[str]] = attr.ib(validator=attr.validators.optional(attr.validators.deep_iterable(member_validator=attr.validators.instance_of(str), - iterable_validator=attr.validators.instance_of(list))), - default=None) - right_keys: typing.Optional[typing.List[str]] = attr.ib(validator=attr.validators.optional(attr.validators.deep_iterable(member_validator=attr.validators.instance_of(str), - iterable_validator=attr.validators.instance_of(list))), - default=None) + output_file_path: typing.Optional[Path] = attr.ib(validator=attr.validators.optional(attr.validators.instance_of(Path))) # The field separator used in multifield joins. The KGHT list character should be safe. field_separator: str = attr.ib(validator=attr.validators.instance_of(str), default=KgtkFormat.LIST_SEPARATOR) - # Ignore records with too many or too few fields? - short_line_action: ValidationAction = attr.ib(validator=attr.validators.instance_of(ValidationAction), default=ValidationAction.EXCLUDE) - long_line_action: ValidationAction = attr.ib(validator=attr.validators.instance_of(ValidationAction), default=ValidationAction.EXCLUDE) - - # Require or fill trailing fields? 
-    fill_short_lines: bool = attr.ib(validator=attr.validators.instance_of(bool), default=False)
-    truncate_long_lines: bool = attr.ib(validator=attr.validators.instance_of(bool), default=False)
+    invert: bool = attr.ib(validator=attr.validators.instance_of(bool), default=False)

-    invalid_value_action: ValidationAction = attr.ib(validator=attr.validators.instance_of(ValidationAction), default=ValidationAction.PASS)
     # TODO: find a working validator
     # value_options: typing.Optional[KgtkValueOptions] = attr.ib(attr.validators.optional(attr.validators.instance_of(KgtkValueOptions)), default=None)
     value_options: typing.Optional[KgtkValueOptions] = attr.ib(default=None)

-    gzip_in_parallel: bool = attr.ib(validator=attr.validators.instance_of(bool), default=False)
-
-    error_limit: int = attr.ib(validator=attr.validators.instance_of(int), default=KgtkReader.ERROR_LIMIT_DEFAULT)
-
     verbose: bool = attr.ib(validator=attr.validators.instance_of(bool), default=False)
     very_verbose: bool = attr.ib(validator=attr.validators.instance_of(bool), default=False)
@@ -135,8 +121,8 @@ def extract_key_set(self, kr: KgtkReader, who: str, key_columns: typing.List[int]
     def process(self):
         # Open the input files once.
         if self.verbose:
-            print("Opening the left input file: %s" % self.left_file_path, flush=True)
-        left_kr: KgtkReader = KgtkReader.open(self.left_file_path,
+            print("Opening the input file: %s" % self.left_file_path, flush=True)
+        left_kr: KgtkReader = KgtkReader.open(self.left_file_path,
                                               short_line_action=self.short_line_action,
                                               long_line_action=self.long_line_action,
                                               fill_short_lines=self.fill_short_lines,
                                               truncate_long_lines=self.truncate_long_lines,
@@ -215,71 +201,41 @@ def main():
     """
     Test the KGTK file joiner.
     """
-    parser = ArgumentParser()
-
-    parser.add_argument(dest="left_kgtk_file", help="The left KGTK file to join", type=Path)
-
-    parser.add_argument(dest="right_kgtk_file", help="The right KGTK file to join", type=Path)
-
-    parser.add_argument( "--error-limit", dest="error_limit",
-                         help="The maximum number of errors to report before failing", type=int, default=KgtkReader.ERROR_LIMIT_DEFAULT)
+    parser: ArgumentParser = ArgumentParser()
+    KgtkReader.add_operation_arguments(parser)
+
+    parser.add_argument("-o", "--output-file", dest="output_file_path", help="The KGTK file to write", type=Path, default=None)
+
+    parser.add_argument( "--field-separator", dest="field_separator",
+                         help="Separator for multifield keys", default=IfExists.FIELD_SEPARATOR_DEFAULT)
-
-    parser.add_argument( "--fill-short-lines", dest="fill_short_lines",
-                         help="Fill missing trailing columns in short lines with empty values.", action='store_true')
-
-    parser.add_argument( "--gzip-in-parallel", dest="gzip_in_parallel", help="Execute gzip in parallel.", action='store_true')
-
-    parser.add_argument( "--invalid-value-action", dest="invalid_value_action",
-                         help="The action to take when an invalid data value is detected.",
-                         type=ValidationAction, action=EnumNameAction, default=ValidationAction.PASS)
-
+    parser.add_argument( "--invert", dest="invert", help="Invert the test (if not exists).", action='store_true')

-    parser.add_argument( "--left-keys", dest="left_keys", help="The key columns in the left file.", nargs='*')
-
-    parser.add_argument( "--long-line-action", dest="long_line_action",
-                         help="The action to take when a long line is detected.",
-                         type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXCLUDE)
+    parser.add_argument( "--input-keys", dest="_input_keys", help="The key columns in the input file.", nargs='*')
+    parser.add_argument( "--filter-keys", 
dest="_filter_keys", help="The key columns in the filter file.", nargs='*') - parser.add_argument("-o", "--output-file", dest="output_file_path", help="The KGTK file to read", type=Path, default=None) - - parser.add_argument( "--right-keys", dest="right_keys", help="The key columns in the right file.", nargs='*') + KgtkReader.add_file_arguments(parser, mode_options=True, who="input") - parser.add_argument( "--short-line-action", dest="short_line_action", - help="The action to take whe a short line is detected.", - type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXCLUDE) - - parser.add_argument( "--truncate-long-lines", dest="truncate_long_lines", - help="Remove excess trailing columns in long lines.", action='store_true') - - parser.add_argument("-v", "--verbose", dest="verbose", help="Print additional progress messages.", action='store_true') - - parser.add_argument( "--very-verbose", dest="very_verbose", help="Print additional progress messages.", action='store_true') + # TODO: Find a way to use "--filter-on" + KgtkReader.add_file_arguments(parser, mode_options=True, who="filter", optional_file=True) KgtkValueOptions.add_arguments(parser) - args = parser.parse_args() + args: Namespace = parser.parse_args() + + input_args: typing.Mapping[str, typing.Any] = dict(((item[0][len("input_"):], item[1]) for item in vars(args) if item[0].startswith("input_"))) + filter_args: typing.Mapping[str, typing.Any] = dict(((item[0][len("filter_"):], item[1]) for item in vars(args) if item[0].startswith("filter_"))) # Build the value parsing option structure. value_options: KgtkValueOptions = KgtkValueOptions.from_args(args) - ie: IfExists = IfExists(left_file_path=args.left_kgtk_file, - right_file_path=args.right_kgtk_file, - output_path=args.output_file_path, - invert=args.invert, - left_keys=args.left_keys, - right_keys=args.right_keys, + ie: IfExists = IfExists(input_reader_args=input_args, + input_keys=args._input_keys, + filter_reader_args=filter_args, + filter_keys=args._filter_keys, + output_file_path=args.output_file_path, field_separator=args.field_separator, - short_line_action=args.short_line_action, - long_line_action=args.long_line_action, - fill_short_lines=args.fill_short_lines, - truncate_long_lines=args.truncate_long_lines, - invalid_value_action=args.invalid_valid_action, + invert=args.invert, value_options=value_options, - gzip_in_parallel=args.gzip_in_parallel, - error_limit=args.error_limit, verbose=args.verbose, very_verbose=args.very_verbose) diff --git a/kgtk/join/kgtkreader.py b/kgtk/join/kgtkreader.py index 43d251ca0..2375c3fcc 100644 --- a/kgtk/join/kgtkreader.py +++ b/kgtk/join/kgtkreader.py @@ -839,12 +839,16 @@ def add_file_arguments(cls, node_options: bool = False, edge_options: bool = False, mode_options: bool = False, + optional_file: bool = True, who: str = ""): prefix1: str = "--" if len(who) == 0 else "--" + who + "-" prefix2: str = "" if len(who) == 0 else who + "_" prefix3: str = "" if len(who) == 0 else who + " " - parser.add_argument(dest=prefix2 + "kgtk_file", help="The KGTK file to read", type=Path, nargs="?") + if optional_file: + parser.add_argument(dest=prefix2 + "kgtk_file", help="The " + who + " KGTK file to read", type=Path, nargs="?") + else: + parser.add_argument(dest=prefix2 + "kgtk_file", help="The " + who + " KGTK file to read", type=Path) fgroup: _ArgumentGroup = parser.add_argument_group(prefix3 + "File options", "Options affecting " + prefix3 + "processing") From aef6af01950a47c72c575e426a021403534d2aa9 Mon Sep 17 00:00:00 2001 
From: Craig Milo Rogers Date: Fri, 8 May 2020 10:37:24 -0700 Subject: [PATCH 121/278] Reorganize new source files. --- kgtk/io/__init__.py | 0 kgtk/{join => io}/edgereader.py | 0 kgtk/{join => io}/kgtkbase.py | 0 kgtk/{join => io}/kgtkreader.py | 0 kgtk/{join => io}/kgtkwriter.py | 0 kgtk/{join => io}/nodereader.py | 0 kgtk/{join => }/kgtkformat.py | 0 kgtk/utils/__init__.py | 0 kgtk/{join => utils}/closableiter.py | 0 kgtk/{join => utils}/enumnameaction.py | 0 kgtk/{join => utils}/gzipprocess.py | 0 kgtk/{join => utils}/validationaction.py | 0 kgtk/value/__init__.py | 0 kgtk/{join => value}/kgtkvalue.py | 0 kgtk/{join => value}/kgtkvalueoptions.py | 0 kgtk/{join => value}/languagevalidator.py | 0 16 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 kgtk/io/__init__.py rename kgtk/{join => io}/edgereader.py (100%) rename kgtk/{join => io}/kgtkbase.py (100%) rename kgtk/{join => io}/kgtkreader.py (100%) rename kgtk/{join => io}/kgtkwriter.py (100%) rename kgtk/{join => io}/nodereader.py (100%) rename kgtk/{join => }/kgtkformat.py (100%) create mode 100644 kgtk/utils/__init__.py rename kgtk/{join => utils}/closableiter.py (100%) rename kgtk/{join => utils}/enumnameaction.py (100%) rename kgtk/{join => utils}/gzipprocess.py (100%) rename kgtk/{join => utils}/validationaction.py (100%) create mode 100644 kgtk/value/__init__.py rename kgtk/{join => value}/kgtkvalue.py (100%) rename kgtk/{join => value}/kgtkvalueoptions.py (100%) rename kgtk/{join => value}/languagevalidator.py (100%) diff --git a/kgtk/io/__init__.py b/kgtk/io/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/kgtk/join/edgereader.py b/kgtk/io/edgereader.py similarity index 100% rename from kgtk/join/edgereader.py rename to kgtk/io/edgereader.py diff --git a/kgtk/join/kgtkbase.py b/kgtk/io/kgtkbase.py similarity index 100% rename from kgtk/join/kgtkbase.py rename to kgtk/io/kgtkbase.py diff --git a/kgtk/join/kgtkreader.py b/kgtk/io/kgtkreader.py similarity index 100% rename from kgtk/join/kgtkreader.py rename to kgtk/io/kgtkreader.py diff --git a/kgtk/join/kgtkwriter.py b/kgtk/io/kgtkwriter.py similarity index 100% rename from kgtk/join/kgtkwriter.py rename to kgtk/io/kgtkwriter.py diff --git a/kgtk/join/nodereader.py b/kgtk/io/nodereader.py similarity index 100% rename from kgtk/join/nodereader.py rename to kgtk/io/nodereader.py diff --git a/kgtk/join/kgtkformat.py b/kgtk/kgtkformat.py similarity index 100% rename from kgtk/join/kgtkformat.py rename to kgtk/kgtkformat.py diff --git a/kgtk/utils/__init__.py b/kgtk/utils/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/kgtk/join/closableiter.py b/kgtk/utils/closableiter.py similarity index 100% rename from kgtk/join/closableiter.py rename to kgtk/utils/closableiter.py diff --git a/kgtk/join/enumnameaction.py b/kgtk/utils/enumnameaction.py similarity index 100% rename from kgtk/join/enumnameaction.py rename to kgtk/utils/enumnameaction.py diff --git a/kgtk/join/gzipprocess.py b/kgtk/utils/gzipprocess.py similarity index 100% rename from kgtk/join/gzipprocess.py rename to kgtk/utils/gzipprocess.py diff --git a/kgtk/join/validationaction.py b/kgtk/utils/validationaction.py similarity index 100% rename from kgtk/join/validationaction.py rename to kgtk/utils/validationaction.py diff --git a/kgtk/value/__init__.py b/kgtk/value/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/kgtk/join/kgtkvalue.py b/kgtk/value/kgtkvalue.py similarity index 100% rename from kgtk/join/kgtkvalue.py rename to 
kgtk/value/kgtkvalue.py diff --git a/kgtk/join/kgtkvalueoptions.py b/kgtk/value/kgtkvalueoptions.py similarity index 100% rename from kgtk/join/kgtkvalueoptions.py rename to kgtk/value/kgtkvalueoptions.py diff --git a/kgtk/join/languagevalidator.py b/kgtk/value/languagevalidator.py similarity index 100% rename from kgtk/join/languagevalidator.py rename to kgtk/value/languagevalidator.py From 60b5051b1d39b3d386e3bae1a3ba5806ea683d58 Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Fri, 8 May 2020 10:37:48 -0700 Subject: [PATCH 122/278] Reorganize new source files. --- kgtk/cli/clean_data.py | 10 +++++----- kgtk/cli/ifexists.py | 12 ++++++------ kgtk/cli/ifnotexists.py | 12 ++++++------ kgtk/cli/validate.py | 10 +++++----- kgtk/io/edgereader.py | 10 +++++----- kgtk/io/kgtkbase.py | 6 +++--- kgtk/io/kgtkreader.py | 24 ++++++++++++------------ kgtk/io/kgtkwriter.py | 12 ++++++------ kgtk/io/nodereader.py | 10 +++++----- kgtk/join/ifexists.py | 12 ++++++------ kgtk/join/kgtkjoiner.py | 12 ++++++------ kgtk/utils/gzipprocess.py | 2 +- kgtk/value/kgtkvalue.py | 6 +++--- kgtk/value/languagevalidator.py | 2 +- 14 files changed, 70 insertions(+), 70 deletions(-) diff --git a/kgtk/cli/clean_data.py b/kgtk/cli/clean_data.py index 5fdb9dcd4..bb4059d6b 100644 --- a/kgtk/cli/clean_data.py +++ b/kgtk/cli/clean_data.py @@ -8,11 +8,11 @@ import sys import typing -from kgtk.join.enumnameaction import EnumNameAction -from kgtk.join.kgtkformat import KgtkFormat -from kgtk.join.kgtkreader import KgtkReader -from kgtk.join.kgtkwriter import KgtkWriter -from kgtk.join.validationaction import ValidationAction +from kgtk.kgtkformat import KgtkFormat +from kgtk.io.kgtkreader import KgtkReader +from kgtk.io.kgtkwriter import KgtkWriter +from kgtk.utils.enumnameaction import EnumNameAction +from kgtk.utils.validationaction import ValidationAction def parser(): return { diff --git a/kgtk/cli/ifexists.py b/kgtk/cli/ifexists.py index a1e288b25..d72f320a6 100644 --- a/kgtk/cli/ifexists.py +++ b/kgtk/cli/ifexists.py @@ -6,13 +6,13 @@ import sys import typing -from kgtk.join.enumnameaction import EnumNameAction -from kgtk.join.kgtkformat import KgtkFormat +from kgtk.kgtkformat import KgtkFormat +from kgtk.io.kgtkreader import KgtkReader +from kgtk.io.kgtkwriter import KgtkWriter from kgtk.join.ifexists import IfExists -from kgtk.join.kgtkreader import KgtkReader -from kgtk.join.kgtkwriter import KgtkWriter -from kgtk.join.kgtkvalueoptions import KgtkValueOptions -from kgtk.join.validationaction import ValidationAction +from kgtk.utils.enumnameaction import EnumNameAction +from kgtk.utils.validationaction import ValidationAction +from kgtk.value.kgtkvalueoptions import KgtkValueOptions def parser(): return { diff --git a/kgtk/cli/ifnotexists.py b/kgtk/cli/ifnotexists.py index e9003a9ef..69bbbff61 100644 --- a/kgtk/cli/ifnotexists.py +++ b/kgtk/cli/ifnotexists.py @@ -7,13 +7,13 @@ import sys import typing -from kgtk.join.enumnameaction import EnumNameAction -from kgtk.join.kgtkformat import KgtkFormat +from kgtk.kgtkformat import KgtkFormat +from kgtk.io.kgtkreader import KgtkReader +from kgtk.io.kgtkwriter import KgtkWriter from kgtk.join.ifexists import IfExists -from kgtk.join.kgtkreader import KgtkReader -from kgtk.join.kgtkwriter import KgtkWriter -from kgtk.join.kgtkvalueoptions import KgtkValueOptions -from kgtk.join.validationaction import ValidationAction +from kgtk.utils.enumnameaction import EnumNameAction +from kgtk.utils.validationaction import ValidationAction +from kgtk.value.kgtkvalueoptions import 
KgtkValueOptions def parser(): return { diff --git a/kgtk/cli/validate.py b/kgtk/cli/validate.py index ce2fce9e2..a674baac0 100644 --- a/kgtk/cli/validate.py +++ b/kgtk/cli/validate.py @@ -15,11 +15,11 @@ import sys import typing -from kgtk.join.enumnameaction import EnumNameAction -from kgtk.join.kgtkformat import KgtkFormat -from kgtk.join.kgtkreader import KgtkReader -from kgtk.join.kgtkvalueoptions import KgtkValueOptions -from kgtk.join.validationaction import ValidationAction +from kgtk.kgtkformat import KgtkFormat +from kgtk.io.kgtkreader import KgtkReader +from kgtk.utils.enumnameaction import EnumNameAction +from kgtk.utils.validationaction import ValidationAction +from kgtk.value.kgtkvalueoptions import KgtkValueOptions def parser(): return { diff --git a/kgtk/io/edgereader.py b/kgtk/io/edgereader.py index 3fe378fa0..92aa60d17 100644 --- a/kgtk/io/edgereader.py +++ b/kgtk/io/edgereader.py @@ -10,11 +10,11 @@ import sys import typing -from kgtk.join.closableiter import ClosableIter -from kgtk.join.enumnameaction import EnumNameAction -from kgtk.join.kgtkreader import KgtkReader -from kgtk.join.kgtkvalueoptions import KgtkValueOptions -from kgtk.join.validationaction import ValidationAction +from kgtk.io.kgtkreader import KgtkReader +from kgtk.utils.closableiter import ClosableIter +from kgtk.utils.enumnameaction import EnumNameAction +from kgtk.utils.validationaction import ValidationAction +from kgtk.value.kgtkvalueoptions import KgtkValueOptions @attr.s(slots=True, frozen=False) class EdgeReader(KgtkReader): diff --git a/kgtk/io/kgtkbase.py b/kgtk/io/kgtkbase.py index 2299725d2..7cbcef44c 100644 --- a/kgtk/io/kgtkbase.py +++ b/kgtk/io/kgtkbase.py @@ -7,9 +7,9 @@ import sys import typing -from kgtk.join.validationaction import ValidationAction -from kgtk.join.kgtkformat import KgtkFormat -from kgtk.join.kgtkvalue import KgtkValue +from kgtk.kgtkformat import KgtkFormat +from kgtk.utils.validationaction import ValidationAction +from kgtk.value.kgtkvalue import KgtkValue class KgtkBase(KgtkFormat): @classmethod diff --git a/kgtk/io/kgtkreader.py b/kgtk/io/kgtkreader.py index b694e9e8e..707fbdb01 100644 --- a/kgtk/io/kgtkreader.py +++ b/kgtk/io/kgtkreader.py @@ -28,14 +28,14 @@ import sys import typing -from kgtk.join.closableiter import ClosableIter, ClosableIterTextIOWrapper -from kgtk.join.enumnameaction import EnumNameAction -from kgtk.join.gzipprocess import GunzipProcess -from kgtk.join.kgtkbase import KgtkBase -from kgtk.join.kgtkformat import KgtkFormat -from kgtk.join.kgtkvalue import KgtkValue -from kgtk.join.kgtkvalueoptions import KgtkValueOptions, DEFAULT_KGTK_VALUE_OPTIONS -from kgtk.join.validationaction import ValidationAction +from kgtk.kgtkformat import KgtkFormat +from kgtk.io.kgtkbase import KgtkBase +from kgtk.utils.closableiter import ClosableIter, ClosableIterTextIOWrapper +from kgtk.utils.enumnameaction import EnumNameAction +from kgtk.utils.gzipprocess import GunzipProcess +from kgtk.utils.validationaction import ValidationAction +from kgtk.value.kgtkvalue import KgtkValue +from kgtk.value.kgtkvalueoptions import KgtkValueOptions, DEFAULT_KGTK_VALUE_OPTIONS @attr.s(slots=True, frozen=False) class KgtkReader(KgtkBase, ClosableIter[typing.List[str]]): @@ -216,7 +216,7 @@ def open(cls, if is_edge_file: # We'll instantiate an EdgeReader, which is a subclass of KgtkReader. # The EdgeReader import is deferred to avoid circular imports. 
- from kgtk.join.edgereader import EdgeReader + from kgtk.io.edgereader import EdgeReader # Get the indices of the required columns. node1_column_idx: int @@ -276,7 +276,7 @@ def open(cls, elif is_node_file: # We'll instantiate an NodeReader, which is a subclass of KgtkReader. # The NodeReader import is deferred to avoid circular imports. - from kgtk.join.nodereader import NodeReader + from kgtk.io.nodereader import NodeReader # Get the index of the required column: id_column_idx: int = cls.required_node_column(column_name_map, @@ -917,9 +917,9 @@ def main(): Test the KGTK file reader. """ # The EdgeReader import is deferred to avoid circular imports. - from kgtk.join.edgereader import EdgeReader + from kgtk.io.edgereader import EdgeReader # The NodeReader import is deferred to avoid circular imports. - from kgtk.join.nodereader import NodeReader + from kgtk.io.nodereader import NodeReader parser = ArgumentParser() KgtkReader.add_operation_arguments(parser) diff --git a/kgtk/io/kgtkwriter.py b/kgtk/io/kgtkwriter.py index 367b6f88f..d3d256c07 100644 --- a/kgtk/io/kgtkwriter.py +++ b/kgtk/io/kgtkwriter.py @@ -15,12 +15,12 @@ import sys import typing -from kgtk.join.kgtkreader import KgtkReader -from kgtk.join.enumnameaction import EnumNameAction -from kgtk.join.gzipprocess import GzipProcess -from kgtk.join.kgtkbase import KgtkBase -from kgtk.join.kgtkformat import KgtkFormat -from kgtk.join.validationaction import ValidationAction +from kgtk.kgtkformat import KgtkFormat +from kgtk.io.kgtkbase import KgtkBase +from kgtk.io.kgtkreader import KgtkReader +from kgtk.utils.enumnameaction import EnumNameAction +from kgtk.utils.gzipprocess import GzipProcess +from kgtk.utils.validationaction import ValidationAction @attr.s(slots=True, frozen=False) class KgtkWriter(KgtkBase): diff --git a/kgtk/io/nodereader.py b/kgtk/io/nodereader.py index 9fb88d260..9d7df148f 100644 --- a/kgtk/io/nodereader.py +++ b/kgtk/io/nodereader.py @@ -10,11 +10,11 @@ import sys import typing -from kgtk.join.closableiter import ClosableIter -from kgtk.join.enumnameaction import EnumNameAction -from kgtk.join.kgtkreader import KgtkReader -from kgtk.join.kgtkvalueoptions import KgtkValueOptions -from kgtk.join.validationaction import ValidationAction +from kgtk.io.kgtkreader import KgtkReader +from kgtk.utils.closableiter import ClosableIter +from kgtk.utils.enumnameaction import EnumNameAction +from kgtk.utils.validationaction import ValidationAction +from kgtk.value.kgtkvalueoptions import KgtkValueOptions @attr.s(slots=True, frozen=False) class NodeReader(KgtkReader): diff --git a/kgtk/join/ifexists.py b/kgtk/join/ifexists.py index 0cab4735b..bed0d6408 100644 --- a/kgtk/join/ifexists.py +++ b/kgtk/join/ifexists.py @@ -24,12 +24,12 @@ import sys import typing -from kgtk.join.enumnameaction import EnumNameAction -from kgtk.join.kgtkformat import KgtkFormat -from kgtk.join.kgtkreader import KgtkReader -from kgtk.join.kgtkwriter import KgtkWriter -from kgtk.join.kgtkvalueoptions import KgtkValueOptions -from kgtk.join.validationaction import ValidationAction +from kgtk.kgtkformat import KgtkFormat +from kgtk.io.kgtkreader import KgtkReader +from kgtk.io.kgtkwriter import KgtkWriter +from kgtk.utils.enumnameaction import EnumNameAction +from kgtk.utils.validationaction import ValidationAction +from kgtk.value.kgtkvalueoptions import KgtkValueOptions @attr.s(slots=True, frozen=True) class IfExists(KgtkFormat): diff --git a/kgtk/join/kgtkjoiner.py b/kgtk/join/kgtkjoiner.py index d512d7a0b..4855ac774 100644 --- 
a/kgtk/join/kgtkjoiner.py +++ b/kgtk/join/kgtkjoiner.py @@ -14,12 +14,12 @@ import sys import typing -from kgtk.join.enumnameaction import EnumNameAction -from kgtk.join.kgtkreader import KgtkReader -from kgtk.join.kgtkformat import KgtkFormat -from kgtk.join.kgtkwriter import KgtkWriter -from kgtk.join.kgtkvalueoptions import KgtkValueOptions -from kgtk.join.validationaction import ValidationAction +from kgtk.kgtkformat import KgtkFormat +from kgtk.io.kgtkreader import KgtkReader +from kgtk.io.kgtkwriter import KgtkWriter +from kgtk.utils.enumnameaction import EnumNameAction +from kgtk.utils.validationaction import ValidationAction +from kgtk.value.kgtkvalueoptions import KgtkValueOptions @attr.s(slots=True, frozen=True) class KgtkJoiner(KgtkFormat): diff --git a/kgtk/utils/gzipprocess.py b/kgtk/utils/gzipprocess.py index acacacb83..b63e86719 100644 --- a/kgtk/utils/gzipprocess.py +++ b/kgtk/utils/gzipprocess.py @@ -5,7 +5,7 @@ from multiprocessing import Process, Queue import typing -from kgtk.join.closableiter import ClosableIter +from kgtk.utils.closableiter import ClosableIter # This helper class supports running gzip in parallel. # diff --git a/kgtk/value/kgtkvalue.py b/kgtk/value/kgtkvalue.py index 9bec2605e..ea65b66bd 100644 --- a/kgtk/value/kgtkvalue.py +++ b/kgtk/value/kgtkvalue.py @@ -8,9 +8,9 @@ import sys import typing -from kgtk.join.kgtkformat import KgtkFormat -from kgtk.join.kgtkvalueoptions import KgtkValueOptions, DEFAULT_KGTK_VALUE_OPTIONS -from kgtk.join.languagevalidator import LanguageValidator +from kgtk.kgtkformat import KgtkFormat +from kgtk.value.kgtkvalueoptions import KgtkValueOptions, DEFAULT_KGTK_VALUE_OPTIONS +from kgtk.value.languagevalidator import LanguageValidator @attr.s(slots=True, frozen=False) class KgtkValue(KgtkFormat): diff --git a/kgtk/value/languagevalidator.py b/kgtk/value/languagevalidator.py index 4c7c9ff2e..f9606f221 100644 --- a/kgtk/value/languagevalidator.py +++ b/kgtk/value/languagevalidator.py @@ -8,7 +8,7 @@ import pycountry # type: ignore import typing -from kgtk.join.kgtkvalueoptions import KgtkValueOptions, DEFAULT_KGTK_VALUE_OPTIONS +from kgtk.value.kgtkvalueoptions import KgtkValueOptions, DEFAULT_KGTK_VALUE_OPTIONS # Problem: pycountry incorporates the Debian team's ISO 639-3 table, # which as of 03-May-2020 has not been updated in four years! From 5b1fab022c329b93bc880373426582a67feccf3a Mon Sep 17 00:00:00 2001 From: rongpenl <45610532+rongpenl@users.noreply.github.com> Date: Fri, 8 May 2020 10:55:03 -0700 Subject: [PATCH 123/278] Deleting the hard-coded test --- kgtk/cli/generate_wikidata_triples.py | 27 --------------------------- 1 file changed, 27 deletions(-) diff --git a/kgtk/cli/generate_wikidata_triples.py b/kgtk/cli/generate_wikidata_triples.py index cc21f5c2a..474ee51d5 100644 --- a/kgtk/cli/generate_wikidata_triples.py +++ b/kgtk/cli/generate_wikidata_triples.py @@ -161,30 +161,3 @@ def run( else: generator.entry_point(line_num+1,edge) generator.finalize() - -# testing profiling locally with direct call -# pip3 install snakeviz -# run `snakeviz /tmp/tmp.dat` to visualize the call stacks. 
-# python3 -m cProfile -o /tmp/tmp.dat generate_wikidata_triples.py -if __name__ == "__main__": - import gzip - from kgtk.triple_generator import TripleGenerator - import sys - with open("/tmp/gwt.log","w") as dest_fp: - generator = TripleGenerator( - prop_file="/Users/rongpeng/Documents/ISI/Covid19/covid_data/v1.3/heng_props.tsv", - label_set="label", - alias_set="aliases", - description_set="descriptions", - n=10000, - ignore=True, - truthy=True, - dest_fp = dest_fp - ) - with open("/Users/rongpeng/Documents/ISI/Covid19/covid_data/v1.3/kgtk_sample_sorted.tsv","r") as fp: - for num, edge in enumerate(fp.readlines()): - if edge.startswith("#") or num == 0: - continue - else: - generator.entry_point(num+1,edge) - generator.finalize() \ No newline at end of file From f74029d9029b86f43e709a828788c96587be1d10 Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Fri, 8 May 2020 10:55:36 -0700 Subject: [PATCH 124/278] Support prefixed KgtkValueOptions initialization. --- kgtk/value/kgtkvalueoptions.py | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/kgtk/value/kgtkvalueoptions.py b/kgtk/value/kgtkvalueoptions.py index 767f2d199..be957d808 100644 --- a/kgtk/value/kgtkvalueoptions.py +++ b/kgtk/value/kgtkvalueoptions.py @@ -113,16 +113,21 @@ def add_arguments(cls, parser: ArgumentParser): @classmethod # Build the value parsing option structure. - def from_args(cls, args: Namespace)->'KgtkValueOptions': - return cls(allow_month_or_day_zero=args.allow_month_or_day_zero, - repair_month_or_day_zero=args.repair_month_or_day_zero, - allow_language_suffixes=args.allow_language_suffixes, - allow_lax_strings=args.allow_lax_strings, - allow_lax_lq_strings=args.allow_lax_lq_strings, - additional_language_codes=args.additional_language_codes, - minimum_valid_year=args.minimum_valid_year, - maximum_valid_year=args.maximum_valid_year, - escape_list_separators=args.escape_list_separators) + def from_dict(cls, d: dict, prefix: str = "")->'KgtkValueOptions': + return cls(allow_month_or_day_zero=d.get(prefix + "allow_month_or_day_zero", False), + repair_month_or_day_zero=d.get(prefix + "repair_month_or_day_zero", False), + allow_language_suffixes=d.get(prefix + "allow_language_suffixes", True), + allow_lax_strings=d.get(prefix + "allow_lax_strings", False), + allow_lax_lq_strings=d.get(prefix + "allow_lax_lq_strings", False), + additional_language_codes=d.get(prefix + "additional_language_codes", None), + minimum_valid_year=d.get(prefix + "minimum_valid_year", cls.MINIMUM_VALID_YEAR), + maximum_valid_year=d.get(prefix + "maximum_valid_year", cls.MAXIMUM_VALID_YEAR), + escape_list_separators=d.get(prefix + "escape_list_separators", False)) + + @classmethod + # Build the value parsing option structure. + def from_args(cls, args: Namespace, prefix: str = "")->'KgtkValueOptions': + return cls.from_dict(vars(args), prefix=prefix) DEFAULT_KGTK_VALUE_OPTIONS: KgtkValueOptions = KgtkValueOptions() From 0db82df2c16ac34828e7326a5887d65d780104db Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Fri, 8 May 2020 13:52:18 -0700 Subject: [PATCH 125/278] Prefixed KGTK value option arguments. 
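This patch lets a command register several independent sets of KGTK value options, one per input file, without duplicating every option declaration. Here is a minimal sketch of the intended usage (illustrative code, not part of the patch; it mirrors the self-test added to main() below):

    from argparse import ArgumentParser, Namespace
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    parser: ArgumentParser = ArgumentParser()
    # Unprefixed options: --allow-lax-strings -> args.allow_lax_strings
    KgtkValueOptions.add_arguments(parser)
    # Prefixed options: --left-allow-lax-strings -> args.left_allow_lax_strings
    KgtkValueOptions.add_arguments(parser, who="left", desc=" for the left file.")

    args: Namespace = parser.parse_args(["--left-allow-lax-strings"])
    # Two independent option structures built from the same Namespace:
    value_options: KgtkValueOptions = KgtkValueOptions.from_args(args)
    left_options: KgtkValueOptions = KgtkValueOptions.from_args(args, who="left")

In this sketch left_options.allow_lax_strings is True while value_options.allow_lax_strings keeps its default of False. Keeping the prefix logic inside add_arguments and from_dict means callers pass only a "who" string instead of redeclaring every option.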
--- kgtk/value/kgtkvalueoptions.py | 100 +++++++++++++++++++++------------ 1 file changed, 65 insertions(+), 35 deletions(-) diff --git a/kgtk/value/kgtkvalueoptions.py b/kgtk/value/kgtkvalueoptions.py index be957d808..124fc17be 100644 --- a/kgtk/value/kgtkvalueoptions.py +++ b/kgtk/value/kgtkvalueoptions.py @@ -58,62 +58,83 @@ class KgtkValueOptions: @classmethod - def add_arguments(cls, parser: ArgumentParser): - vgroup = parser.add_argument_group("Data value parsing", "Options controlling the parsing and processing of KGTK data values.") - vgroup.add_argument( "--additional-language-codes", dest="additional_language_codes", - help="Additional language codes.", nargs="*", default=None) + def add_arguments(cls, parser: ArgumentParser, who: str = "", desc: str = "."): + """Add arguments for KgtkValue option processing. + + When "who" is not empty, it prefixes the options, destinations, and + help messages. This facilitates constructing command lines with + multiple sets of KGTKValue options, such as for different input files. + """ + prefix1: str = "--" # The command line argument prefix. + prefix2: str = "" # The destination name prefix. + prefix3: str = "" # The help message prefix. + + if len(who) > 0: + prefix1 = "--" + who + "-" + prefix2 = who + "_" + prefix3 = who + ": " + + vgroup = parser.add_argument_group(prefix3 + "Data value parsing", "Options controlling the parsing and processing of KGTK data values" + desc) + vgroup.add_argument( prefix1 + "additional-language-codes", dest=prefix2 + "additional_language_codes", + help=prefix3 + "Additional language codes.", nargs="*", default=None) lsgroup= vgroup.add_mutually_exclusive_group() - lsgroup.add_argument( "--allow-language-suffixes", dest="allow_language_suffixes", - help="Allow language identifier suffixes starting with a dash.", action='store_true', default=True) + lsgroup.add_argument( prefix1 + "allow-language-suffixes", dest=prefix2 + "allow_language_suffixes", + help=prefix3 + "Allow language identifier suffixes starting with a dash.", action='store_true', default=True) - lsgroup.add_argument( "--disallow-language-suffixes", dest="allow_language_suffixes", - help="Disallow language identifier suffixes starting with a dash.", action='store_false') + lsgroup.add_argument( prefix1 + "disallow-language-suffixes", dest=prefix2 + "allow_language_suffixes", + help=prefix3 + "Disallow language identifier suffixes starting with a dash.", action='store_false') laxgroup= vgroup.add_mutually_exclusive_group() - laxgroup.add_argument( "--allow-lax-strings", dest="allow_lax_strings", - help="Do not check if double quotes are backslashed inside strings.", action='store_true', default=False) + laxgroup.add_argument( prefix1 + "allow-lax-strings", dest=prefix2 + "allow_lax_strings", + help=prefix3 + "Do not check if double quotes are backslashed inside strings.", action='store_true', default=False) - laxgroup.add_argument( "--disallow-lax-strings", dest="allow_lax_strings", - help="Check if double quotes are backslashed inside strings.", action='store_false') + laxgroup.add_argument( prefix1 + "disallow-lax-strings", dest=prefix2 + "allow_lax_strings", + help=prefix3 + "Check if double quotes are backslashed inside strings.", action='store_false') lqgroup= vgroup.add_mutually_exclusive_group() - lqgroup.add_argument( "--allow-lax-lq-strings", dest="allow_lax_lq_strings", - help="Do not check if single quotes are backslashed inside language qualified strings.", action='store_true', default=False) + lqgroup.add_argument( prefix1 + 
"allow-lax-lq-strings", dest=prefix2 + "allow_lax_lq_strings", + help=prefix3 + "Do not check if single quotes are backslashed inside language qualified strings.", + action='store_true', default=False) - lqgroup.add_argument( "--disallow-lax-lq-strings", dest="allow_lax_lq_strings", - help="Check if single quotes are backslashed inside language qualified strings.", action='store_false') + lqgroup.add_argument( prefix1 + "disallow-lax-lq-strings", dest=prefix2 + "allow_lax_lq_strings", + help=prefix3 + "Check if single quotes are backslashed inside language qualified strings.", + action='store_false') amd0group= vgroup.add_mutually_exclusive_group() - amd0group.add_argument( "--allow-month-or-day-zero", dest="allow_month_or_day_zero", - help="Allow month or day zero in dates.", action='store_true', default=False) + amd0group.add_argument( prefix1 + "allow-month-or-day-zero", dest=prefix2 + "allow_month_or_day_zero", + help=prefix3 + "Allow month or day zero in dates.", action='store_true', default=False) - amd0group.add_argument( "--disallow-month-or-day-zero", dest="allow_month_or_day_zero", - help="Allow month or day zero in dates.", action='store_false') + amd0group.add_argument( prefix1 + "disallow-month-or-day-zero", dest=prefix2 + "allow_month_or_day_zero", + help=prefix3 + "Allow month or day zero in dates.", action='store_false') rmd0group= vgroup.add_mutually_exclusive_group() - rmd0group.add_argument( "--repair-month-or-day-zero", dest="repair_month_or_day_zero", - help="Repair month or day zero in dates.", action='store_true', default=False) + rmd0group.add_argument( prefix1 + "repair-month-or-day-zero", dest=prefix2 + "repair_month_or_day_zero", + help=prefix3 + "Repair month or day zero in dates.", action='store_true', default=False) - rmd0group.add_argument( "--no-repair-month-or-day-zero", dest="repair_month_or_day_zero", - help="Do not repair month or day zero in dates.", action='store_false') + rmd0group.add_argument( prefix1 + "no-repair-month-or-day-zero", dest=prefix2 + "repair_month_or_day_zero", + help=prefix3 + "Do not repair month or day zero in dates.", action='store_false') - vgroup.add_argument( "--minimum-valid-year", dest="minimum_valid_year", - help="The minimum valid year in dates.", type=int, default=cls.MINIMUM_VALID_YEAR) + vgroup.add_argument( prefix1 + "minimum-valid-year", dest=prefix2 + "minimum_valid_year", + help=prefix3 + "The minimum valid year in dates.", type=int, default=cls.MINIMUM_VALID_YEAR) - vgroup.add_argument( "--maximum-valid-year", dest="maximum_valid_year", - help="The maximum valid year in dates.", type=int, default=cls.MAXIMUM_VALID_YEAR) + vgroup.add_argument( prefix1 + "maximum-valid-year", dest=prefix2 + "maximum_valid_year", + help=prefix3 + "The maximum valid year in dates.", type=int, default=cls.MAXIMUM_VALID_YEAR) elsgroup= vgroup.add_mutually_exclusive_group() - elsgroup.add_argument( "--escape-list-separators", dest="escape_list_separators", - help="Escape all list separators instead of splitting on them.", action='store_true', default=False) + elsgroup.add_argument( prefix1 + "escape-list-separators", dest=prefix2 + "escape_list_separators", + help=prefix3 + "Escape all list separators instead of splitting on them.", action='store_true', default=False) - elsgroup.add_argument( "--no-escape-list-separators", dest="escape_list_separators", - help="Do not escape list separators.", action='store_false') + elsgroup.add_argument( prefix1 + "no-escape-list-separators", dest=prefix2 + "escape_list_separators", + help=prefix3 + "Do 
not escape list separators.", action='store_false') @classmethod # Build the value parsing option structure. - def from_dict(cls, d: dict, prefix: str = "")->'KgtkValueOptions': + def from_dict(cls, d: dict, who: str = "")->'KgtkValueOptions': + prefix: str = "" # The destination name prefix. + if len(who) > 0: + prefix = who + "_" + return cls(allow_month_or_day_zero=d.get(prefix + "allow_month_or_day_zero", False), repair_month_or_day_zero=d.get(prefix + "repair_month_or_day_zero", False), allow_language_suffixes=d.get(prefix + "allow_language_suffixes", True), @@ -126,8 +147,8 @@ def from_dict(cls, d: dict, prefix: str = "")->'KgtkValueOptions': @classmethod # Build the value parsing option structure. - def from_args(cls, args: Namespace, prefix: str = "")->'KgtkValueOptions': - return cls.from_dict(vars(args), prefix=prefix) + def from_args(cls, args: Namespace, who: str = "")->'KgtkValueOptions': + return cls.from_dict(vars(args), who=who) DEFAULT_KGTK_VALUE_OPTIONS: KgtkValueOptions = KgtkValueOptions() @@ -137,6 +158,8 @@ def main(): """ parser: ArgumentParser = ArgumentParser() KgtkValueOptions.add_arguments(parser) + KgtkValueOptions.add_arguments(parser, who="left", desc=" for the left file.") + KgtkValueOptions.add_arguments(parser, who="right", desc=" for the right file.") args: Namespace = parser.parse_args() # Build the value parsing option structure. @@ -150,6 +173,13 @@ def main(): print("additional_language_codes: None") else: print("additional_language_codes: [ %s ]" % ", ".join(value_options.additional_language_codes)) + + # Test prefixed value option processing. + left_value_options: KgtkValueOptions = KgtkValueOptions.from_args(args, who="left") + print("left_allow_month_or_day_zero: %s" % str(left_value_options.allow_month_or_day_zero)) + + right_value_options: KgtkValueOptions = KgtkValueOptions.from_args(args, who="right") + print("right_allow_month_or_day_zero: %s" % str(right_value_options.allow_month_or_day_zero)) if __name__ == "__main__": main() From 35ae6f98a8687b5ffe2f9d9ff23edf82d0c590a3 Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Fri, 8 May 2020 13:55:59 -0700 Subject: [PATCH 126/278] Add options for minimum/maximum lat/lon override. 
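The new bounds feed the location-coordinates check, which compares parsed values against options.minimum_valid_lat and its siblings. Below is a minimal sketch of overriding them, assuming the MINIMUM_VALID_LAT, MAXIMUM_VALID_LAT, MINIMUM_VALID_LON, and MAXIMUM_VALID_LON class constants referenced by the new defaults are defined on KgtkValueOptions; note that the new arguments are declared with type=int, so overrides are whole degrees:

    from argparse import ArgumentParser, Namespace
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    parser: ArgumentParser = ArgumentParser()
    KgtkValueOptions.add_arguments(parser)

    # Loosen the latitude/longitude bounds, e.g. to accept data that encodes
    # missing coordinates with out-of-range sentinel values.
    args: Namespace = parser.parse_args(["--minimum-valid-lat", "-91",
                                         "--maximum-valid-lon", "181"])
    value_options: KgtkValueOptions = KgtkValueOptions.from_args(args)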
--- kgtk/value/kgtkvalueoptions.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/kgtk/value/kgtkvalueoptions.py b/kgtk/value/kgtkvalueoptions.py index 124fc17be..d37a99243 100644 --- a/kgtk/value/kgtkvalueoptions.py +++ b/kgtk/value/kgtkvalueoptions.py @@ -121,6 +121,18 @@ def add_arguments(cls, parser: ArgumentParser, who: str = "", desc: str = "."): vgroup.add_argument( prefix1 + "maximum-valid-year", dest=prefix2 + "maximum_valid_year", help=prefix3 + "The maximum valid year in dates.", type=int, default=cls.MAXIMUM_VALID_YEAR) + vgroup.add_argument( prefix1 + "minimum-valid-lat", dest=prefix2 + "minimum_valid_lat", + help=prefix3 + "The minimum valid latitude.", type=int, default=cls.MINIMUM_VALID_LAT) + + vgroup.add_argument( prefix1 + "maximum-valid-lat", dest=prefix2 + "maximum_valid_lat", + help=prefix3 + "The maximum valid latitude.", type=int, default=cls.MAXIMUM_VALID_LAT) + + vgroup.add_argument( prefix1 + "minimum-valid-lon", dest=prefix2 + "minimum_valid_lon", + help=prefix3 + "The minimum valid longitude.", type=int, default=cls.MINIMUM_VALID_LON) + + vgroup.add_argument( prefix1 + "maximum-valid-lon", dest=prefix2 + "maximum_valid_lon", + help=prefix3 + "The maximum valid longitude.", type=int, default=cls.MAXIMUM_VALID_LON) + elsgroup= vgroup.add_mutually_exclusive_group() elsgroup.add_argument( prefix1 + "escape-list-separators", dest=prefix2 + "escape_list_separators", help=prefix3 + "Escape all list separators instead of splitting on them.", action='store_true', default=False) From e595c287611e23f632f30af4811a37fe8b2a40b4 Mon Sep 17 00:00:00 2001 From: saggu Date: Fri, 8 May 2020 14:42:28 -0700 Subject: [PATCH 127/278] remove removal of very small values --- kgtk/triple_generator.py | 50 ++++++++++++++++++++-------------------- requirements.txt | 1 + 2 files changed, 26 insertions(+), 25 deletions(-) diff --git a/kgtk/triple_generator.py b/kgtk/triple_generator.py index 675ceffe9..1ed5b5414 100644 --- a/kgtk/triple_generator.py +++ b/kgtk/triple_generator.py @@ -21,7 +21,7 @@ ) BAD_CHARS = [":", "-", "&", ",", " ", - "(", ")", "\'", '\"', "/", "\\", "[", "]", ";","|"] + "(", ")", "\'", '\"', "/", "\\", "[", "]", ";", "|"] class TripleGenerator: @@ -30,16 +30,16 @@ class TripleGenerator: """ def __init__( - self, - prop_file: str, - label_set: str, - alias_set: str, - description_set: str, - ignore: bool, - n: int, - dest_fp: TextIO = sys.stdout, - truthy: bool = False, - use_id:bool=False, + self, + prop_file: str, + label_set: str, + alias_set: str, + description_set: str, + ignore: bool, + n: int, + dest_fp: TextIO = sys.stdout, + truthy: bool = False, + use_id: bool = False, ): from etk.wikidata.statement import Rank self.ignore = ignore @@ -200,7 +200,7 @@ def generate_prop_declaration_triple(self, node1: str, label: str, node2: str) - return True def generate_normal_triple( - self, node1: str, label: str, node2: str, is_qualifier_edge: bool,e_id:str) -> bool: + self, node1: str, label: str, node2: str, is_qualifier_edge: bool, e_id: str) -> bool: if self.use_id: e_id = TripleGenerator.replace_illegal_string(e_id) entity = self._node_2_entity(node1) @@ -244,7 +244,7 @@ def generate_normal_triple( dateTimeString.split("-00-00")) elif dateTimeString[8:10] == "00": dateTimeString = dateTimeString[:8] + \ - "01" + dateTimeString[10:] + "01" + dateTimeString[10:] object = TimeValue( value=dateTimeString, calendar=Item("Q1985727"), @@ -267,9 +267,6 @@ def generate_normal_triple( res = self.quantity_pattern.match(node2).groups() amount, lower_bound, 
upper_bound, unit = res - # Handle extra small numbers for now. TODO - if TripleGenerator.is_invalid_decimal_string(amount) or TripleGenerator.is_invalid_decimal_string(lower_bound) or TripleGenerator.is_invalid_decimal_string(upper_bound): - return False amount = TripleGenerator.clean_number_string(amount) lower_bound = TripleGenerator.clean_number_string(lower_bound) upper_bound = TripleGenerator.clean_number_string(upper_bound) @@ -285,6 +282,7 @@ def generate_normal_triple( amount, upper_bound=upper_bound, lower_bound=lower_bound) else: object = QuantityValue(amount) + elif edge_type == MonolingualText: text_string, lang = TripleGenerator.process_text_string(node2) object = MonolingualText(text_string, lang) @@ -314,10 +312,10 @@ def generate_normal_triple( self.doc.kg.add_subject(object) if self.truthy: self.to_append_statement = entity.add_truthy_statement( - label, object,statement_id=e_id) if self.use_id else entity.add_truthy_statement(label,object) + label, object, statement_id=e_id) if self.use_id else entity.add_truthy_statement(label, object) else: self.to_append_statement = entity.add_statement( - label, object,statement_id=e_id) if self.use_id else entity.add_statement(label, object) + label, object, statement_id=e_id) if self.use_id else entity.add_statement(label, object) self.doc.kg.add_subject(entity) return True @@ -370,8 +368,9 @@ def entry_point(self, line_number: int, edge: str): node2_index = edge_list.index("node2") prop_index = edge_list.index("property") id_index = edge_list.index("id") - if not all([node1_index>-1,node2_index>-1,prop_index>-1,id_index>-1]): - raise KGTKException("Header of kgtk file misses at least one of required column names: (node1, node2, property and id)") + if not all([node1_index > -1, node2_index > -1, prop_index > -1, id_index > -1]): + raise KGTKException( + "Header of kgtk file misses at least one of required column names: (node1, node2, property and id)") else: self.order_map["node1"] = node1_index self.order_map["node2"] = node2_index @@ -380,12 +379,12 @@ def entry_point(self, line_number: int, edge: str): return # use the order_map to map the node - + node1 = edge_list[self.order_map["node1"]].strip() node2 = edge_list[self.order_map["node2"]].strip() prop = edge_list[self.order_map["prop"]].strip() e_id = edge_list[self.order_map["id"]].strip() - if line_number == 2: + if line_number == 2: # by default a statement edge is_qualifier_edge = False # print("#Debug Info: ",line_number, self.to_append_statement_id, e_id, is_qualifier_edge,self.to_append_statement) @@ -428,7 +427,7 @@ def entry_point(self, line_number: int, edge: str): else: if prop in self.prop_types: success = self.generate_normal_triple( - node1, prop, node2, is_qualifier_edge,e_id) + node1, prop, node2, is_qualifier_edge, e_id) else: if not self.ignore: raise KGTKException( @@ -438,8 +437,9 @@ def entry_point(self, line_number: int, edge: str): success = False if (not success) and (not is_qualifier_edge) and (not self.ignore): # We have a corrupted edge here. 
- self.ignore_file.write("Corrupted statement at line number: {} with id {} with current corrupted id {}\n".format( - line_number, e_id, self.corrupted_statement_id)) + self.ignore_file.write( + "Corrupted statement at line number: {} with id {} with current corrupted id {}\n".format( + line_number, e_id, self.corrupted_statement_id)) self.ignore_file.flush() self.corrupted_statement_id = e_id else: diff --git a/requirements.txt b/requirements.txt index 254c51aa1..27e52a2c2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -16,3 +16,4 @@ attrs pycountry iso-639 redis +rfc3986 From 0be3326e1ccd51f9709c91761c073915bf15591c Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Fri, 8 May 2020 15:52:04 -0700 Subject: [PATCH 128/278] Return parsed fields in a separate object. --- kgtk/value/kgtkvalue.py | 620 +++++++++++++++++---------------- kgtk/value/kgtkvalueoptions.py | 5 +- 2 files changed, 317 insertions(+), 308 deletions(-) diff --git a/kgtk/value/kgtkvalue.py b/kgtk/value/kgtkvalue.py index ea65b66bd..e38cf575f 100644 --- a/kgtk/value/kgtkvalue.py +++ b/kgtk/value/kgtkvalue.py @@ -12,27 +12,15 @@ from kgtk.value.kgtkvalueoptions import KgtkValueOptions, DEFAULT_KGTK_VALUE_OPTIONS from kgtk.value.languagevalidator import LanguageValidator -@attr.s(slots=True, frozen=False) -class KgtkValue(KgtkFormat): - value: str = attr.ib(validator=attr.validators.instance_of(str)) - options: KgtkValueOptions = attr.ib(validator=attr.validators.instance_of(KgtkValueOptions), default=DEFAULT_KGTK_VALUE_OPTIONS) - - # TODO: proper validation. - parent: typing.Optional['KgtkValue'] = attr.ib(default=None) - # Cache some properties of the value that would be expensive to - # continuously recompute. - data_type: typing.Optional[KgtkFormat.DataType] = None - valid: typing.Optional[bool] = None - - # If this is a list, cache a KgtkValue object for each item of the list. - # - # Note: Please do not access this list directly. Use get_list_items(). - list_items: typing.Optional[typing.List['KgtkValue']] = None +@attr.s(slots=True, frozen=False) +class KgtkValueFields(): + data_type: KgtkFormat.DataType = attr.ib(validator=attr.validators.instance_of(KgtkFormat.DataType)) + valid: bool = attr.ib(validator=attr.validators.instance_of(bool)) # The following members offer access to the components (fields) of a # KgtkValue. They are accessible immediately after validating the - # contents of the KgtkValue object: + # contents of the KgtkValue object when kgtk_value.parse_fields is True. # # obj.is_valid() return True # obj.validate() returns True @@ -43,44 +31,157 @@ class KgtkValue(KgtkFormat): # The fields may be accessed directly from this object or they may be # obtained as a map via obj.get_fields() + # >0 if this is a list. + list_len: int = attr.ib(validator=attr.validators.instance_of(int), default=0) + # Offer the components of a string or language-qualified string, after validating the item. - contents: typing.Optional[str] = None # String contents without the enclosing quotes - lang: typing.Optional[str] = None # 2- or 3-character code without suffix. - suffix: typing.Optional[str] = None # Language code suffix, including the leading dash. + # String contents without the enclosing quotes + contents: typing.Optional[str] = attr.ib(validator=attr.validators.optional(attr.validators.instance_of(str)), default=None) + + # 2- or 3-character language code without suffix.
+ lang: typing.Optional[str] = attr.ib(validator=attr.validators.optional(attr.validators.instance_of(str)), default=None) + + # The language code suffix, including the leading dash. + suffix: typing.Optional[str] = attr.ib(validator=attr.validators.optional(attr.validators.instance_of(str)), default=None) # Offer the components of a number or quantity, after validating the item. - numberstr: typing.Optional[str] = None # Note: not converted to int or float - number: typing.Optional[typing.Union[int, float]] = None - low_tolerancestr: typing.Optional[str] = None # Note: not converted to int or float - high_tolerancestr: typing.Optional[str] = None # Note: not converted to int or float - si_units: typing.Optional[str] = None - wikidata_node: typing.Optional[str] = None + numberstr: typing.Optional[str] = attr.ib(validator=attr.validators.optional(attr.validators.instance_of(str)), default=None) + number: typing.Optional[typing.Union[int, float]] = attr.ib(validator=attr.validators.optional(attr.validators.instance_of((int, float))), default=None) + + low_tolerancestr: typing.Optional[str] = attr.ib(validator=attr.validators.optional(attr.validators.instance_of(str)), default=None) + + high_tolerancestr: typing.Optional[str] = attr.ib(validator=attr.validators.optional(attr.validators.instance_of(str)), default=None) + + si_units: typing.Optional[str] = attr.ib(validator=attr.validators.optional(attr.validators.instance_of(str)), default=None) + + wikidata_node: typing.Optional[str] = attr.ib(validator=attr.validators.optional(attr.validators.instance_of(str)), default=None) # Offer the components of a location coordinates, after validaating the item: - latstr: typing.Optional[str] = None - lat: typing.Optional[float] = None - lonstr: typing.Optional[str] = None - lon: typing.Optional[float] = None + latstr: typing.Optional[str] = attr.ib(validator=attr.validators.optional(attr.validators.instance_of(str)), default=None) + lat: typing.Optional[float] = attr.ib(validator=attr.validators.optional(attr.validators.instance_of(float)), default=None) + + lonstr: typing.Optional[str] = attr.ib(validator=attr.validators.optional(attr.validators.instance_of(str)), default=None) + lon: typing.Optional[float] = attr.ib(validator=attr.validators.optional(attr.validators.instance_of(float)), default=None) # Offer the components of a date and times, after validating the item: - yearstr: typing.Optional[str] = None # Note: before conversion to int - year: typing.Optional[int] = None - monthstr: typing.Optional[str] = None # Note: before conversion to int - month: typing.Optional[int] = None - daystr: typing.Optional[str] = None # Note: before conversion to int - day: typing.Optional[int] = None - hourstr: typing.Optional[str] = None # Note: before conversion to int or float - hour: typing.Optional[int] = None - minutesstr: typing.Optional[str] = None # Note: before conversion to int or float - minutes: typing.Optional[int] = None - secondsstr: typing.Optional[str] = None # Note: before conversion to int or float - seconds: typing.Optional[int] = None - zonestr: typing.Optional[str] = None # Z or [-+]HH or [-+]HHSS or [-+]HH:SS - precisionstr: typing.Optional[str] = None - iso8601extended: typing.Optional[bool] = None # True when hyphens/colons are present. 
+ yearstr: typing.Optional[str] = attr.ib(validator=attr.validators.optional(attr.validators.instance_of(str)), default=None) + year: typing.Optional[int] = attr.ib(validator=attr.validators.optional(attr.validators.instance_of(int)), default=None) + + monthstr: typing.Optional[str] = attr.ib(validator=attr.validators.optional(attr.validators.instance_of(str)), default=None) + month: typing.Optional[int] = attr.ib(validator=attr.validators.optional(attr.validators.instance_of(int)), default=None) + + daystr: typing.Optional[str] = attr.ib(validator=attr.validators.optional(attr.validators.instance_of(str)), default=None) + day: typing.Optional[int] = attr.ib(validator=attr.validators.optional(attr.validators.instance_of(int)), default=None) + + hourstr: typing.Optional[str] = attr.ib(validator=attr.validators.optional(attr.validators.instance_of(str)), default=None) + hour: typing.Optional[int] = attr.ib(validator=attr.validators.optional(attr.validators.instance_of(int)), default=None) + + minutesstr: typing.Optional[str] = attr.ib(validator=attr.validators.optional(attr.validators.instance_of(str)), default=None) + minutes: typing.Optional[int] = attr.ib(validator=attr.validators.optional(attr.validators.instance_of(int)), default=None) + + secondsstr: typing.Optional[str] = attr.ib(validator=attr.validators.optional(attr.validators.instance_of(str)), default=None) + seconds: typing.Optional[int] = attr.ib(validator=attr.validators.optional(attr.validators.instance_of(int)), default=None) + + # Z or [-+]HH or [-+]HHSS or [-+]HH:SS + zonestr: typing.Optional[str] = attr.ib(validator=attr.validators.optional(attr.validators.instance_of(str)), default=None) + + precisionstr: typing.Optional[str] = attr.ib(validator=attr.validators.optional(attr.validators.instance_of(str)), default=None) + + # True when hyphens/colons are present. 
+ iso8601extended: typing.Optional[bool] = attr.ib(validator=attr.validators.optional(attr.validators.instance_of(bool)), default=None) # Offer the contents of a boolean, after validating the item: - truth: typing.Optional[bool] = None + truth: typing.Optional[bool] = attr.ib(validator=attr.validators.optional(attr.validators.instance_of(bool)), default=None) + + def to_map(self)->typing.Mapping[str, typing.Union[str, int, float, bool]]: + results: typing.MutableMapping[str, typing.Union[str, int, float, bool]] = { } + results["list_len"] = self.list_len + if self.data_type is not None: + results["data_type"] = self.data_type.name + if self.valid is not None: + results["valid"] = self.valid + if self.contents is not None: + results["contents"] = self.contents + if self.lang is not None: + results["lang"] = self.lang + if self.suffix is not None: + results["suffix"] = self.suffix + if self.numberstr is not None: + results["numberstr"] = self.numberstr + if self.number is not None: + results["number"] = self.number + if self.low_tolerancestr is not None: + results["low_tolerancestr"] = self.low_tolerancestr + if self.high_tolerancestr is not None: + results["high_tolerancestr"] = self.high_tolerancestr + if self.si_units is not None: + results["si_units"] = self.si_units + if self.wikidata_node is not None: + results["wikidata_node"] = self.wikidata_node + if self.latstr is not None: + results["latstr"] = self.latstr + if self.lat is not None: + results["lat"] = self.lat + if self.lonstr is not None: + results["lonstr"] = self.lonstr + if self.lon is not None: + results["lon"] = self.lon + if self.yearstr is not None: + results["yearstr"] = self.yearstr + if self.year is not None: + results["year"] = self.year + if self.monthstr is not None: + results["monthstr"] = self.monthstr + if self.month is not None: + results["month"] = self.month + if self.daystr is not None: + results["daystr"] = self.daystr + if self.day is not None: + results["day"] = self.day + if self.hourstr is not None: + results["hourstr"] = self.hourstr + if self.hour is not None: + results["hour"] = self.hour + if self.minutesstr is not None: + results["minutesstr"] = self.minutesstr + if self.minutes is not None: + results["minutes"] = self.minutes + if self.secondsstr is not None: + results["secondsstr"] = self.secondsstr + if self.seconds is not None: + results["seconds"] = self.seconds + if self.zonestr is not None: + results["zonestr"] = self.zonestr + if self.precisionstr is not None: + results["precisionstr"] = self.precisionstr + if self.iso8601extended is not None: + results["iso8601extended"] = self.iso8601extended + if self.truth is not None: + results["truth"] = self.truth + return results + +@attr.s(slots=True, frozen=False) +class KgtkValue(KgtkFormat): + value: str = attr.ib(validator=attr.validators.instance_of(str)) + options: KgtkValueOptions = attr.ib(validator=attr.validators.instance_of(KgtkValueOptions), default=DEFAULT_KGTK_VALUE_OPTIONS) + parse_fields: bool = attr.ib(validator=attr.validators.instance_of(bool), default=False) + + # The current fields when available: + # fields: typing.Optional[KgtkValueFields] = attr.ib(attr.validators.instance_of(KgtkValueFields), default=None, init=False) + fields: typing.Optional[KgtkValueFields] = attr.ib(default=None, init=False) + + # TODO: proper validation. + parent: typing.Optional['KgtkValue'] = attr.ib(default=None) + + # Cache some properties of the value that would be expensive to + # continuously recompute. 
+ data_type: typing.Optional[KgtkFormat.DataType] = None + valid: typing.Optional[bool] = None + + # If this is a list, cache a KgtkValue object for each item of the list. + # + # Note: Please do not access this list directly. Use get_list_items(). + list_items: typing.Optional[typing.List['KgtkValue']] = None def is_valid(self)->bool: # Is this a valid whatever it is? @@ -101,6 +202,8 @@ def is_empty(self, validate: bool = False)->bool: # We are certain that this is an empty value. We can be certain it is valid. self.data_type = KgtkFormat.DataType.EMPTY self.valid = True + if self.parse_fields: + self.fields = KgtkValueFields(data_type=self.data_type, valid=self.valid) return True split_list_re: typing.Pattern = re.compile(r"(?bool: if self.valid is not None: return self.valid + # We will save the list length even if invalid. + if self.parse_fields: + self.fields = KgtkValueFields(data_type=KgtkFormat.DataType.LIST, + valid=False, + list_len=len(self.get_list_items())) + # Validate the list. item: 'KgtkValue' for item in self.get_list_items(): @@ -156,6 +265,10 @@ def is_list(self, validate: bool = False)->bool: # This is a valid list. self.valid = True + if self.parse_fields: + self.fields = KgtkValueFields(data_type=KgtkFormat.DataType.LIST, + valid=self.valid, + list_len=len(self.get_list_items())) return True def rebuild_list(self): @@ -169,7 +282,6 @@ def rebuild_list(self): for item in list_items: values.append(item.value) self.value = KgtkFormat.LIST_SEPARATOR.join(values) - def _is_number_or_quantity(self)->bool: return self.value.startswith(("0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "+", "-", ".")) @@ -275,21 +387,8 @@ def is_number_or_quantity(self, validate: bool=False)->bool: elif self.data_type == KgtkFormat.DataType.QUANTITY: return self.is_quantity(validate=validate) else: - # Clear the number or quantity components: - self.numberstr = None - self.low_tolerancestr = None - self.high_tolerancestr = None - self.si_units = None - self.wikidata_node = None return False # Not a number or quantity. - # Clear the number or quantity components: - self.numberstr = None - self.low_tolerancestr = None - self.high_tolerancestr = None - self.si_units = None - self.wikidata_node = None - if not self._is_number_or_quantity(): return False @@ -304,24 +403,25 @@ def is_number_or_quantity(self, validate: bool=False)->bool: return False # Extract the number or quantity components: - self.numberstr = m.group("number") - self.low_tolerancestr = m.group("low_tolerance") - self.high_tolerancestr = m.group("high_tolerance") - self.si_units = m.group("si_units") - self.wikidata_node = m.group("wikidata_node") + numberstr: str = m.group("number") + low_tolerancestr: str = m.group("low_tolerance") + high_tolerancestr: str = m.group("high_tolerance") + si_units: str = m.group("si_units") + wikidata_node: str = m.group("wikidata_node") # For convenience, convert the numeric part to int or float: # # TODO: go to this extra work only when requested? - if self.numberstr is None: + if numberstr is None: raise ValueError("Missing numeric part") - n: str = self.numberstr.lower() + n: str = numberstr.lower() + number: typing.Union[float, int] if "." 
in n or ("e" in n and not n.startswith("0x")): - self.number = float(n) + number = float(n) else: - self.number = int(n) + number = int(n) - if self.low_tolerancestr is not None or self.high_tolerancestr is not None or self.si_units is not None or self.wikidata_node is not None: + if low_tolerancestr is not None or high_tolerancestr is not None or si_units is not None or wikidata_node is not None: # We can be certain that this is a quantity. self.data_type = KgtkFormat.DataType.QUANTITY else: @@ -329,6 +429,15 @@ def is_number_or_quantity(self, validate: bool=False)->bool: self.data_type = KgtkFormat.DataType.NUMBER self.valid = True + if self.parse_fields: + self.fields = KgtkValueFields(data_type=self.data_type, + valid=self.valid, + numberstr=numberstr, + number=number, + low_tolerancestr=low_tolerancestr, + high_tolerancestr=high_tolerancestr, + si_units=si_units, + wikidata_node=wikidata_node) return True def is_number(self, validate: bool=False)->bool: @@ -373,22 +482,28 @@ def is_number(self, validate: bool=False)->bool: return False # Extract the number components: - self.numberstr = m.group("number") + numberstr: str = m.group("number") # For convenience, convert the numeric part to int or float: # # TODO: go to this extra work only when requested? - if self.numberstr is None: + if numberstr is None: raise ValueError("Missing numeric part") - n: str = self.numberstr.lower() + n: str = numberstr.lower() + number: typing.Union[float, int] if "." in n or ("e" in n and not n.startswith("0x")): - self.number = float(n) + number = float(n) else: - self.number = int(n) + number = int(n) # Now we can be certain that this is a number. self.data_type = KgtkFormat.DataType.NUMBER self.valid = True + if self.parse_fields: + self.fields = KgtkValueFields(data_type=self.data_type, + valid=self.valid, + numberstr=numberstr, + number=number) return True @@ -399,12 +514,6 @@ def is_quantity(self, validate: bool=False)->bool: """ if self.data_type is not None: if self.data_type != KgtkFormat.DataType.QUANTITY: - # Clear the quantity components: - self.numberstr = None - self.low_tolerancestr = None - self.high_tolerancestr = None - self.si_units = None - self.wikidata_node = None return False if not validate: @@ -412,13 +521,6 @@ def is_quantity(self, validate: bool=False)->bool: if self.valid is not None: return self.valid - # Clear the quantity components: - self.numberstr = None - self.low_tolerancestr = None - self.high_tolerancestr = None - self.si_units = None - self.wikidata_node = None - if not self._is_number_or_quantity(): return False # We don't know yet if this is a quantity. It could be a number. @@ -428,36 +530,51 @@ def is_quantity(self, validate: bool=False)->bool: return False # Extract the quantity components: - self.numberstr = m.group("number") - self.low_tolerancestr = m.group("low_tolerance") - self.high_tolerancestr = m.group("high_tolerance") - self.si_units = m.group("si_units") - self.wikidata_node = m.group("wikidata_node") + numberstr:str = m.group("number") + low_tolerancestr:str = m.group("low_tolerance") + high_tolerancestr:str = m.group("high_tolerance") + si_units:str = m.group("si_units") + wikidata_node:str = m.group("wikidata_node") # For convenience, convert the numeric part to int or float: # # TODO: go to this extra work only when requested? - if self.numberstr is None: + if numberstr is None: raise ValueError("Missing numeric part") - n: str = self.numberstr.lower() + n: str = numberstr.lower() + number: typing.Union[float, int] if "." 
in n or ("e" in n and not n.startswith("0x")): - self.number = float(n) + number = float(n) else: - self.number = int(n) + number = int(n) - if self.low_tolerancestr is None and self.high_tolerancestr is None and self.si_units is None and self.wikidata_node is None: + if low_tolerancestr is None and high_tolerancestr is None and si_units is None and wikidata_node is None: # This is a number, not a quantity self.data_type = KgtkFormat.DataType.NUMBER self.valid = True + if self.parse_fields: + self.fields = KgtkValueFields(data_type=self.data_type, + valid=self.valid, + numberstr=numberstr, + number=number) return False # Now we can be certain that this is a quantity. self.data_type = KgtkFormat.DataType.QUANTITY self.valid = True + if self.parse_fields: + self.fields = KgtkValueFields(data_type=self.data_type, + valid=self.valid, + numberstr=numberstr, + number=number, + low_tolerancestr=low_tolerancestr, + high_tolerancestr=high_tolerancestr, + si_units=si_units, + wikidata_node=wikidata_node) return True lax_string_re: typing.Pattern = re.compile(r'^"(?P.*)"$') - strict_string_re: typing.Pattern = re.compile(r'^"(?P(?:[^"\\]|\\.)*"$)') + strict_string_re: typing.Pattern = re.compile(r'^"(?P(?:[^"\\]|\\.)*)"$') def is_string(self, validate: bool = False)->bool: """ @@ -470,15 +587,11 @@ def is_string(self, validate: bool = False)->bool: """ if self.data_type is None: if not self.value.startswith('"'): - # Clear the string components: - self.contents = None return False # We are certain this is a string. We don't yet know if it is valid. self.data_type = KgtkFormat.DataType.STRING else: if self.data_type != KgtkFormat.DataType.STRING: - # Clear the string components: - self.contents = None return False if not validate: @@ -486,9 +599,6 @@ def is_string(self, validate: bool = False)->bool: if self.valid is not None: return self.valid - # Clear the string components: - self.contents = None - # Validate the string: m: typing.Optional[typing.Match] if self.options.allow_lax_strings: @@ -498,11 +608,12 @@ def is_string(self, validate: bool = False)->bool: if m is None: return False - # Extract the contents components: - self.contents = m.group("contents") - # We are certain that this is a valid string. self.valid = True + if self.parse_fields: + self.fields = KgtkValueFields(data_type=KgtkFormat.DataType.STRING, + valid=self.valid, + contents=m.group("contents")) return True def is_structured_literal(self)->bool: @@ -527,30 +638,31 @@ def is_symbol(self, validate: bool = False)->bool: # We are certain this is a symbol. We assume that it is valid. self.data_type = KgtkFormat.DataType.SYMBOL self.valid = True + if self.parse_fields: + self.fields = KgtkValueFields(data_type=self.data_type, + valid=self.valid) return True def is_boolean(self, validate: bool = False)->bool: """ Return True if the value matches one of the special boolean symbols. - The validate parameter is ignored. + The validate parameter is ignored, we always validate. """ if self.data_type is not None: - if self.data_type != KgtkFormat.DataType.BOOLEAN: - self.truth = None - return False - self.truth = self.value == KgtkFormat.TRUE_SYMBOL - return True + return self.data_type == KgtkFormat.DataType.BOOLEAN # Is this a boolean? if self.value != KgtkFormat.TRUE_SYMBOL and self.value != KgtkFormat.FALSE_SYMBOL: - self.truth = None return False # We are certain this is a valid boolean. 
self.data_type = KgtkFormat.DataType.BOOLEAN self.valid = True - self.truth = self.value == KgtkFormat.TRUE_SYMBOL + if self.parse_fields: + self.fields = KgtkValueFields(data_type=self.data_type, + valid=self.valid, + truth=self.value == KgtkFormat.TRUE_SYMBOL) return True # Support two or three character language codes. Suports hyphenated codes @@ -564,19 +676,11 @@ def is_language_qualified_string(self, validate: bool=False)->bool: """ if self.data_type is None: if not self.value.startswith("'"): - # Clear the cached components of the language qualified string: - self.contents = None - self.lang = None - self.suffix = None return False # We are certain that this is a language qualified string, although we haven't checked validity. self.data_type = KgtkFormat.DataType.LANGUAGE_QUALIFIED_STRING else: if self.data_type != KgtkFormat.DataType.LANGUAGE_QUALIFIED_STRING: - # Clear the cached components of the language qualified string: - self.contents = None - self.lang = None - self.suffix = None return False if not validate: @@ -584,11 +688,6 @@ def is_language_qualified_string(self, validate: bool=False)->bool: if self.valid is not None: return self.valid - # Clear the cached components of the language qualified string: - self.contents = None - self.lang = None - self.suffix = None - # Validate the language qualified string. # print("checking %s" % self.value) m: typing.Optional[typing.Match] @@ -600,22 +699,23 @@ def is_language_qualified_string(self, validate: bool=False)->bool: # print("match failed for %s" % self.value) return False - # Extract the contents, lang, and optional suffix components: - self.contents = m.group("contents") - self.lang = m.group("lang") - self.suffix = m.group("suffix") - # Extract the combined lang and suffix for use by the LanguageValidator. - lang_suffix: str = m.group("lang_suffix") - # print("lang: %s" % lang_suffix) + lang_and_suffix: str = m.group("lang_suffix") + # print("lang_and_suffix: %s" % lang_and_suffix) # Validate the language code: - if not LanguageValidator.validate(lang_suffix.lower(), options=self.options): + if not LanguageValidator.validate(lang_and_suffix.lower(), options=self.options): # print("language validation failed for %s" % self.value) return False # We are certain that this is a valid language qualified string. self.valid = True + if self.parse_fields: + self.fields = KgtkValueFields(data_type=KgtkFormat.DataType.LANGUAGE_QUALIFIED_STRING, + valid=self.valid, + contents=m.group("contents"), + lang=m.group("lang"), + suffix=m.group("suffix")) return True #location_coordinates_re: typing.Pattern = re.compile(r"^@(?P[-+]?\d{3}\.\d{5})/(?P[-+]?\d{3}\.\d{5})$") @@ -631,19 +731,11 @@ def is_location_coordinates(self, validate: bool=False)->bool: """ if self.data_type is None: if not self.value.startswith("@"): - self.latstr = None - self.lat = None - self.lonstr = None - self.lon = None return False # We are certain that this is location coordinates, although we haven't checked validity. 
self.data_type = KgtkFormat.DataType.LOCATION_COORDINATES else: if self.data_type != KgtkFormat.DataType.LOCATION_COORDINATES: - self.latstr = None - self.lat = None - self.lonstr = None - self.lon = None return False if not validate: @@ -651,40 +743,39 @@ def is_location_coordinates(self, validate: bool=False)->bool: if self.valid is not None: return self.valid - # Clear the lat/lon components: - self.latstr = None - self.lat = None - self.lonstr = None - self.lon = None - # Validate the location coordinates: m: typing.Optional[typing.Match] = KgtkValue.location_coordinates_re.match(self.value) if m is None: return False latstr: str = m.group("lat") - self.latstr = latstr lonstr: str = m.group("lon") - self.lonstr = lonstr # Latitude normally runs from -90 to +90: try: - self.lat = float(latstr) - if self.lat < self.options.minimum_valid_lat or self.lat > self.options.maximum_valid_lat: + lat: float = float(latstr) + if lat < self.options.minimum_valid_lat or lat > self.options.maximum_valid_lat: return False except ValueError: return False # Longitude normally runs from -180 to +180: try: - self.lon = float(lonstr) - if self.lon < self.options.minimum_valid_lon or self.lon > self.options.maximum_valid_lon: + lon: float = float(lonstr) + if lon < self.options.minimum_valid_lon or lon > self.options.maximum_valid_lon: return False except ValueError: return False # We are certain that this is valid. self.valid = True + if self.parse_fields: + self.fields = KgtkValueFields(data_type=KgtkFormat.DataType.LOCATION_COORDINATES, + valid=self.valid, + latstr=latstr, + lat=lat, + lonstr=lonstr, + lon=lon) return True # https://en.wikipedia.org/wiki/ISO_8601 @@ -780,43 +871,11 @@ def is_date_and_times(self, validate: bool=False)->bool: """ if self.data_type is None: if not self.value.startswith("^"): - # Clear the cached date and times components: - self.yearstr = None - self.monthstr = None - self.daystr = None - self.hourstr = None - self.minutesstr = None - self.secondsstr = None - self.year = None - self.month = None - self.day = None - self.hour = None - self.minutes = None - self.seconds = None - self.zonestr = None - self.precisionstr = None - self.iso8601extended = None return False # We are certain that this is location coordinates, although we haven't checked validity. 
self.data_type = KgtkFormat.DataType.DATE_AND_TIMES else: if self.data_type != KgtkFormat.DataType.DATE_AND_TIMES: - # Clear the cached date and times components: - self.yearstr = None - self.monthstr = None - self.daystr = None - self.hourstr = None - self.minutesstr = None - self.secondsstr = None - self.year = None - self.month = None - self.day = None - self.hour = None - self.minutes = None - self.seconds = None - self.zonestr = None - self.precisionstr = None - self.iso8601extended = None return False if not validate: @@ -825,105 +884,109 @@ def is_date_and_times(self, validate: bool=False)->bool: return self.valid # Clear the cached date and times components: - self.yearstr = None - self.monthstr = None - self.daystr = None - self.hourstr = None - self.minutesstr = None - self.secondsstr = None - self.year = None - self.month = None - self.day = None - self.hour = None - self.minutes = None - self.seconds = None - self.zonestr = None - self.precisionstr = None - self.iso8601extended = None # Validate the date and times: m: typing.Optional[typing.Match] = KgtkValue.lax_date_and_times_re.match(self.value) if m is None: return False - self.yearstr = m.group("year") - self.monthstr = m.group("month") - self.daystr = m.group("day") - self.hourstr = m.group("hour") - self.minutesstr = m.group("minutes") - self.secondsstr = m.group("seconds") - self.zonestr = m.group("zone") - self.precisionstr = m.group("precision") - self.iso8601extended = m.group("hyphen") is not None + yearstr: str = m.group("year") + monthstr: str = m.group("month") + daystr: str = m.group("day") + hourstr: str = m.group("hour") + minutesstr: str = m.group("minutes") + secondsstr: str = m.group("seconds") + zonestr: str = m.group("zone") + precisionstr: str = m.group("precision") + iso8601extended: bool = m.group("hyphen") is not None fixup_needed: bool = False # Validate the year: - if self.yearstr is None or len(self.yearstr) == 0: + if yearstr is None or len(yearstr) == 0: return False # Years are mandatory try: - self.year: int = int(self.yearstr) + year: int = int(yearstr) except ValueError: return False - if self.year < self.options.minimum_valid_year: + if year < self.options.minimum_valid_year: return False - if self.year > self.options.maximum_valid_year: + if year > self.options.maximum_valid_year: return False - if self.monthstr is not None: + if monthstr is not None: try: - self.month: int = int(self.monthstr) + month: int = int(monthstr) except ValueError: return False # shouldn't happen - if self.month == 0: + if month == 0: if self.options.repair_month_or_day_zero: - self.month = 1 - self.monthstr = "01" + month = 1 + monthstr = "01" fixup_needed = True elif not self.options.allow_month_or_day_zero: return False # month 0 was disallowed. - if self.daystr is not None: + if daystr is not None: try: - self.day: int = int(self.daystr) + day: int = int(daystr) except ValueError: return False # shouldn't happen - if self.day == 0: + if day == 0: if self.options.repair_month_or_day_zero: - self.day = 1 - self.daystr = "01" + day = 1 + daystr = "01" fixup_needed = True elif not self.options.allow_month_or_day_zero: return False # day 0 was disallowed. 
# Convert the time fields to ints: - if self.hourstr is not None: + if hourstr is not None: try: - self.hour: int = int(self.hourstr) + hour: int = int(hourstr) except ValueError: return False # shouldn't happen - if self.minutesstr is not None: + if minutesstr is not None: try: - self.minutes: int = int(self.minutesstr) + minutes: int = int(minutesstr) except ValueError: return False # shouldn't happen - if self.secondsstr is not None: + if secondsstr is not None: try: - self.seconds: int = int(self.secondsstr) + seconds: int = int(secondsstr) except ValueError: return False # shouldn't happen if fixup_needed: - # Rapair a month or day zero problem. If this value is the child - #of a list, repair the list parent value, too. + # Repair a month or day zero problem. If this value is the child + # of a list, repair the list parent value, too. self.update_date_and_times() if self.parent is not None: self.parent.rebuild_list() # We are fairly certain that this is a valid date and times. self.valid = True + if self.parse_fields: + self.fields = KgtkValueFields(data_type=KgtkFormat.DataType.DATE_AND_TIMES, + valid=self.valid, + yearstr=yearstr, + monthstr=monthstr, + daystr=daystr, + hourstr=hourstr, + minutesstr=minutesstr, + secondsstr=secondsstr, + year=year, + month=month, + day=day, + hour=hour, + minutes=minutes, + seconds=seconds, + zonestr=zonestr, + precisionstr=precisionstr, + iso8601extended=iso8601extended, + ) return True def update_date_and_times(self): @@ -1019,6 +1082,7 @@ def reclassify(self)->KgtkFormat.DataType: # Classify this KgtkValue into a KgtkDataType, ignoring any cached data_type. self.data_type = None self.valid = None + self.fields = None return self.classify() def validate(self)->bool: @@ -1030,6 +1094,9 @@ def validate(self)->bool: # If the valid flag has already been cached, return that. if self.valid is not None: return self.valid + + # Clear any fields from prior validation: + self.fields = None # Validate the value. 
if dt == KgtkFormat.DataType.EMPTY: @@ -1062,6 +1129,7 @@ def revalidate(self, reclassify: bool=False)->bool: if reclassify: self.data_type = None self.valid = None + self.fields = None return self.validate() def describe(self)->str: @@ -1089,7 +1157,7 @@ def describe(self)->str: elif dt == KgtkFormat.DataType.STRING: return "String" if self.is_string(validate=True) else "Invalid String" elif dt == KgtkFormat.DataType.LANGUAGE_QUALIFIED_STRING: - return "Language Qualified String (%s)" % self.lang if self.is_language_qualified_string(validate=True) else "Invalid Language Qualified String" + return "Language Qualified String" if self.is_language_qualified_string(validate=True) else "Invalid Language Qualified String" elif dt == KgtkFormat.DataType.LOCATION_COORDINATES: return "Location Coordinates" if self.is_location_coordinates(validate=True) else "Invalid Location Coordinates" elif dt == KgtkFormat.DataType.DATE_AND_TIMES: @@ -1103,79 +1171,19 @@ def describe(self)->str: else: return "Unknown" - def get_fields(self)->typing.Mapping[str, typing.Union[str, int, float, bool]]: - results: typing.MutableMapping[str, typing.Union[str, int, float, bool]] = { } - if self.data_type is not None: - results["data_type"] = str(self.data_type) - if self.valid is not None: - results["valid"] = self.valid - if self.contents is not None: - results["contents"] = self.contents - if self.lang is not None: - results["lang"] = self.lang - if self.suffix is not None: - results["suffix"] = self.suffix - if self.numberstr is not None: - results["numberstr"] = self.numberstr - if self.number is not None: - results["number"] = self.number - if self.low_tolerancestr is not None: - results["low_tolerancestr"] = self.low_tolerancestr - if self.high_tolerancestr is not None: - results["high_tolerancestr"] = self.high_tolerancestr - if self.si_units is not None: - results["si_units"] = self.si_units - if self.wikidata_node is not None: - results["wikidata_node"] = self.wikidata_node - if self.latstr is not None: - results["latstr"] = self.latstr - if self.lat is not None: - results["lat"] = self.lat - if self.lonstr is not None: - results["lonstr"] = self.lonstr - if self.lon is not None: - results["lon"] = self.lon - if self.yearstr is not None: - results["yearstr"] = self.yearstr - if self.year is not None: - results["year"] = self.year - if self.monthstr is not None: - results["monthstr"] = self.monthstr - if self.month is not None: - results["month"] = self.month - if self.daystr is not None: - results["daystr"] = self.daystr - if self.day is not None: - results["day"] = self.day - if self.hourstr is not None: - results["hourstr"] = self.hourstr - if self.hour is not None: - results["hour"] = self.hour - if self.minutesstr is not None: - results["minutesstr"] = self.minutesstr - if self.minutes is not None: - results["minutes"] = self.minutes - if self.secondsstr is not None: - results["secondsstr"] = self.secondsstr - if self.seconds is not None: - results["seconds"] = self.seconds - if self.zonestr is not None: - results["zonestr"] = self.zonestr - if self.precisionstr is not None: - results["precisionstr"] = self.precisionstr - if self.iso8601extended is not None: - results["iso8601extended"] = self.iso8601extended - list_items: typing.List[KgtkValue] = self.get_list_items() - if len(list_items) > 0: - results["list_len"] = len(list_items) - return results - + def get_field_map(self)->typing.Mapping[str, typing.Union[str, int, float, bool]]: + if self.fields is None: + return { } + else: + return 
self.fields.to_map()
+
 def main():
     """
     Test the KGTK value parser.
     """
     parser: ArgumentParser = ArgumentParser()
     parser.add_argument(dest="values", help="The values(s) to test", type=str, nargs="+")
+    parser.add_argument("-p", "--parse-fields", dest="parse_fields", help="Parse each value into fields.", action='store_true')
     parser.add_argument("-v", "--verbose", dest="verbose", help="Print additional progress messages.", action='store_true')
     parser.add_argument(      "--very-verbose", dest="very_verbose", help="Print additional progress messages.", action='store_true')
     KgtkValueOptions.add_arguments(parser)
@@ -1186,7 +1194,7 @@ def main():
 
     value: str
     for value in args.values:
-        kv: KgtkValue = KgtkValue(value, options=value_options)
+        kv: KgtkValue = KgtkValue(value, options=value_options, parse_fields=args.parse_fields)
         kv.validate()
         if value == kv.value:
             print("%s: %s" % (value, kv.describe()), flush=True)
@@ -1194,7 +1202,7 @@ def main():
             print("%s => %s: %s" % (value, kv.value, kv.describe()), flush=True)
 
         if args.verbose:
-            fields = kv.get_fields()
+            fields: typing.Mapping[str, typing.Any] = kv.get_field_map()
             for key in sorted(fields.keys()):
                 print("%s: %s" % (key, str(fields[key])))
             list_items: typing.List[KgtkValue] = kv.get_list_items()
diff --git a/kgtk/value/kgtkvalueoptions.py b/kgtk/value/kgtkvalueoptions.py
index d37a99243..a51b16dc7 100644
--- a/kgtk/value/kgtkvalueoptions.py
+++ b/kgtk/value/kgtkvalueoptions.py
@@ -9,8 +9,9 @@
 @attr.s(slots=True, frozen=True)
 class KgtkValueOptions:
     """
-    These options will affect some aspects of value processing. They are in a
-    seperate class for efficiency.
+    These options control various aspects of value processing. They are in a
+    separate class for code isolation and efficiency.
+
     """
 
     # Allow month 00 or day 00 in dates? This isn't really allowed by ISO
From 44124f07c147b8d852a1023d827e15fbb50c2f29 Mon Sep 17 00:00:00 2001
From: Craig Milo Rogers
Date: Fri, 8 May 2020 16:07:02 -0700
Subject: [PATCH 129/278] Adapt to new KgtkValueOptions initialization.

---
 kgtk/cli/ifexists.py    | 22 ++--------------------
 kgtk/cli/ifnotexists.py | 22 ++--------------------
 kgtk/cli/validate.py    | 20 ++------------------
 3 files changed, 6 insertions(+), 58 deletions(-)

diff --git a/kgtk/cli/ifexists.py b/kgtk/cli/ifexists.py
index d72f320a6..d6c2a8b23 100644
--- a/kgtk/cli/ifexists.py
+++ b/kgtk/cli/ifexists.py
@@ -75,17 +75,7 @@ def run(input_kgtk_file: typing.Optional[Path],
         verbose: bool = False,
         very_verbose: bool = False,
 
-        # Arguments from KgtkValueOptions:
-        additional_language_codes: typing.Optional[typing.List[str]] = None,
-        allow_language_suffixes: bool = False,
-        allow_lax_strings: bool = False,
-        allow_lax_lq_strings: bool = False,
-        allow_month_or_day_zero: bool = False,
-        repair_month_or_day_zero: bool = False,
-        minimum_valid_year: int = KgtkValueOptions.MINIMUM_VALID_YEAR,
-        maximum_valid_year: int = KgtkValueOptions.MAXIMUM_VALID_YEAR,
-        escape_list_separators: bool = False,
-
+        **kwargs # Whatever KgtkValueOptions wants.
 )->int:
     # import modules locally
     from kgtk.exceptions import KGTKException
@@ -99,15 +89,7 @@ def run(input_kgtk_file: typing.Optional[Path],
     error_file: typing.TextIO = sys.stderr if errors_to_stderr else sys.stdout
 
     # Build the value parsing option structure.
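
The refactoring pattern adopted in this patch is worth spelling out. Below is a minimal sketch under hypothetical names (the real classes live in kgtk/value/kgtkvalueoptions.py): each command's run() stops enumerating every value option and instead forwards **kwargs to a from_dict() factory, so a new option only has to be added in one place.

import typing

class MiniValueOptions:
    def __init__(self, allow_month_or_day_zero: bool = False, minimum_valid_year: int = 1583):
        self.allow_month_or_day_zero = allow_month_or_day_zero
        self.minimum_valid_year = minimum_valid_year

    @classmethod
    def from_dict(cls, d: typing.Mapping[str, typing.Any]) -> "MiniValueOptions":
        # Pick out only the keys this class understands and ignore the rest.
        return cls(allow_month_or_day_zero=d.get("allow_month_or_day_zero", False),
                   minimum_valid_year=d.get("minimum_valid_year", 1583))

def run(input_file: str, verbose: bool = False, **kwargs) -> int:
    # Whatever MiniValueOptions wants rides along in kwargs.
    options = MiniValueOptions.from_dict(kwargs)
    return 0 if options.minimum_valid_year > 0 else 1

run("example.tsv", verbose=True, minimum_valid_year=1900)
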
- value_options: KgtkValueOptions = KgtkValueOptions(allow_month_or_day_zero=allow_month_or_day_zero, - repair_month_or_day_zero=repair_month_or_day_zero, - allow_lax_strings=allow_lax_strings, - allow_lax_lq_strings=allow_lax_lq_strings, - allow_language_suffixes=allow_language_suffixes, - additional_language_codes=additional_language_codes, - minimum_valid_year=minimum_valid_year, - maximum_valid_year=maximum_valid_year, - escape_list_separators=escape_list_separators) + value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs) try: ie: IfExists = IfExists(left_file_path=input_kgtk_file, diff --git a/kgtk/cli/ifnotexists.py b/kgtk/cli/ifnotexists.py index 69bbbff61..bd9f5a52b 100644 --- a/kgtk/cli/ifnotexists.py +++ b/kgtk/cli/ifnotexists.py @@ -76,17 +76,7 @@ def run(input_kgtk_file: typing.Optional[Path], verbose: bool = False, very_verbose: bool = False, - # Arguments from KgtkValueOptions: - additional_language_codes: typing.Optional[typing.List[str]] = None, - allow_language_suffixes: bool = False, - allow_lax_strings: bool = False, - allow_lax_lq_strings: bool = False, - allow_month_or_day_zero: bool = False, - repair_month_or_day_zero: bool = False, - minimum_valid_year: int = KgtkValueOptions.MINIMUM_VALID_YEAR, - maximum_valid_year: int = KgtkValueOptions.MAXIMUM_VALID_YEAR, - escape_list_separators: bool = False, - + **kwargs # Whatever KgtkValueOptions wants. )->int: # import modules locally from kgtk.exceptions import KGTKException @@ -100,15 +90,7 @@ def run(input_kgtk_file: typing.Optional[Path], error_file: typing.TextIO = sys.stderr if errors_to_stderr else sys.stdout # Build the value parsing option structure. - value_options: KgtkValueOptions = KgtkValueOptions(allow_month_or_day_zero=allow_month_or_day_zero, - repair_month_or_day_zero=repair_month_or_day_zero, - allow_lax_strings=allow_lax_strings, - allow_lax_lq_strings=allow_lax_lq_strings, - allow_language_suffixes=allow_language_suffixes, - additional_language_codes=additional_language_codes, - minimum_valid_year=minimum_valid_year, - maximum_valid_year=maximum_valid_year, - escape_list_separators=escape_list_separators) + value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs) try: ie: IfExists = IfExists(left_file_path=input_kgtk_file, diff --git a/kgtk/cli/validate.py b/kgtk/cli/validate.py index a674baac0..ccf442bc0 100644 --- a/kgtk/cli/validate.py +++ b/kgtk/cli/validate.py @@ -148,15 +148,6 @@ def run(kgtk_files: typing.Optional[typing.List[typing.Optional[Path]]], invalid_value_action: ValidationAction = ValidationAction.REPORT, header_error_action: ValidationAction = ValidationAction.EXIT, unsafe_column_name_action: ValidationAction = ValidationAction.REPORT, - additional_language_codes: typing.Optional[typing.List[str]] = None, - allow_language_suffixes: bool = False, - allow_lax_strings: bool = False, - allow_lax_lq_strings: bool = False, - allow_month_or_day_zero: bool = False, - repair_month_or_day_zero: bool = False, - escape_list_separators: bool = False, - minimum_valid_year: int = KgtkValueOptions.MINIMUM_VALID_YEAR, - maximum_valid_year: int = KgtkValueOptions.MAXIMUM_VALID_YEAR, compression_type: typing.Optional[str] = None, gzip_in_parallel: bool = False, gzip_queue_size: int = KgtkReader.GZIP_QUEUE_SIZE_DEFAULT, @@ -165,6 +156,7 @@ def run(kgtk_files: typing.Optional[typing.List[typing.Optional[Path]]], header_only: bool = False, verbose: bool = False, very_verbose: bool = False, + **kwargs # Whatever KgtkValueOptions wants. 
)->int:
     # import modules locally
     from kgtk.exceptions import KGTKException
@@ -176,15 +168,7 @@ def run(kgtk_files: typing.Optional[typing.List[typing.Optional[Path]]],
     error_file: typing.TextIO = sys.stderr if errors_to_stderr else sys.stdout
 
     # Build the value parsing option structure.
-    value_options: KgtkValueOptions = KgtkValueOptions(allow_month_or_day_zero=allow_month_or_day_zero,
-                                                       repair_month_or_day_zero=repair_month_or_day_zero,
-                                                       allow_lax_strings=allow_lax_strings,
-                                                       allow_lax_lq_strings=allow_lax_lq_strings,
-                                                       allow_language_suffixes=allow_language_suffixes,
-                                                       additional_language_codes=additional_language_codes,
-                                                       minimum_valid_year=minimum_valid_year,
-                                                       maximum_valid_year=maximum_valid_year,
-                                                       escape_list_separators=escape_list_separators)
+    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)
 
     try:
         kgtk_file: typing.Optional[Path]
From 03d5f85426a4bd5e1cb951215c830e25010bb504 Mon Sep 17 00:00:00 2001
From: ckxz105
Date: Fri, 8 May 2020 17:37:55 -0700
Subject: [PATCH 130/278] update embedding sentence generating algorithm - for
 property-values, move to isa-properties part

---
 kgtk/gt/embedding_utils.py | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/kgtk/gt/embedding_utils.py b/kgtk/gt/embedding_utils.py
index 429d37b47..505636bbf 100644
--- a/kgtk/gt/embedding_utils.py
+++ b/kgtk/gt/embedding_utils.py
@@ -460,16 +460,17 @@ def read_input(self, file_path: str, target_properties: dict, property_labels_di
 
                 if node_property in properties_reversed:
                     roles = properties_reversed[node_property]
+                    node_value = self.get_real_label_name(node_value)
+                    # if we get property_values, it should be saved to isa-properties part
                     if "property_values" in roles:
                         # for property values part, changed to be "{property} {value}"
-                        node_value = self.get_real_label_name(node_property) + " " + self.get_real_label_name(node_value)
-                    else:
-                        node_value = self.get_real_label_name(node_value)
+                        node_value_combine = self.get_real_label_name(node_property) + " " + self.get_real_label_name(node_value)
+                        each_node_attributes["isa_properties"].append(node_value_combine)
+                        # remove these 2 roles so we do not use this node a second time later
+                        roles.discard("property_values")
+                        roles.discard("has_properties")
                     for each_role in roles:
-                        if each_role == "property_values" and "has_properties" not in roles:
-                            each_node_attributes["has_properties"].append(node_value)
-                        else:
-                            each_node_attributes[each_role].append(node_value)
+                        each_node_attributes[each_role].append(node_value)
                 elif add_all_properties:  # add remained properties if need all properties
                     each_node_attributes["has_properties"].append(self.get_real_label_name(node_property))
From b9ef7079c35371642220a36a9c12a9cdd396929b Mon Sep 17 00:00:00 2001
From: ckxz105
Date: Fri, 8 May 2020 18:42:45 -0700
Subject: [PATCH 131/278] bug fix on property-value part

---
 kgtk/gt/embedding_utils.py | 25 +++++++++++++++++++------
 1 file changed, 19 insertions(+), 6 deletions(-)

diff --git a/kgtk/gt/embedding_utils.py b/kgtk/gt/embedding_utils.py
index 505636bbf..8a4b2b833 100644
--- a/kgtk/gt/embedding_utils.py
+++ b/kgtk/gt/embedding_utils.py
@@ -407,7 +407,7 @@ def read_input(self, file_path: str, target_properties: dict, property_labels_di
         self._logger.debug(str(column_references))
         # read contents
         each_node_attributes = {"has_properties": [], "isa_properties": [], "label_properties": [],
-                                "description_properties": []}
+                                "description_properties": [], "has_properties_values": []}
         current_process_node_id = None
         if
self._parallel_count > 1: @@ -454,18 +454,18 @@ def read_input(self, file_path: str, target_properties: dict, property_labels_di # after write down finish, we can clear and start parsing next one each_node_attributes = {"has_properties": [], "isa_properties": [], "label_properties": [], - "description_properties": []} + "description_properties": [], "has_properties_values": []} # update to new id current_process_node_id = node_id if node_property in properties_reversed: - roles = properties_reversed[node_property] + roles = properties_reversed[node_property].copy() node_value = self.get_real_label_name(node_value) # if we get property_values, it should be saved to isa-properties part if "property_values" in roles: # for property values part, changed to be "{property} {value}" node_value_combine = self.get_real_label_name(node_property) + " " + self.get_real_label_name(node_value) - each_node_attributes["isa_properties"].append(node_value_combine) + each_node_attributes["has_properties_values"].append(node_value_combine) # remove those 2 roles in case we have duplicate using of this node later roles.discard("property_values") roles.discard("has_properties") @@ -514,11 +514,21 @@ def attribute_to_sentence(self, attribute_dict: dict, node_id=None): each = each.replace("||", " ") temp += each + ", " if concated_sentence != "" and temp != "": - concated_sentence += " is a " + concated_sentence += " is " elif concated_sentence == "": - concated_sentence += "It is a " + concated_sentence += "It is " # remove last ", " concated_sentence += temp[:-2] + if "has_properties_values" in attribute_dict and len(attribute_dict["has_properties_values"]) > 0: + temp = [self.get_real_label_name(each) for each in attribute_dict["has_properties_values"]] + if concated_sentence != "": + if not have_isa_properties: + concated_sentence += " is " + else: + concated_sentence += ", " + else: + concated_sentence += "It is " + concated_sentence += " and ".join(temp) if "has_properties" in attribute_dict and len(attribute_dict["has_properties"]) > 0: temp = [self.get_real_label_name(each) for each in attribute_dict["has_properties"]] if concated_sentence != "" and temp[0] != "": @@ -529,6 +539,9 @@ def attribute_to_sentence(self, attribute_dict: dict, node_id=None): elif temp[0] != "": concated_sentence += "It has " concated_sentence += " and ".join(temp) + # add ending period + if concated_sentence != "": + concated_sentence += "." 
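
To make the new sentence template concrete, here is a hedged, self-contained miniature of the assembly logic above (hypothetical helper; the real method also folds in labels, descriptions, and "||"-separated values):

def mini_sentence(isa_properties, has_properties_values, has_properties):
    sentence = ""
    if isa_properties:
        sentence += "It is " + ", ".join(isa_properties)
    if has_properties_values:
        # property-value pairs read as "{property} {value}"
        sentence += (", " if sentence else "It is ") + " and ".join(has_properties_values)
    if has_properties:
        sentence += (" and has " if sentence else "It has ") + " and ".join(has_properties)
    if sentence:
        sentence += "."  # add the ending period
    return sentence

# mini_sentence(["human"], ["occupation singer"], ["spouse"])
# -> 'It is human, occupation singer and has spouse.'
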
self._logger.debug("Transform node {} --> {}".format(node_id, concated_sentence)) return concated_sentence From a75c95b7446c214c1d99e5a52f7e10020f780743 Mon Sep 17 00:00:00 2001 From: Naren Date: Sat, 9 May 2020 13:07:04 -0700 Subject: [PATCH 132/278] bug fix --- kgtk/cli/connected_components.py | 35 +++++++++++++++++++------------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/kgtk/cli/connected_components.py b/kgtk/cli/connected_components.py index 8b6f3e890..0df997fa4 100644 --- a/kgtk/cli/connected_components.py +++ b/kgtk/cli/connected_components.py @@ -15,20 +15,21 @@ def add_arguments(parser): Args: parser (argparse.ArgumentParser) """ - parser.add_argument('-i','--inp',action="store", type=str, dest="filename", metavar='filename', help='filename here') - parser.add_argument('-o', '--out', action='store', type=str, dest='output', help='File to output the nodes file with respective components') - parser.add_argument("--header", action="store",type=bool, dest="header_bool", help="Does the file contain a header in its first row",default=True) + parser.add_argument(action="store", type=str, dest="filename", metavar='filename', help='input filename here') + parser.add_argument('-o', '--out', action='store', type=str, dest='output', help='File to output the nodes file with respective components,if empty will be written out to standard output',default=None) + parser.add_argument("--header", action="store_true", dest="header_bool", help="Does the file contain a header in its first row") parser.add_argument("--subj", action="store", type=int, dest="sub", help='Column in which the subject is given, default 0', default=0) parser.add_argument("--obj", action="store", type=int, dest="obj", help='Column in which the subject is given, default 2', default=2) parser.add_argument("--props", action="store", type=str, dest="props",help='Properties to consider while finding connected components - comma-separated string, default all properties considered',default=None) - parser.add_argument('--directed', action='store',type=bool, dest="directed", help="Is the graph directed or not?",default=True) - parser.add_argument('--strong', action='store',type=bool, dest="strong", help="If graph is directed, strongly connected components or treat graph as undirected",default=False) + parser.add_argument('--directed', action='store_true', dest="directed", help="Is the graph directed or not?") + parser.add_argument('--strong', action='store_true', dest="strong", help="If graph is directed, strongly connected components or treat graph as undirected") -def run(filename,output,directed,header,sub,obj,props,strong): +def run(filename,output,header_bool,sub,obj,props,directed,strong): # import modules locally import csv + import sys from graph_tool import load_graph_from_csv from graph_tool.util import find_edge from graph_tool.topology import label_components @@ -36,7 +37,8 @@ def run(filename,output,directed,header,sub,obj,props,strong): from kgtk.cli_argparse import KGTKArgumentParser try: - g=load_graph_from_csv(filename,directed,skip_first=header,hashed=True,csv_options={'delimiter': '\t'},ecols=(sub,obj)) + header=['node1','label','node2'] + g=load_graph_from_csv(filename,directed,skip_first=header_bool,hashed=True,csv_options={'delimiter': '\t'},ecols=(sub,obj)) es=[] if props: properties=props.split(',') @@ -45,11 +47,16 @@ def run(filename,output,directed,header,sub,obj,props,strong): g.clear_edges() g.add_edge_list(list(set(es))) comp, hist= label_components(g,directed=strong) - 
f=open(output,'w') - wr = csv.writer(f, quoting=csv.QUOTE_NONE,delimiter="\t",escapechar="\n",quotechar='') - wr.writerow(['node','component']) - for v,c in enumerate(comp): - wr.writerow([g.vertex_properties['name'][v],c]) - f.close() + if output: + f=open(output,'w') + wr = csv.writer(f, quoting=csv.QUOTE_NONE,delimiter="\t",escapechar="\n",quotechar='') + wr.writerow(header) + for v,c in enumerate(comp): + wr.writerow([g.vertex_properties['name'][v],'connected_component',c]) + f.close() + else: + sys.stdout.write('%s\t%s\t%s\n' % ('node1', 'label', 'node2')) + for v,c in enumerate(comp): + sys.stdout.write('%s\t%s\t%s\n' % (g.vertex_properties['name'][v], 'connected_component', str(c))) except: - raise KGTKException + raise KGTKException \ No newline at end of file From d737e7c20dfc000226770402bb12b83b7e69265d Mon Sep 17 00:00:00 2001 From: Naren Date: Sun, 10 May 2020 21:25:56 -0700 Subject: [PATCH 133/278] Add reachability command and minor bug fixes --- kgtk/cli/connected_components.py | 22 +++++-- kgtk/cli/import_wikidata.py | 38 ++++++++--- kgtk/cli/reachable_nodes.py | 109 +++++++++++++++++++++++++++++++ 3 files changed, 153 insertions(+), 16 deletions(-) create mode 100644 kgtk/cli/reachable_nodes.py diff --git a/kgtk/cli/connected_components.py b/kgtk/cli/connected_components.py index 0df997fa4..630796254 100644 --- a/kgtk/cli/connected_components.py +++ b/kgtk/cli/connected_components.py @@ -16,17 +16,18 @@ def add_arguments(parser): parser (argparse.ArgumentParser) """ parser.add_argument(action="store", type=str, dest="filename", metavar='filename', help='input filename here') - parser.add_argument('-o', '--out', action='store', type=str, dest='output', help='File to output the nodes file with respective components,if empty will be written out to standard output',default=None) - parser.add_argument("--header", action="store_true", dest="header_bool", help="Does the file contain a header in its first row") + parser.add_argument('-o', '--out', action='store', type=str, dest='output', help='File to output the edge file with respective components,if empty will be written out to standard output',default=None) + parser.add_argument("--noheader", action="store_true", dest="header_bool", help="Option to specify that file does not have a header") parser.add_argument("--subj", action="store", type=int, dest="sub", help='Column in which the subject is given, default 0', default=0) parser.add_argument("--obj", action="store", type=int, dest="obj", help='Column in which the subject is given, default 2', default=2) + parser.add_argument("--pred",action="store" ,type=int, dest="pred",help='Column in which predicate is given, default 1',default=1) parser.add_argument("--props", action="store", type=str, dest="props",help='Properties to consider while finding connected components - comma-separated string, default all properties considered',default=None) - parser.add_argument('--directed', action='store_true', dest="directed", help="Is the graph directed or not?") + parser.add_argument('--undirected', action='store_true', dest="undirected", help="Option to specify graph as undirected?") parser.add_argument('--strong', action='store_true', dest="strong", help="If graph is directed, strongly connected components or treat graph as undirected") -def run(filename,output,header_bool,sub,obj,props,directed,strong): +def run(filename,output,header_bool,sub,obj,pred,props,undirected,strong): # import modules locally import csv import sys @@ -36,14 +37,23 @@ def 
run(filename,output,header_bool,sub,obj,props,directed,strong):
     from kgtk.exceptions import KGTKException
     from kgtk.cli_argparse import KGTKArgumentParser
+
+    def find_pred_position(sub,pred,obj):
+        if pred < sub and pred < obj:
+            return pred
+        elif (pred > sub and pred < obj) or (pred < sub and pred > obj):
+            return pred-1
+        else:
+            return pred-2
     try:
         header=['node1','label','node2']
-        g=load_graph_from_csv(filename,directed,skip_first=header_bool,hashed=True,csv_options={'delimiter': '\t'},ecols=(sub,obj))
+        label='c'+str(find_pred_position(sub,pred,obj))
+        g=load_graph_from_csv(filename,not(undirected),skip_first=not(header_bool),hashed=True,csv_options={'delimiter': '\t'},ecols=(sub,obj))
         es=[]
         if props:
             properties=props.split(',')
             for e in properties:
-                es+=(find_edge(g,g.edge_properties['c0'],e))
+                es+=(find_edge(g,g.edge_properties[label],e))
             g.clear_edges()
             g.add_edge_list(list(set(es)))
         comp, hist= label_components(g,directed=strong)
diff --git a/kgtk/cli/import_wikidata.py b/kgtk/cli/import_wikidata.py
index 0019fef32..aadb01493 100644
--- a/kgtk/cli/import_wikidata.py
+++ b/kgtk/cli/import_wikidata.py
@@ -155,8 +155,8 @@ def process(self,line,node_file,edge_file,qual_file,languages,doc_id):
 
         if self.parse_labels:
             labels = obj["labels"]
+            label_list=[]
             if labels:
-                label_list=[]
                 for lang in languages:
                     lang_label = labels.get(lang, None)
                     if lang_label:
@@ -171,8 +171,8 @@ def process(self,line,node_file,edge_file,qual_file,languages,doc_id):
 
         if self.parse_descr:
             descriptions = obj["descriptions"]
+            descr_list=[]
             if descriptions:
-                descr_list=[]
                 for lang in languages:
                     lang_descr = descriptions.get(lang, None)
                     if lang_descr:
@@ -186,8 +186,8 @@ def process(self,line,node_file,edge_file,qual_file,languages,doc_id):
 
         if self.parse_aliases:
             aliases = obj["aliases"]
+            alias_list = []
             if aliases:
-                alias_list = []
                 for lang in languages:
                     lang_aliases = aliases.get(lang, None)
                     if lang_aliases:
@@ -235,6 +235,8 @@ def process(self,line,node_file,edge_file,qual_file,languages,doc_id):
                         value = ''
                         mag = ''
                         unit = ''
+                        date=''
+                        item=''
                         lower = ''
                         upper = ''
                         precision = ''
@@ -245,6 +247,7 @@ def process(self,line,node_file,edge_file,qual_file,languages,doc_id):
                             if typ.startswith('wikibase'):
                                 enttype = val.get('entity-type')
                                 value = val.get('id', '')
+                                item=value
                             elif typ == 'quantity':
                                 value = val['amount']
                                 mag = val['amount']
@@ -267,11 +270,15 @@ def process(self,line,node_file,edge_file,qual_file,languages,doc_id):
                                     precision = val.get('precision', '')
                                 value = '@' + lat + '/' + long
                             elif typ == 'time':
-                                mag = "^" + val['time'][1:]
+                                if val['time'][0]=='-':
+                                    pre="^-"
+                                else:
+                                    pre="^"
+                                date = pre + val['time'][1:]
                                 precision = str(val['precision'])
                                 calendar = val.get(
                                     'calendarmodel', '').split('/')[-1]
-                                value = "^" + \
+                                value = pre + \
                                     val['time'][1:] + '/' + str(val['precision'])
                             elif typ == 'monolingualtext':
                                 value = '\'' + \
@@ -286,6 +293,8 @@ def process(self,line,node_file,edge_file,qual_file,languages,doc_id):
                                 rank,
                                 mag,
                                 unit,
+                                date,
+                                item,
                                 lower,
                                 upper,
                                 lat,
@@ -304,6 +313,8 @@ def process(self,line,node_file,edge_file,qual_file,languages,doc_id):
                                     value = ''
                                     mag = ''
                                     unit = ''
+                                    date= ''
+                                    item=''
                                     lower = ''
                                     upper = ''
                                     precision = ''
@@ -323,6 +334,7 @@ def process(self,line,node_file,edge_file,qual_file,languages,doc_id):
                                             'entity-type')
                                         value = val.get(
                                             'id', '')
+                                        item=value
                                     elif typ == 'quantity':
                                         value = val['amount']
                                         mag = val['amount']
@@ -351,13 +363,17 @@ def process(self,line,node_file,edge_file,qual_file,languages,doc_id):
                                             'precision', '')
                                         value = '@' + lat + '/' + long
                                     elif typ ==
'time': - mag = "^" + \ + if val['time'][0]=='-': + pre="^-" + else: + pre="^" + date = pre + \ val['time'][1:] precision = str( val['precision']) calendar = val.get( 'calendarmodel', '').split('/')[-1] - value = "^" + \ + value = pre + \ val['time'][1:] + '/' + str(val['precision']) elif typ == 'monolingualtext': value = '\'' + \ @@ -372,6 +388,8 @@ def process(self,line,node_file,edge_file,qual_file,languages,doc_id): value, mag, unit, + date, + item, lower, upper, lat, @@ -390,11 +408,11 @@ def process(self,line,node_file,edge_file,qual_file,languages,doc_id): sitelang=link.split('wiki')[0].replace('_','-') sitelink='http://'+sitelang+'.wikipedia.org/wiki/'+sitetitle if edge_file: - erows.append([sid, qnode, 'wikipedia_sitelink', sitelink,'','','','','', + erows.append([sid, qnode, 'wikipedia_sitelink', sitelink,'','','','','','','', '','','','','']) if qual_file: tempid=sid+'-language-1' - qrows.append([tempid,sid,'language',sitelang,'','','','','','','','','']) + qrows.append([tempid,sid,'language',sitelang,'','','','','','','','','','','']) if node_file: with open(node_file+'_{}'.format(self._idx), write_mode, newline='') as myfile: @@ -442,7 +460,7 @@ def process(self,line,node_file,edge_file,qual_file,languages,doc_id): escapechar="\n", quotechar='') wr.writerow(header) - header = ['id','node1','label','node2','rank','node2;magnitude','node2;unit','node2;lower','node2;upper', + header = ['id','node1','label','node2','rank','node2;magnitude','node2;unit','node2;date','node2;item','node2;lower','node2;upper', 'node2;latitude','node2;longitude','node2;precision','node2;calendar','node2;entity-type'] if edge_file: with open(edge_file+'_header', 'w', newline='') as myfile: diff --git a/kgtk/cli/reachable_nodes.py b/kgtk/cli/reachable_nodes.py new file mode 100644 index 000000000..bfa071574 --- /dev/null +++ b/kgtk/cli/reachable_nodes.py @@ -0,0 +1,109 @@ +""" +Find reachable nodes given a set of root nodes and properties +""" + + +def parser(): + return { + 'help': 'Find reachable nodes in a graph.' 
+    }
+
+
+def add_arguments(parser):
+    """
+    Parse arguments
+    Args:
+        parser (argparse.ArgumentParser)
+    """
+    parser.add_argument(action="store", type=str, dest="filename", metavar='filename', help='input filename here')
+    parser.add_argument('--root',action='store',dest='root',help='File containing the set of root nodes')
+    parser.add_argument('-o', '--out', action='store', type=str, dest='output', help='File to output the reachable nodes,if empty will be written out to standard output',default=None)
+    parser.add_argument("--noheader", action="store_true", dest="header_bool", help="Option to specify that file does not have a header")
+    parser.add_argument("--subj", action="store", type=int, dest="sub", help='Column in which the subject is given, default 0', default=0)
+    parser.add_argument("--obj", action="store", type=int, dest="obj", help='Column in which the subject is given, default 2', default=2)
+    parser.add_argument("--pred",action="store" ,type=int, dest="pred",help='Column in which predicate is given, default 1',default=1)
+    parser.add_argument("--props", action="store", type=str, dest="props",help='Properties to consider while finding reachable nodes - comma-separated string, default all properties considered',default=None)
+    parser.add_argument('--undirected', action='store_true', dest="undirected", help="Option to specify graph as undirected?")
+
+
+def run(filename,root,output,header_bool,sub,obj,pred,props,undirected):
+    import sys
+    import csv
+    from graph_tool.search import dfs_iterator
+    from graph_tool import load_graph_from_csv
+    from graph_tool.util import find_edge
+    from kgtk.exceptions import KGTKException
+    from kgtk.cli_argparse import KGTKArgumentParser
+
+    def find_pred_position(sub,pred,obj):
+        if pred < sub and pred < obj:
+            return pred
+        elif (pred > sub and pred < obj) or (pred < sub and pred > obj):
+            return pred-1
+        else:
+            return pred-2
+
+    def get_edges_by_edge_prop(g, p, v):
+        return find_edge(g, prop=g.properties[('e', p)], match=v)
+
+    label='c'+str(find_pred_position(sub,pred,obj))
+    header=['node1','label','node2']
+    root_list=[]
+    property_list=[]
+
+    tsv_file = open(root)
+    read_tsv = csv.reader(tsv_file, delimiter="\t")
+
+    for row in read_tsv:
+        root_list.append(row[0])
+    tsv_file.close()
+    property_list = [item for item in props.split(',')]
+    G = load_graph_from_csv(filename,not(undirected),skip_first=not(header_bool),hashed=True,csv_options={'delimiter': '\t'},ecols=(sub,obj))
+
+    name = G.vp["name"]
+
+    index_list = []
+    for v in G.vertices():
+        if name[v] in root_list:
+            index_list.append(v)
+
+    edge_filter_set = set()
+    for prop in property_list:
+        edge_filter_set.update(get_edges_by_edge_prop(G, label,prop));
+    e_prop= G.new_edge_property("bool")
+
+    v_prop= G.new_vertex_property("bool")
+    for e in G.edges():
+        if e in edge_filter_set:
+            e_prop[e] = True
+            v_prop[e.source()] = True
+            v_prop[e.target()] = True
+        else:
+            e_prop[e] = False
+            if(v_prop[e.source()] is None):
+                v_prop[e.source()] = False
+            if(v_prop[e.target()] is None):
+                v_prop[e.target()] = False
+    G.set_edge_filter(e_prop)
+    G.set_vertex_filter(v_prop)
+
+
+    if output:
+        f=open(output,'w')
+        tsv_writer = csv.writer(f, quoting=csv.QUOTE_NONE,delimiter="\t",escapechar="\n",quotechar='')
+        if index_list == []:
+            print("No root nodes found in the graph")
+        else:
+            tsv_writer.writerow(header)
+            for index in index_list:
+                for e in dfs_iterator(G, G.vertex(index)):
+                    tsv_writer.writerow([name[index], 'reachable', name[e.target()]])
+        f.close()
+    else:
+        if index_list == []:
+            print("No root nodes
found in the graph") + else: + sys.stdout.write('%s\t%s\t%s\n' % ('node1', 'label', 'node2')) + for index in index_list: + for e in dfs_iterator(G, G.vertex(index)): + sys.stdout.write('%s\t%s\t%s\n' % (name[index], 'reachable', name[e.target()])) \ No newline at end of file From 5d42b921d6e1685a88c0fe1e5206a52c27be6c4b Mon Sep 17 00:00:00 2001 From: Naren Date: Mon, 11 May 2020 03:31:35 -0700 Subject: [PATCH 134/278] improved performance of reachable script --- kgtk/cli/reachable_nodes.py | 58 ++++++++++++++++++------------------- 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/kgtk/cli/reachable_nodes.py b/kgtk/cli/reachable_nodes.py index bfa071574..fa988ae5b 100644 --- a/kgtk/cli/reachable_nodes.py +++ b/kgtk/cli/reachable_nodes.py @@ -16,25 +16,31 @@ def add_arguments(parser): parser (argparse.ArgumentParser) """ parser.add_argument(action="store", type=str, dest="filename", metavar='filename', help='input filename here') - parser.add_argument('--root',action='store',dest='root',help='File containing the set of root nodes') + parser.add_argument('--root',action='store',dest='root',help='Set of root nodes to use, comma-separated string',default=None) + parser.add_argument('--rootfile',action='store',dest='rootfile',help='Option to specify a file containing the set of root nodes',default=None) + parser.add_argument('--rootfilecolumn',action='store',type=int,dest='rootfilecolumn',help='Option to specify column of roots file to use, default 0',default=0) + parser.add_argument('--norootheader',action='store_true',dest='root_header_bool',help='Option to specify that root file has no header') parser.add_argument('-o', '--out', action='store', type=str, dest='output', help='File to output the reachable nodes,if empty will be written out to standard output',default=None) parser.add_argument("--noheader", action="store_true", dest="header_bool", help="Option to specify that file does not have a header") parser.add_argument("--subj", action="store", type=int, dest="sub", help='Column in which the subject is given, default 0', default=0) parser.add_argument("--obj", action="store", type=int, dest="obj", help='Column in which the subject is given, default 2', default=2) parser.add_argument("--pred",action="store" ,type=int, dest="pred",help='Column in which predicate is given, default 1',default=1) - parser.add_argument("--props", action="store", type=str, dest="props",help='Properties to consider while finding reachable nodes - comma-separated string, default all properties considered',default=None) + parser.add_argument("--props", action="store", type=str, dest="props",help='Properties to consider while finding reachable nodes - comma-separated string',default=None) parser.add_argument('--undirected', action='store_true', dest="undirected", help="Option to specify graph as undirected?") -def run(filename,root,output,header_bool,sub,obj,pred,props,undirected): +def run(filename,root,rootfile,rootfilecolumn,root_header_bool,output,header_bool,sub,obj,pred,props,undirected): import sys import csv + import time from graph_tool.search import dfs_iterator from graph_tool import load_graph_from_csv from graph_tool.util import find_edge from kgtk.exceptions import KGTKException from kgtk.cli_argparse import KGTKArgumentParser + + #Graph-tool names columns that are not subject or object c0, c1... 
This function finds the number that graph tool assigned to the predicate column
     def find_pred_position(sub,pred,obj):
         if pred < sub and pred < obj:
             return pred
         elif (pred > sub and pred < obj) or (pred < sub and pred > obj):
             return pred-1
         else:
             return pred-2
 
     def get_edges_by_edge_prop(g, p, v):
         return find_edge(g, prop=g.properties[('e', p)], match=v)
 
+
     label='c'+str(find_pred_position(sub,pred,obj))
     header=['node1','label','node2']
+    root_set=set()
     root_list=[]
     property_list=[]
-
-    tsv_file = open(root)
-    read_tsv = csv.reader(tsv_file, delimiter="\t")
-
-    for row in read_tsv:
-        root_list.append(row[0])
-    tsv_file.close()
+    if (rootfile):
+        tsv_file = open(rootfile)
+        read_tsv = csv.reader(tsv_file, delimiter="\t")
+        first_row=True
+        for row in read_tsv:
+            if first_row and not root_header_bool:
+                first_row=False
+                continue
+            root_set.add(row[rootfilecolumn])
+        tsv_file.close()
+    if (root):
+        for r in root.split(','):
+            root_set.add(r)
+    root_list=list(root_set)
     property_list = [item for item in props.split(',')]
     G = load_graph_from_csv(filename,not(undirected),skip_first=not(header_bool),hashed=True,csv_options={'delimiter': '\t'},ecols=(sub,obj))
@@ -70,24 +85,9 @@ def get_edges_by_edge_prop(g, p, v):
     edge_filter_set = set()
     for prop in property_list:
         edge_filter_set.update(get_edges_by_edge_prop(G, label,prop));
-    e_prop= G.new_edge_property("bool")
-
-    v_prop= G.new_vertex_property("bool")
-    for e in G.edges():
-        if e in edge_filter_set:
-            e_prop[e] = True
-            v_prop[e.source()] = True
-            v_prop[e.target()] = True
-        else:
-            e_prop[e] = False
-            if(v_prop[e.source()] is None):
-                v_prop[e.source()] = False
-            if(v_prop[e.target()] is None):
-                v_prop[e.target()] = False
-    G.set_edge_filter(e_prop)
-    G.set_vertex_filter(v_prop)
-
-
+
+    G.clear_edges()
+    G.add_edge_list(list(edge_filter_set))
     if output:
         f=open(output,'w')
         tsv_writer = csv.writer(f, quoting=csv.QUOTE_NONE,delimiter="\t",escapechar="\n",quotechar='')
@@ -106,4 +106,4 @@ def get_edges_by_edge_prop(g, p, v):
             sys.stdout.write('%s\t%s\t%s\n' % ('node1', 'label', 'node2'))
             for index in index_list:
                 for e in dfs_iterator(G, G.vertex(index)):
-                    sys.stdout.write('%s\t%s\t%s\n' % (name[index], 'reachable', name[e.target()]))
\ No newline at end of file
+                    sys.stdout.write('%s\t%s\t%s\n' % (name[index], 'reachable', name[e.target()]))
From 97bf304db7a2920deb010fcaf63aca90a250be95 Mon Sep 17 00:00:00 2001
From: Craig Milo Rogers
Date: Mon, 11 May 2020 10:29:32 -0700
Subject: [PATCH 135/278] Conversion to KgtkReaderOptions (incomplete).

---
 kgtk/cli/validate.py  | 141 +--------
 kgtk/io/edgereader.py | 114 ++------
 kgtk/io/kgtkreader.py | 652 ++++++++++++++++++------------------------
 kgtk/io/nodereader.py | 111 ++-----
 4 files changed, 349 insertions(+), 669 deletions(-)

diff --git a/kgtk/cli/validate.py b/kgtk/cli/validate.py
index ccf442bc0..2b7c1a5dc 100644
--- a/kgtk/cli/validate.py
+++ b/kgtk/cli/validate.py
@@ -16,7 +16,7 @@
 import typing
 
 from kgtk.kgtkformat import KgtkFormat
-from kgtk.io.kgtkreader import KgtkReader
+from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
 from kgtk.utils.enumnameaction import EnumNameAction
 from kgtk.utils.validationaction import ValidationAction
 from kgtk.value.kgtkvalueoptions import KgtkValueOptions
@@ -35,124 +35,18 @@ def add_arguments(parser):
     """
     parser.add_argument(      "kgtk_files", nargs="*", help="The KGTK file(s) to validate.
May be omitted or '-' for stdin.", type=Path) - parser.add_argument( "--blank-id-line-action", dest="blank_id_line_action", - help="The action to take when a blank id field is detected.", - type=ValidationAction, action=EnumNameAction, default=None) - - parser.add_argument( "--blank-node1-line-action", dest="blank_node1_line_action", - help="The action to take when a blank node1 field is detected.", - type=ValidationAction, action=EnumNameAction, default=None) - - parser.add_argument( "--blank-node2-line-action", dest="blank_node2_line_action", - help="The action to take when a blank node2 field is detected.", - type=ValidationAction, action=EnumNameAction, default=None) - - parser.add_argument( "--blank-required-field-line-action", dest="blank_line_action", - help="The action to take when a line with a blank node1, node2, or id field (per mode) is detected.", - type=ValidationAction, action=EnumNameAction, default=ValidationAction.COMPLAIN) - - parser.add_argument( "--column-separator", dest="column_separator", - help="Column separator.", type=str, default=KgtkReader.COLUMN_SEPARATOR) - - parser.add_argument( "--comment-line-action", dest="comment_line_action", - help="The action to take when a comment line is detected.", - type=ValidationAction, action=EnumNameAction, default=ValidationAction.COMPLAIN) - - parser.add_argument( "--compression-type", dest="compression_type", - help="Specify the input file compression type, otherwise use the extension.") - - parser.add_argument( "--empty-line-action", dest="empty_line_action", - help="The action to take when an empty line is detected.", - type=ValidationAction, action=EnumNameAction, default=ValidationAction.COMPLAIN) - - errors_to = parser.add_mutually_exclusive_group() - errors_to.add_argument( "--errors-to-stdout", dest="errors_to_stdout", - help="Send errors to stdout instead of stderr (default)", action="store_true") - errors_to.add_argument( "--errors-to-stderr", dest="errors_to_stderr", - help="Send errors to stderr instead of stdout", action="store_true") - - parser.add_argument( "--error-limit", dest="error_limit", - help="The maximum number of errors to report before failing", type=int, default=KgtkReader.ERROR_LIMIT_DEFAULT) - - parser.add_argument( "--fill-short-lines", dest="fill_short_lines", - help="Fill missing trailing columns in short lines with empty values.", action='store_true') - - parser.add_argument( "--force-column-names", dest="force_column_names", help="Force the column names.", nargs='+') - - parser.add_argument( "--gzip-in-parallel", dest="gzip_in_parallel", help="Execute gzip in parallel.", action='store_true') - - parser.add_argument( "--gzip-queue-size", dest="gzip_queue_size", - help="Queue size for parallel gzip.", type=int, default=KgtkReader.GZIP_QUEUE_SIZE_DEFAULT) - - parser.add_argument( "--header-error-action", dest="header_error_action", - help="The action to take when a header error is detected Only ERROR or EXIT are supported.", - type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXIT) parser.add_argument( "--header-only", dest="header_only", help="Process the only the header of the input file.", action="store_true") - parser.add_argument( "--invalid-value-action", dest="invalid_value_action", - help="The action to take when a data cell value is invalid.", - type=ValidationAction, action=EnumNameAction, default=ValidationAction.REPORT) - - parser.add_argument( "--long-line-action", dest="long_line_action", - help="The action to take when a long line is detected.", - 
type=ValidationAction, action=EnumNameAction, default=ValidationAction.COMPLAIN) - - parser.add_argument( "--mode", dest="mode", - help="Determine the KGTK input file mode.", type=KgtkReader.Mode, action=EnumNameAction, default=KgtkReader.Mode.AUTO) - - parser.add_argument( "--short-line-action", dest="short_line_action", - help="The action to take whe a short line is detected.", - type=ValidationAction, action=EnumNameAction, default=ValidationAction.COMPLAIN) - - parser.add_argument( "--skip-first-record", dest="skip_first_record", help="Skip the first record when forcing column names.", action='store_true') - - parser.add_argument( "--truncate-long-lines", dest="truncate_long_lines", - help="Remove excess trailing columns in long lines.", action='store_true') - - parser.add_argument( "--unsafe-column-name-action", dest="unsafe_column_name_action", - help="The action to take when a column name is unsafe.", - type=ValidationAction, action=EnumNameAction, default=ValidationAction.REPORT) - - parser.add_argument("-v", "--verbose", dest="verbose", help="Print additional progress messages.", action='store_true') - - parser.add_argument( "--very-verbose", dest="very_verbose", help="Print additional progress messages.", action='store_true') - - parser.add_argument( "--whitespace-line-action", dest="whitespace_line_action", - help="The action to take when a whitespace line is detected.", - type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXCLUDE) - - # Note: Any arguments described by KgtkValueOptions.add_arguments(...) - # need to be included in the arguments to run(...), below. + KgtkReader.add_debug_arguments(parser) + KgtkReaderOptions.add_arguments(parser, mode_options=True) KgtkValueOptions.add_arguments(parser) def run(kgtk_files: typing.Optional[typing.List[typing.Optional[Path]]], - force_column_names: typing.Optional[typing.List[str]] = None, - skip_first_record: bool = False, - fill_short_lines: bool = False, - truncate_long_lines: bool = False, errors_to_stdout: bool = False, errors_to_stderr: bool = False, - error_limit: int = KgtkReader.ERROR_LIMIT_DEFAULT, - empty_line_action: ValidationAction = ValidationAction.COMPLAIN, - comment_line_action: ValidationAction = ValidationAction.COMPLAIN, - whitespace_line_action: ValidationAction = ValidationAction.COMPLAIN, - blank_line_action: ValidationAction = ValidationAction.COMPLAIN, - blank_node1_line_action: typing.Optional[ValidationAction] = None, - blank_node2_line_action: typing.Optional[ValidationAction] = None, - blank_id_line_action: typing.Optional[ValidationAction] = None, - short_line_action: ValidationAction = ValidationAction.COMPLAIN, - long_line_action: ValidationAction = ValidationAction.COMPLAIN, - invalid_value_action: ValidationAction = ValidationAction.REPORT, - header_error_action: ValidationAction = ValidationAction.EXIT, - unsafe_column_name_action: ValidationAction = ValidationAction.REPORT, - compression_type: typing.Optional[str] = None, - gzip_in_parallel: bool = False, - gzip_queue_size: int = KgtkReader.GZIP_QUEUE_SIZE_DEFAULT, - column_separator: str = KgtkFormat.COLUMN_SEPARATOR, - mode: KgtkReader.Mode = KgtkReader.Mode.AUTO, header_only: bool = False, verbose: bool = False, very_verbose: bool = False, @@ -167,7 +61,8 @@ def run(kgtk_files: typing.Optional[typing.List[typing.Optional[Path]]], # Select where to send error messages, defaulting to stderr. error_file: typing.TextIO = sys.stderr if errors_to_stderr else sys.stdout - # Build the value parsing option structure. 
+ # Build the option structures. + reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs) value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs) try: @@ -181,31 +76,11 @@ def run(kgtk_files: typing.Optional[typing.List[typing.Optional[Path]]], print ("Validating from stdin", file=error_file, flush=True) kr: KgtkReader = KgtkReader.open(kgtk_file, - force_column_names=force_column_names, - skip_first_record=skip_first_record, - fill_short_lines=fill_short_lines, - truncate_long_lines=truncate_long_lines, error_file=error_file, - error_limit=error_limit, - empty_line_action=empty_line_action, - comment_line_action=comment_line_action, - whitespace_line_action=whitespace_line_action, - blank_line_action=blank_line_action, - blank_node1_line_action=blank_node1_line_action, - blank_node2_line_action=blank_node2_line_action, - blank_id_line_action=blank_id_line_action, - short_line_action=short_line_action, - long_line_action=long_line_action, - invalid_value_action=invalid_value_action, - header_error_action=header_error_action, - unsafe_column_name_action=unsafe_column_name_action, - compression_type=compression_type, + options=reader_options, value_options=value_options, - gzip_in_parallel=gzip_in_parallel, - gzip_queue_size=gzip_queue_size, - column_separator=column_separator, - mode=mode, - verbose=verbose, very_verbose=very_verbose) + verbose=verbose, + very_verbose=very_verbose) if header_only: kr.close() diff --git a/kgtk/io/edgereader.py b/kgtk/io/edgereader.py index 1f16b3961..837085269 100644 --- a/kgtk/io/edgereader.py +++ b/kgtk/io/edgereader.py @@ -10,7 +10,7 @@ import sys import typing -from kgtk.io.kgtkreader import KgtkReader +from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions from kgtk.utils.closableiter import ClosableIter from kgtk.utils.enumnameaction import EnumNameAction from kgtk.utils.validationaction import ValidationAction @@ -22,57 +22,32 @@ class EdgeReader(KgtkReader): @classmethod def open_edge_file(cls, file_path: typing.Optional[Path], - force_column_names: typing.Optional[typing.List[str]] = None, # - skip_first_record: bool = False, - fill_short_lines: bool = False, - truncate_long_lines: bool = False, error_file: typing.TextIO = sys.stderr, - error_limit: int = KgtkReader.ERROR_LIMIT_DEFAULT, - empty_line_action: ValidationAction = ValidationAction.EXCLUDE, - comment_line_action: ValidationAction = ValidationAction.EXCLUDE, - whitespace_line_action: ValidationAction = ValidationAction.EXCLUDE, - blank_node1_line_action: ValidationAction = ValidationAction.EXCLUDE, - blank_node2_line_action: ValidationAction = ValidationAction.EXCLUDE, - short_line_action: ValidationAction = ValidationAction.EXCLUDE, - long_line_action: ValidationAction = ValidationAction.EXCLUDE, - invalid_value_action: ValidationAction = ValidationAction.REPORT, - header_error_action: ValidationAction = ValidationAction.EXIT, - unsafe_column_name_action: ValidationAction = ValidationAction.REPORT, + options: typing.Optional[KgtkReaderOptions] = None, value_options: typing.Optional[KgtkValueOptions] = None, - compression_type: typing.Optional[str] = None, - gzip_in_parallel: bool = False, - gzip_queue_size: int = KgtkReader.GZIP_QUEUE_SIZE_DEFAULT, - column_separator: str = KgtkReader.COLUMN_SEPARATOR, verbose: bool = False, very_verbose: bool = False)->"EdgeReader": - source: ClosableIter[str] = cls._openfile(file_path, - compression_type=compression_type, - gzip_in_parallel=gzip_in_parallel, - gzip_queue_size=gzip_queue_size, - error_file=error_file, 
- verbose=verbose) + # Supply the default reader and value options: + (options, value_options) = cls._default_options(options, value_options) + + source: ClosableIter[str] = cls._openfile(file_path, options=options, error_file=error_file, verbose=verbose) # Read the edge file header and split it into column names. header: str column_names: typing.List[str] - (header, column_names) = cls._build_column_names(source, - force_column_names=force_column_names, - skip_first_record=skip_first_record, - column_separator=column_separator, - error_file=error_file, - verbose=verbose) + (header, column_names) = cls._build_column_names(source, options=options, error_file=error_file, verbose=verbose) # Check for unsafe column names. cls.check_column_names(column_names, header_line=header, - error_action=unsafe_column_name_action, + error_action=options.unsafe_column_name_action, error_file=error_file) # Build a map from column name to column index. column_name_map: typing.Mapping[str, int] = cls.build_column_name_map(column_names, header_line=header, - error_action=header_error_action, + error_action=options.header_error_action, error_file=error_file) # Get the indices of the required columns. node1_column_idx: int @@ -80,7 +55,7 @@ def open_edge_file(cls, label_column_idx: int (node1_column_idx, node2_column_idx, label_column_idx) = cls.required_edge_columns(column_name_map, header_line=header, - error_action=header_error_action, + error_action=options.header_error_action, error_file=error_file) if verbose: @@ -89,53 +64,35 @@ def open_edge_file(cls, return cls(file_path=file_path, source=source, - column_separator=column_separator, column_names=column_names, column_name_map=column_name_map, column_count=len(column_names), node1_column_idx=node1_column_idx, node2_column_idx=node2_column_idx, label_column_idx=label_column_idx, - force_column_names=force_column_names, - skip_first_record=skip_first_record, - fill_short_lines=fill_short_lines, - truncate_long_lines=truncate_long_lines, error_file=error_file, - error_limit=error_limit, - empty_line_action=empty_line_action, - comment_line_action=comment_line_action, - whitespace_line_action=whitespace_line_action, - blank_node1_line_action=blank_node1_line_action, - blank_node2_line_action=blank_node2_line_action, - short_line_action=short_line_action, - long_line_action=long_line_action, - invalid_value_action=invalid_value_action, - header_error_action=header_error_action, - unsafe_column_name_action=unsafe_column_name_action, + options=options, value_options=value_options, - compression_type=compression_type, - gzip_in_parallel=gzip_in_parallel, - gzip_queue_size=gzip_queue_size, is_edge_file=True, is_node_file=False, verbose=verbose, very_verbose=very_verbose, ) - def _ignore_if_blank_fields(self, values: typing.List[str], line: str)->bool: + def _ignore_if_blank_required_fields(self, values: typing.List[str], line: str)->bool: # Ignore line_action with blank node1 fields. This code comes after # filling missing trailing columns, although it could be reworked # to come first. 
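
For reference, a hedged miniature of the consolidated blank-required-field test that the hunk below rewrites (hypothetical standalone form; the real method dispatches the result through a ValidationAction):

import typing

def blank_required_field(values: typing.List[str], idx: int) -> bool:
    # True when the required column exists on this line but is empty or whitespace.
    return 0 <= idx < len(values) and (len(values[idx]) == 0 or values[idx].isspace())

# e.g. blank_required_field(["Q1", "", "Q42"], 1) -> True  (blank node2)
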
- if self.blank_node1_line_action != ValidationAction.PASS and self.node1_column_idx >= 0 and len(values) > self.node1_column_idx: + if self.options.blank_required_field_line_action != ValidationAction.PASS and self.node1_column_idx >= 0 and len(values) > self.node1_column_idx: node1_value: str = values[self.node1_column_idx] if len(node1_value) == 0 or node1_value.isspace(): - return self.exclude_line(self.blank_node1_line_action, "node1 is blank", line) + return self.exclude_line(self.options.blank_required_field_line_action, "node1 is blank", line) # Ignore lines with blank node2 fields: - if self.blank_node2_line_action != ValidationAction.PASS and self.node2_column_idx >= 0 and len(values) > self.node2_column_idx: + if self.options.blank_required_field_line_action != ValidationAction.PASS and self.node2_column_idx >= 0 and len(values) > self.node2_column_idx: node2_value: str = values[self.node2_column_idx] if len(node2_value) == 0 or node2_value.isspace(): - return self.exclude_line(self.blank_node2_line_action, "node2 is blank", line) + return self.exclude_line(self.options.blank_required_field_line_action, "node2 is blank", line) return False # Do not ignore this line def _skip_reserved_fields(self, column_name)->bool: @@ -152,40 +109,25 @@ def main(): Test the KGTK edge file reader. """ parser = ArgumentParser() - KgtkReader.add_operation_arguments(parser) - KgtkReader.add_file_arguments(parser, edge_options=True) + parser.add_argument(dest="kgtk_file", help="The KGTK edge file to read", type=Path, nargs="?") + KgtkReader.add_debug_arguments(parser) + KgtkReaderOptions.add_arguments(parser) KgtkValueOptions.add_arguments(parser) args = parser.parse_args() error_file: typing.TextIO = sys.stdout if args.errors_to_stdout else sys.stderr - # Build the value parsing option structure. + # Build the option structures. 
+    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_args(args, mode=KgtkReaderMode.EDGE)
     value_options: KgtkValueOptions = KgtkValueOptions.from_args(args)
 
-    er: EdgeReader = EdgeReader.open(args.kgtk_file,
-                                     force_column_names=args.force_column_names,
-                                     skip_first_record=args.skip_first_record,
-                                     fill_short_lines=args.fill_short_lines,
-                                     truncate_long_lines=args.truncate_long_lines,
-                                     error_file=error_file,
-                                     error_limit=args.error_limit,
-                                     empty_line_action=args.empty_line_action,
-                                     comment_line_action=args.comment_line_action,
-                                     whitespace_line_action=args.whitespace_line_action,
-                                     blank_node1_line_action=args.blank_node1_line_action,
-                                     blank_node2_line_action=args.blank_node2_line_action,
-                                     short_line_action=args.short_line_action,
-                                     long_line_action=args.long_line_action,
-                                     invalid_value_action=args.invalid_value_action,
-                                     header_error_action=args.header_error_action,
-                                     unsafe_column_name_action=args.unsafe_column_name_action,
-                                     value_options=value_options,
-                                     compression_type=args.compression_type,
-                                     gzip_in_parallel=args.gzip_in_parallel,
-                                     gzip_queue_size=args.gzip_queue_size,
-                                     column_separator=args.column_separator,
-                                     mode=KgtkReader.Mode.EDGE,
-                                     verbose=args.verbose, very_verbose=args.very_verbose)
+    # Force the edge mode:
+    er: EdgeReader = EdgeReader.open_edge_file(args.kgtk_file,
+                                               error_file=error_file,
+                                               options=reader_options,
+                                               value_options=value_options,
+                                               verbose=args.verbose, very_verbose=args.very_verbose)
 
     line_count: int = 0
     row: typing.List[str]
diff --git a/kgtk/io/kgtkreader.py b/kgtk/io/kgtkreader.py
index ee6c91571..7a358d53b 100644
--- a/kgtk/io/kgtkreader.py
+++ b/kgtk/io/kgtkreader.py
@@ -16,7 +16,7 @@
 
 """
 
-from argparse import ArgumentParser, _ArgumentGroup
+from argparse import ArgumentParser, _ArgumentGroup, Namespace
 import attr
 import bz2
 from enum import Enum
@@ -37,25 +37,21 @@
 from kgtk.value.kgtkvalue import KgtkValue
 from kgtk.value.kgtkvalueoptions import KgtkValueOptions, DEFAULT_KGTK_VALUE_OPTIONS
 
-@attr.s(slots=True, frozen=False)
-class KgtkReader(KgtkBase, ClosableIter[typing.List[str]]):
+class KgtkReaderMode(Enum):
+    """
+    There are four file reading modes:
+    """
+    NONE = 0 # Enforce neither edge nor node file required columns
+    EDGE = 1 # Enforce edge file required columns
+    NODE = 2 # Enforce node file required columns
+    AUTO = 3 # Automatically decide whether to enforce edge or node file required columns
+
+@attr.s(slots=True, frozen=True)
+class KgtkReaderOptions():
     ERROR_LIMIT_DEFAULT: int = 1000
     GZIP_QUEUE_SIZE_DEFAULT: int = GunzipProcess.GZIP_QUEUE_SIZE_DEFAULT
 
-    file_path: typing.Optional[Path] = attr.ib(validator=attr.validators.optional(attr.validators.instance_of(Path)))
-    source: ClosableIter[str] = attr.ib() # Todo: validate
-    column_names: typing.List[str] = attr.ib(validator=attr.validators.deep_iterable(member_validator=attr.validators.instance_of(str),
-                                                                                     iterable_validator=attr.validators.instance_of(list)))
-    column_name_map: typing.Mapping[str, int] = attr.ib(validator=attr.validators.deep_mapping(key_validator=attr.validators.instance_of(str),
-                                                                                               value_validator=attr.validators.instance_of(int)))
-
-    # For convenience, the count of columns. This is the same as len(column_names).
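
The options class introduced above leans on the attrs package. A minimal sketch of the same pattern under stated assumptions (illustrative names only, nowhere near the full option set):

import attr
from enum import Enum

class MiniMode(Enum):
    NONE = 0
    EDGE = 1

@attr.s(slots=True, frozen=True)
class MiniReaderOptions:
    # Validators reject wrongly-typed values at construction time.
    mode: MiniMode = attr.ib(validator=attr.validators.instance_of(MiniMode), default=MiniMode.NONE)
    error_limit: int = attr.ib(validator=attr.validators.instance_of(int), default=1000)

opts = MiniReaderOptions(mode=MiniMode.EDGE)
# opts.error_limit = 5  # would raise FrozenInstanceError: the instance is immutable
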
- column_count: int = attr.ib(validator=attr.validators.instance_of(int)) - - data_lines_read: int = attr.ib(validator=attr.validators.instance_of(int), default=0) - data_lines_passed: int = attr.ib(validator=attr.validators.instance_of(int), default=0) - data_lines_ignored: int = attr.ib(validator=attr.validators.instance_of(int), default=0) - data_errors_reported: int = attr.ib(validator=attr.validators.instance_of(int), default=0) + mode: KgtkReaderMode = attr.ib(validator=attr.validators.instance_of(KgtkReaderMode), default=KgtkReaderMode.AUTO) # The column separator is normally tab. column_separator: str = attr.ib(validator=attr.validators.instance_of(str), default=KgtkFormat.COLUMN_SEPARATOR) @@ -66,25 +62,16 @@ class KgtkReader(KgtkBase, ClosableIter[typing.List[str]]): default=None) skip_first_record: bool = attr.ib(validator=attr.validators.instance_of(bool), default=False) - # The index of the mandatory columns. -1 means missing: - node1_column_idx: int = attr.ib(validator=attr.validators.instance_of(int), default=-1) # edge file - node2_column_idx: int = attr.ib(validator=attr.validators.instance_of(int), default=-1) # edge file - label_column_idx: int = attr.ib(validator=attr.validators.instance_of(int), default=-1) # edge file - id_column_idx: int = attr.ib(validator=attr.validators.instance_of(int), default=-1) # node file - # How do we handle errors? - error_file: typing.TextIO = attr.ib(default=sys.stderr) - error_limit: int = attr.ib(validator=attr.validators.instance_of(int), default=ERROR_LIMIT_DEFAULT) # >0 ==> limit error reports + error_limit: int = attr.ib(validator=attr.validators.instance_of(int), default=KgtkReaderOptions.ERROR_LIMIT_DEFAULT) # >0 ==> limit error reports # Ignore empty lines, comments, and all whitespace lines, etc.? empty_line_action: ValidationAction = attr.ib(validator=attr.validators.instance_of(ValidationAction), default=ValidationAction.EXCLUDE) comment_line_action: ValidationAction = attr.ib(validator=attr.validators.instance_of(ValidationAction), default=ValidationAction.EXCLUDE) whitespace_line_action: ValidationAction = attr.ib(validator=attr.validators.instance_of(ValidationAction), default=ValidationAction.EXCLUDE) - # Ignore records with values in certain fields: - blank_node1_line_action: ValidationAction = attr.ib(validator=attr.validators.instance_of(ValidationAction), default=ValidationAction.PASS) # EXCLUDE on edge file - blank_node2_line_action: ValidationAction = attr.ib(validator=attr.validators.instance_of(ValidationAction), default=ValidationAction.PASS) # EXCLUDE on edge file - blank_id_line_action: ValidationAction = attr.ib(validator=attr.validators.instance_of(ValidationAction), default=ValidationAction.PASS) # EXCLUDE on node file + # Ignore records with empty values in certain fields: + blank_required_field_line_action: ValidationAction = attr.ib(validator=attr.validators.instance_of(ValidationAction), default=ValidationAction.EXCLUDE) # Ignore records with too many or too few fields? short_line_action: ValidationAction = attr.ib(validator=attr.validators.instance_of(ValidationAction), default=ValidationAction.EXCLUDE) @@ -96,7 +83,6 @@ class KgtkReader(KgtkBase, ClosableIter[typing.List[str]]): # Validate data cell values? 
 
     # Validate data cell values?
     invalid_value_action: ValidationAction = attr.ib(validator=attr.validators.instance_of(ValidationAction), default=ValidationAction.REPORT)
-    value_options: typing.Optional[KgtkValueOptions] = attr.ib(validator=attr.validators.optional(attr.validators.instance_of(KgtkValueOptions)), default=None)
 
     # Repair records with too many or too few fields?
     fill_short_lines: bool = attr.ib(validator=attr.validators.instance_of(bool), default=False)
@@ -107,92 +93,252 @@
     gzip_in_parallel: bool = attr.ib(validator=attr.validators.instance_of(bool), default=False)
     gzip_queue_size: int = attr.ib(validator=attr.validators.instance_of(int), default=GZIP_QUEUE_SIZE_DEFAULT)
 
+    @classmethod
+    def add_arguments(cls,
+                      parser: ArgumentParser,
+                      mode_options: bool = False,
+                      who: str = ""):
+        prefix1: str = "--" if len(who) == 0 else "--" + who + "-"
+        prefix2: str = "" if len(who) == 0 else who + "_"
+        prefix3: str = "" if len(who) == 0 else who + ": "
+        prefix4: str = "" if len(who) == 0 else who + " file "
+
+        fgroup: _ArgumentGroup = parser.add_argument_group(prefix3 + "File options",
+                                                           "Options affecting " + prefix4 + "processing")
+        fgroup.add_argument(prefix1 + "column-separator",
+                            dest=prefix2 + "column_separator",
+                            help=prefix3 + "Column separator.", type=str, default=KgtkFormat.COLUMN_SEPARATOR)
+
+        fgroup.add_argument(prefix1 + "compression-type",
+                            dest=prefix2 + "compression_type", help=prefix3 + "Specify the compression type.")
+
+        fgroup.add_argument(prefix1 + "error-limit",
+                            dest=prefix2 + "error_limit",
+                            help=prefix3 + "The maximum number of errors to report before failing", type=int, default=cls.ERROR_LIMIT_DEFAULT)
+
+        fgroup.add_argument(prefix1 + "gzip-in-parallel",
+                            dest=prefix2 + "gzip_in_parallel", help=prefix3 + "Execute gzip in parallel.", action='store_true')
+
+        fgroup.add_argument(prefix1 + "gzip-queue-size",
+                            dest=prefix2 + "gzip_queue_size",
+                            help=prefix3 + "Queue size for parallel gzip.", type=int, default=cls.GZIP_QUEUE_SIZE_DEFAULT)
+
+        if mode_options:
+            fgroup.add_argument(prefix1 + "mode",
+                                dest=prefix2 + "mode",
+                                help=prefix3 + "Determine the KGTK file mode.",
+                                type=KgtkReaderMode, action=EnumNameAction, default=KgtkReaderMode.AUTO)
+
+        hgroup: _ArgumentGroup = parser.add_argument_group(prefix3 + "Header parsing", "Options affecting " + prefix4 + "header parsing")
+
+        hgroup.add_argument(prefix1 + "force-column-names",
+                            dest=prefix2 + "force_column_names",
+                            help=prefix3 + "Force the column names.", nargs='+')
+
+        hgroup.add_argument(prefix1 + "header-error-action",
+                            dest=prefix2 + "header_error_action",
+                            help=prefix3 + "The action to take when a header error is detected. Only ERROR or EXIT are supported.",
+                            type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXIT)
+
+        hgroup.add_argument(prefix1 + "skip-first-record",
+                            dest=prefix2 + "skip_first_record",
+                            help=prefix3 + "Skip the first record when forcing column names.", action='store_true')
+
+        hgroup.add_argument(prefix1 + "unsafe-column-name-action",
+                            dest=prefix2 + "unsafe_column_name_action",
+                            help=prefix3 + "The action to take when a column name is unsafe.",
+                            type=ValidationAction, action=EnumNameAction, default=ValidationAction.REPORT)
+
+        lgroup: _ArgumentGroup = parser.add_argument_group("Line parsing", "Options affecting " + prefix4 + "data line parsing")
+
+        lgroup.add_argument(prefix1 + "blank-required-field-line-action",
+                            dest=prefix2 + "blank_required_field_line_action",
+                            help=prefix3 + "The action to take when a
line with a blank node1, node2, or id field (per mode) is detected.", + type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXCLUDE) + + lgroup.add_argument(prefix1 + "comment-line-action", + dest=prefix2 + "comment_line_action", + help=prefix3 + "The action to take when a comment line is detected.", + type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXCLUDE) + + lgroup.add_argument(prefix1 + "empty-line-action", + dest=prefix2 + "empty_line_action", + help=prefix3 + "The action to take when an empty line is detected.", + type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXCLUDE) + + lgroup.add_argument(prefix1 + "fill-short-lines", + dest=prefix2 + "fill_short_lines", + help=prefix3 + "Fill missing trailing columns in short lines with empty values.", action='store_true') + + lgroup.add_argument(prefix1 + "invalid-value-action", + dest=prefix2 + "invalid_value_action", + help=prefix3 + "The action to take when a data cell value is invalid.", + type=ValidationAction, action=EnumNameAction, default=ValidationAction.REPORT) + + lgroup.add_argument(prefix1 + "long-line-action", + dest=prefix2 + "long_line_action", + help=prefix3 + "The action to take when a long line is detected.", + type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXCLUDE) + + lgroup.add_argument(prefix1 + "short-line-action", + dest=prefix2 + "short_line_action", + help=prefix3 + "The action to take when a short line is detected.", + type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXCLUDE) + + lgroup.add_argument(prefix1 + "truncate-long-lines", + dest=prefix2 + "truncate_long_lines", + help=prefix3 + "Remove excess trailing columns in long lines.", action='store_true') + + lgroup.add_argument(prefix1 + "whitespace-line-action", + dest=prefix2 + "whitespace_line_action", + help=prefix3 + "The action to take when a whitespace line is detected.", + type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXCLUDE) + + @classmethod + # Build the value parsing option structure. + def from_dict(cls, + d: dict, + who: str = "", + mode: typing.Optional[KgtkReaderMode] = None, + )->'KgtkReaderOptions': + prefix: str = "" # The destination name prefix. 
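Before from_dict continues below, a sketch of the round trip that these prefixed argument names enable: two readers can share one ArgumentParser without colliding, and the from_dict/from_args pair reads the prefixed values back out. The option values here are illustrative, and the snippet assumes the classes are importable exactly as defined in this patch:

    from argparse import ArgumentParser
    from kgtk.io.kgtkreader import KgtkReaderOptions

    parser = ArgumentParser()
    # who="input" turns --error-limit into --input-error-limit (dest "input_error_limit"):
    KgtkReaderOptions.add_arguments(parser, who="input")
    KgtkReaderOptions.add_arguments(parser, who="filter")

    args = parser.parse_args(["--input-error-limit", "5"])
    input_options = KgtkReaderOptions.from_args(args, who="input")
    filter_options = KgtkReaderOptions.from_args(args, who="filter")
    print(input_options.error_limit)    # 5
    print(filter_options.error_limit)   # 1000, the default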
+        if len(who) > 0:
+            prefix = who + "_"
+
+        reader_mode: KgtkReaderMode
+        if mode is None:
+            reader_mode = d.get(prefix + "mode", KgtkReaderMode.AUTO)
+        else:
+            reader_mode = mode
+
+        return cls(
+            blank_required_field_line_action=d.get(prefix + "blank_required_field_line_action", ValidationAction.EXCLUDE),
+            column_separator=d.get(prefix + "column_separator", KgtkFormat.COLUMN_SEPARATOR),
+            comment_line_action=d.get(prefix + "comment_line_action", ValidationAction.EXCLUDE),
+            compression_type=d.get(prefix + "compression_type", None),
+            empty_line_action=d.get(prefix + "empty_line_action", ValidationAction.EXCLUDE),
+            error_limit=d.get(prefix + "error_limit", cls.ERROR_LIMIT_DEFAULT),
+            fill_short_lines=d.get(prefix + "fill_short_lines", False),
+            force_column_names=d.get(prefix + "force_column_names", None),
+            gzip_in_parallel=d.get(prefix + "gzip_in_parallel", False),
+            gzip_queue_size=d.get(prefix + "gzip_queue_size", KgtkReaderOptions.GZIP_QUEUE_SIZE_DEFAULT),
+            header_error_action=d.get(prefix + "header_error_action", ValidationAction.EXIT),
+            invalid_value_action=d.get(prefix + "invalid_value_action", ValidationAction.REPORT),
+            long_line_action=d.get(prefix + "long_line_action", ValidationAction.EXCLUDE),
+            mode=reader_mode,
+            short_line_action=d.get(prefix + "short_line_action", ValidationAction.EXCLUDE),
+            skip_first_record=d.get(prefix + "skip_first_record", False),
+            truncate_long_lines=d.get(prefix + "truncate_long_lines", False),
+            unsafe_column_name_action=d.get(prefix + "unsafe_column_name_action", ValidationAction.REPORT),
+            whitespace_line_action=d.get(prefix + "whitespace_line_action", ValidationAction.EXCLUDE),
+        )
+
+    @classmethod
+    # Build the value parsing option structure.
+    def from_args(cls,
+                  args: Namespace,
+                  who: str = "",
+                  mode: typing.Optional[KgtkReaderMode] = None,
+                  )->'KgtkReaderOptions':
+        return cls.from_dict(vars(args), who=who, mode=mode)
+
+DEFAULT_KGTK_READER_OPTIONS: KgtkReaderOptions = KgtkReaderOptions()
+
+
+@attr.s(slots=True, frozen=False)
+class KgtkReader(KgtkBase, ClosableIter[typing.List[str]]):
+    file_path: typing.Optional[Path] = attr.ib(validator=attr.validators.optional(attr.validators.instance_of(Path)))
+    source: ClosableIter[str] = attr.ib() # Todo: validate
+
+    options: KgtkReaderOptions = attr.ib(validator=attr.validators.instance_of(KgtkReaderOptions))
+
+    value_options: KgtkValueOptions = attr.ib(validator=attr.validators.instance_of(KgtkValueOptions))
+
+    column_names: typing.List[str] = attr.ib(validator=attr.validators.deep_iterable(member_validator=attr.validators.instance_of(str),
+                                                                                     iterable_validator=attr.validators.instance_of(list)))
+    # For convenience, the count of columns. This is the same as len(column_names).
+    column_count: int = attr.ib(validator=attr.validators.instance_of(int))
+
+    column_name_map: typing.Mapping[str, int] = attr.ib(validator=attr.validators.deep_mapping(key_validator=attr.validators.instance_of(str),
+                                                                                               value_validator=attr.validators.instance_of(int)))
+
+    # The index of the mandatory columns.
-1 means missing: + node1_column_idx: int = attr.ib(validator=attr.validators.instance_of(int), default=-1) # edge file + node2_column_idx: int = attr.ib(validator=attr.validators.instance_of(int), default=-1) # edge file + label_column_idx: int = attr.ib(validator=attr.validators.instance_of(int), default=-1) # edge file + id_column_idx: int = attr.ib(validator=attr.validators.instance_of(int), default=-1) # node file + + data_lines_read: int = attr.ib(validator=attr.validators.instance_of(int), default=0) + data_lines_passed: int = attr.ib(validator=attr.validators.instance_of(int), default=0) + data_lines_ignored: int = attr.ib(validator=attr.validators.instance_of(int), default=0) + data_errors_reported: int = attr.ib(validator=attr.validators.instance_of(int), default=0) + # Is this an edge file or a node file? is_edge_file: bool = attr.ib(validator=attr.validators.instance_of(bool), default=False) is_node_file: bool = attr.ib(validator=attr.validators.instance_of(bool), default=False) + # Feedback and error output: + error_file: typing.TextIO = attr.ib(default=sys.stderr) verbose: bool = attr.ib(validator=attr.validators.instance_of(bool), default=False) very_verbose: bool = attr.ib(validator=attr.validators.instance_of(bool), default=False) - class Mode(Enum): - """ - There are four file reading modes: - """ - NONE = 0 # Enforce neither edge nore node file required columns - EDGE = 1 # Enforce edge file required columns - NODE = 2 # Enforce node file require columns - AUTO = 3 # Automatically decide whether to enforce edge or node file required columns + @classmethod + def _default_options( + cls, + options: typing.Optional[KgtkReaderOptions] = None, + value_options: typing.Optional[KgtkValueOptions] = None, + )->typing.Tuple[KgtkReaderOptions, KgtkValueOptions]: + # Supply the default reader and value options: + if options is None: + options = DEFAULT_KGTK_READER_OPTIONS + if value_options is None: + value_options = DEFAULT_KGTK_VALUE_OPTIONS + + return (options, value_options) @classmethod def open(cls, file_path: typing.Optional[Path], - force_column_names: typing.Optional[typing.List[str]] = None, - skip_first_record: bool = False, - fill_short_lines: bool = False, - truncate_long_lines: bool = False, error_file: typing.TextIO = sys.stderr, - error_limit: int = ERROR_LIMIT_DEFAULT, - empty_line_action: ValidationAction = ValidationAction.EXCLUDE, - comment_line_action: ValidationAction = ValidationAction.EXCLUDE, - whitespace_line_action: ValidationAction = ValidationAction.EXCLUDE, - blank_line_action: ValidationAction = ValidationAction.EXCLUDE, - blank_node1_line_action: typing.Optional[ValidationAction] = None, - blank_node2_line_action: typing.Optional[ValidationAction] = None, - blank_id_line_action: typing.Optional[ValidationAction] = None, - short_line_action: ValidationAction = ValidationAction.EXCLUDE, - long_line_action: ValidationAction = ValidationAction.EXCLUDE, - invalid_value_action: ValidationAction = ValidationAction.REPORT, - header_error_action: ValidationAction = ValidationAction.EXIT, - unsafe_column_name_action: ValidationAction = ValidationAction.REPORT, + options: typing.Optional[KgtkReaderOptions] = None, value_options: typing.Optional[KgtkValueOptions] = None, - compression_type: typing.Optional[str] = None, - gzip_in_parallel: bool = False, - gzip_queue_size: int = GZIP_QUEUE_SIZE_DEFAULT, - column_separator: str = KgtkFormat.COLUMN_SEPARATOR, - mode: Mode = Mode.AUTO, verbose: bool = False, very_verbose: bool = False)->"KgtkReader": """ Opens a KGTK 
file, which may be an edge file or a node file. The appropriate reader is returned. """ - source: ClosableIter[str] = cls._openfile(file_path, - compression_type=compression_type, - gzip_in_parallel=gzip_in_parallel, - gzip_queue_size=gzip_queue_size, - error_file=error_file, - verbose=verbose) + + # Supply the default reader and value options: + (options, value_options) = cls._default_options(options, value_options) + + source: ClosableIter[str] = cls._openfile(file_path, options=options, error_file=error_file, verbose=verbose) # Read the kgtk file header and split it into column names. We get the # header back, too, for use in debugging and error messages. header: str column_names: typing.List[str] - (header, column_names) = cls._build_column_names(source, - force_column_names=force_column_names, - skip_first_record=skip_first_record, - column_separator=column_separator, - error_file=error_file, - verbose=verbose) + (header, column_names) = cls._build_column_names(source, options, error_file=error_file, verbose=verbose) # Check for unsafe column names. cls.check_column_names(column_names, header_line=header, - error_action=unsafe_column_name_action, + error_action=options.unsafe_column_name_action, error_file=error_file) # Build a map from column name to column index. column_name_map: typing.Mapping[str, int] = cls.build_column_name_map(column_names, header_line=header, - error_action=header_error_action, + error_action=options.header_error_action, error_file=error_file) # Should we automatically determine if this is an edge file or a node file? is_edge_file: bool = False is_node_file: bool = False - if mode is KgtkReader.Mode.AUTO: + if options.mode is KgtkReaderMode.AUTO: # If we have a node1 (or alias) column, then this must be an edge file. Otherwise, assume it is a node file. node1_idx: int = cls.get_column_idx(cls.NODE1_COLUMN_NAMES, column_name_map, header_line=header, - error_action=header_error_action, + error_action=options.header_error_action, error_file=error_file, is_optional=True) if node1_idx >= 0: @@ -206,11 +352,11 @@ def open(cls, if verbose: print("node1 column not found, assuming this is a KGTK node file", file=error_file, flush=True) - elif mode is KgtkReader.Mode.EDGE: + elif options.mode is KgtkReaderMode.EDGE: is_edge_file = True - elif mode is KgtkReader.Mode.NODE: + elif options.mode is KgtkReaderMode.NODE: is_node_file = True - elif mode is KgtkReader.Mode.NONE: + elif options.mode is KgtkReaderMode.NONE: pass if is_edge_file: @@ -224,50 +370,23 @@ def open(cls, label_column_idx: int (node1_column_idx, node2_column_idx, label_column_idx) = cls.required_edge_columns(column_name_map, header_line=header, - error_action=header_error_action, + error_action=options.header_error_action, error_file=error_file) if verbose: print("KgtkReader: Reading an edge file. 
node1=%d label=%d node2=%d" % (node1_column_idx, label_column_idx, node2_column_idx), file=error_file, flush=True) - # Apply the proper defaults to the blank node1, node2, and id actions: - if blank_node1_line_action is None: - blank_node1_line_action = blank_line_action - if blank_node2_line_action is None: - blank_node2_line_action = blank_line_action - if blank_id_line_action is None: - blank_id_line_action = ValidationAction.PASS - return EdgeReader(file_path=file_path, source=source, - column_separator=column_separator, column_names=column_names, column_name_map=column_name_map, column_count=len(column_names), node1_column_idx=node1_column_idx, node2_column_idx=node2_column_idx, label_column_idx=label_column_idx, - force_column_names=force_column_names, - skip_first_record=skip_first_record, - fill_short_lines=fill_short_lines, - truncate_long_lines=truncate_long_lines, error_file=error_file, - error_limit=error_limit, - empty_line_action=empty_line_action, - comment_line_action=comment_line_action, - whitespace_line_action=whitespace_line_action, - blank_node1_line_action=blank_node1_line_action, - blank_node2_line_action=blank_node2_line_action, - blank_id_line_action=blank_id_line_action, - short_line_action=short_line_action, - long_line_action=long_line_action, - invalid_value_action=invalid_value_action, - header_error_action=header_error_action, - unsafe_column_name_action=unsafe_column_name_action, + options=options, value_options=value_options, - compression_type=compression_type, - gzip_in_parallel=gzip_in_parallel, - gzip_queue_size=gzip_queue_size, is_edge_file=is_edge_file, is_node_file=is_node_file, verbose=verbose, @@ -281,89 +400,35 @@ def open(cls, # Get the index of the required column: id_column_idx: int = cls.required_node_column(column_name_map, header_line=header, - error_action=header_error_action, + error_action=options.header_error_action, error_file=error_file) if verbose: print("KgtkReader: Reading an node file. 
id=%d" % (id_column_idx), file=error_file, flush=True) - # Apply the proper defaults to the blank node1, node2, and id actions: - if blank_node1_line_action is None: - blank_node1_line_action = ValidationAction.PASS - if blank_node2_line_action is None: - blank_node2_line_action = ValidationAction.PASS - if blank_id_line_action is None: - blank_id_line_action = blank_line_action - return NodeReader(file_path=file_path, source=source, - column_separator=column_separator, column_names=column_names, column_name_map=column_name_map, column_count=len(column_names), id_column_idx=id_column_idx, - force_column_names=force_column_names, - skip_first_record=skip_first_record, - fill_short_lines=fill_short_lines, - truncate_long_lines=truncate_long_lines, error_file=error_file, - error_limit=error_limit, - empty_line_action=empty_line_action, - comment_line_action=comment_line_action, - whitespace_line_action=whitespace_line_action, - blank_node1_line_action=blank_node1_line_action, - blank_node2_line_action=blank_node2_line_action, - blank_id_line_action=blank_id_line_action, - short_line_action=short_line_action, - long_line_action=long_line_action, - invalid_value_action=invalid_value_action, - header_error_action=header_error_action, - unsafe_column_name_action=unsafe_column_name_action, + options=options, value_options=value_options, - compression_type=compression_type, - gzip_in_parallel=gzip_in_parallel, - gzip_queue_size=gzip_queue_size, is_edge_file=is_edge_file, is_node_file=is_node_file, verbose=verbose, very_verbose=very_verbose, ) else: - # Apply the proper defaults to the blank node1, node2, and id actions: - if blank_node1_line_action is None: - blank_node1_line_action = ValidationAction.PASS - if blank_node2_line_action is None: - blank_node2_line_action = ValidationAction.PASS - if blank_id_line_action is None: - blank_id_line_action = ValidationAction.PASS - return cls(file_path=file_path, source=source, - column_separator=column_separator, column_names=column_names, column_name_map=column_name_map, column_count=len(column_names), - force_column_names=force_column_names, - skip_first_record=skip_first_record, - fill_short_lines=fill_short_lines, - truncate_long_lines=truncate_long_lines, error_file=error_file, - error_limit=error_limit, - empty_line_action=empty_line_action, - comment_line_action=comment_line_action, - whitespace_line_action=whitespace_line_action, - blank_node1_line_action=blank_node1_line_action, - blank_node2_line_action=blank_node2_line_action, - blank_id_line_action=blank_id_line_action, - short_line_action=short_line_action, - long_line_action=long_line_action, - invalid_value_action=invalid_value_action, - header_error_action=header_error_action, - unsafe_column_name_action=unsafe_column_name_action, + options=options, value_options=value_options, - compression_type=compression_type, - gzip_in_parallel=gzip_in_parallel, - gzip_queue_size=gzip_queue_size, is_edge_file=is_edge_file, is_node_file=is_node_file, verbose=verbose, @@ -404,16 +469,15 @@ def _open_compressed_file(cls, raise ValueError("%s: Unexpected compression_type '%s'" % (who, compression_type)) @classmethod - def _openfile(cls, file_path: typing.Optional[Path], - compression_type: typing.Optional[str], - gzip_in_parallel: bool, - gzip_queue_size: int, + def _openfile(cls, + file_path: typing.Optional[Path], + options: KgtkReaderOptions, error_file: typing.TextIO, verbose: bool)->ClosableIter[str]: who: str = cls.__name__ if file_path is None or str(file_path) == "-": - if compression_type is not 
None and len(compression_type) > 0: - return ClosableIterTextIOWrapper(cls._open_compressed_file(compression_type, "-", sys.stdin, who, error_file, verbose)) + if options.compression_type is not None and len(options.compression_type) > 0: + return ClosableIterTextIOWrapper(cls._open_compressed_file(options.compression_type, "-", sys.stdin, who, error_file, verbose)) else: if verbose: print("%s: reading stdin" % who, file=error_file, flush=True) @@ -423,8 +487,8 @@ def _openfile(cls, file_path: typing.Optional[Path], print("%s: File_path.suffix: %s" % (who, file_path.suffix), file=error_file, flush=True) gzip_file: typing.TextIO - if compression_type is not None and len(compression_type) > 0: - gzip_file = cls._open_compressed_file(compression_type, str(file_path), file_path, who, error_file, verbose) + if options.compression_type is not None and len(options.compression_type) > 0: + gzip_file = cls._open_compressed_file(options.compression_type, str(file_path), file_path, who, error_file, verbose) elif file_path.suffix in [".bz2", ".gz", ".lz4", ".xz"]: gzip_file = cls._open_compressed_file(file_path.suffix, str(file_path), file_path, who, error_file, verbose) else: @@ -432,8 +496,8 @@ def _openfile(cls, file_path: typing.Optional[Path], print("%s: reading file %s" % (who, str(file_path))) return ClosableIterTextIOWrapper(open(file_path, "r")) - if gzip_in_parallel: - gzip_thread: GunzipProcess = GunzipProcess(gzip_file, Queue(gzip_queue_size)) + if options.gzip_in_parallel: + gzip_thread: GunzipProcess = GunzipProcess(gzip_file, Queue(options.gzip_queue_size)) gzip_thread.start() return gzip_thread else: @@ -443,9 +507,7 @@ def _openfile(cls, file_path: typing.Optional[Path], @classmethod def _build_column_names(cls, source: ClosableIter[str], - force_column_names: typing.Optional[typing.List[str]], - skip_first_record: bool, - column_separator: str, + options: KgtkReaderOptions, error_file: typing.TextIO, verbose: bool = False, )->typing.Tuple[str, typing.List[str]]: @@ -453,7 +515,7 @@ def _build_column_names(cls, Read the kgtk file header and split it into column names. """ column_names: typing.List[str] - if force_column_names is None: + if options.force_column_names is None: # Read the column names from the first line, stripping end-of-line characters. # # TODO: if the read fails, throw a more useful exception with the line number. @@ -465,18 +527,18 @@ def _build_column_names(cls, print("header: %s" % header, file=error_file, flush=True) # Split the first line into column names. - return header, header.split(column_separator) + return header, header.split(options.column_separator) else: # Skip the first record to override the column names in the file. # Do not skip the first record if the file does not hae a header record. - if skip_first_record: + if options.skip_first_record: try: next(source) except StopIteration: raise ValueError("No header line to skip") # Use the forced column names. 
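The branch ending here is the forced-column-names path of _build_column_names, just before the return statements below. A usage sketch (the file name is hypothetical, and a node1/label/node2 header is assumed so that AUTO mode detects an edge file):

    from pathlib import Path
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions

    # Override a wrong header row: supply the names and skip the bad record.
    options = KgtkReaderOptions(force_column_names=["node1", "label", "node2"],
                                skip_first_record=True)
    kr = KgtkReader.open(Path("mislabeled.tsv"), options=options)
    print(kr.column_names)   # ['node1', 'label', 'node2']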
- return column_separator.join(force_column_names), force_column_names + return options.column_separator.join(options.force_column_names), options.force_column_names def close(self): self.source.close() @@ -503,7 +565,7 @@ def exclude_line(self, action: ValidationAction, msg: str, line: str)->bool: print("In input data line %d, %s: %s" % (self.data_lines_read, msg, line), file=self.error_file, flush=True) self.data_errors_reported += 1 - if self.error_limit > 0 and self.data_errors_reported >= self.error_limit: + if self.options.error_limit > 0 and self.data_errors_reported >= self.options.error_limit: raise ValueError("Too many data errors, exiting.") return result @@ -534,44 +596,44 @@ def nextrow(self)-> typing.List[str]: print("'%s'" % line, file=self.error_file, flush=True) # Ignore empty lines. - if self.empty_line_action != ValidationAction.PASS and len(line) == 0: - if self.exclude_line(self.empty_line_action, "saw an empty line", line): + if self.options.empty_line_action != ValidationAction.PASS and len(line) == 0: + if self.exclude_line(self.options.empty_line_action, "saw an empty line", line): continue # Ignore comment lines: - if self.comment_line_action != ValidationAction.PASS and line[0] == self.COMMENT_INDICATOR: - if self.exclude_line(self.comment_line_action, "saw a comment line", line): + if self.options.comment_line_action != ValidationAction.PASS and line[0] == self.COMMENT_INDICATOR: + if self.exclude_line(self.options.comment_line_action, "saw a comment line", line): continue # Ignore whitespace lines - if self.whitespace_line_action != ValidationAction.PASS and line.isspace(): - if self.exclude_line(self.whitespace_line_action, "saw a whitespace line", line): + if self.options.whitespace_line_action != ValidationAction.PASS and line.isspace(): + if self.exclude_line(self.options.whitespace_line_action, "saw a whitespace line", line): continue - row = line.split(self.column_separator) + row = line.split(self.options.column_separator) # Optionally fill missing trailing columns with empty row: - if self.fill_short_lines and len(row) < self.column_count: + if self.options.fill_short_lines and len(row) < self.column_count: while len(row) < self.column_count: row.append("") # Optionally remove extra trailing columns: - if self.truncate_long_lines and len(row) > self.column_count: + if self.options.truncate_long_lines and len(row) > self.column_count: row = row[:self.column_count] # Optionally validate that the line contained the right number of columns: # # When we report line numbers in error messages, line 1 is the first line after the header line. 
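Before the short- and long-line checks resume below, a sketch of how the repair flags above interact with them: filling and truncating happen first, so repaired rows never trigger the exclusion actions. The file name is hypothetical:

    from pathlib import Path
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions

    # Pad short rows and trim long ones instead of excluding them:
    options = KgtkReaderOptions(fill_short_lines=True, truncate_long_lines=True)
    kr = KgtkReader.open(Path("ragged.tsv"), options=options)
    for row in kr:
        assert len(row) == kr.column_count   # every yielded row is rectangular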
- if self.short_line_action != ValidationAction.PASS and len(row) < self.column_count: - if self.exclude_line(self.short_line_action, + if self.options.short_line_action != ValidationAction.PASS and len(row) < self.column_count: + if self.exclude_line(self.options.short_line_action, "Required %d columns, saw %d: '%s'" % (self.column_count, len(row), line), line): continue - if self.long_line_action != ValidationAction.PASS and len(row) > self.column_count: - if self.exclude_line(self.long_line_action, + if self.options.long_line_action != ValidationAction.PASS and len(row) > self.column_count: + if self.exclude_line(self.options.long_line_action, "Required %d columns, saw %d (%d extra): '%s'" % (self.column_count, len(row), len(row) - self.column_count, @@ -582,7 +644,7 @@ def nextrow(self)-> typing.List[str]: if self._ignore_if_blank_fields(row, line): continue - if self.invalid_value_action != ValidationAction.PASS: + if self.options.invalid_value_action != ValidationAction.PASS: # TODO: find a way to optionally cache the KgtkValue objects # so we don't have to create them a second time in the conversion # and iterator methods below. @@ -634,11 +696,10 @@ def to_kgtk_values(self, row: typing.List[str], validate: bool = False)->typing. When validate is True, validate each KgtkValue object. """ - options: KgtkValueOptions = self.value_options if self.value_options is not None else DEFAULT_KGTK_VALUE_OPTIONS results: typing.List[KgtkValue] = [ ] field: str for field in row: - kv = KgtkValue(field, options=options) + kv = KgtkValue(field, options=self.value_options) if validate: kv.validate() results.append(kv) @@ -663,14 +724,13 @@ def to_concise_kgtk_values(self, row: typing.List[str], validate: bool = False)- When validate is True, validate each KgtkValue object. """ - options: KgtkValueOptions = self.value_options if self.value_options is not None else DEFAULT_KGTK_VALUE_OPTIONS results: typing.List[typing.Optional[KgtkValue]] = [ ] field: str for field in row: if len(field) == 0: results.append(None) else: - kv = KgtkValue(field, options=options) + kv = KgtkValue(field, options=self.value_options) if validate: kv.validate() results.append(kv) @@ -733,7 +793,6 @@ def to_kgtk_value_dict(self, row: typing.List[str], validate: bool=False, concis When validate is True, validate each KgtkValue object. """ - options: KgtkValueOptions = self.value_options if self.value_options is not None else DEFAULT_KGTK_VALUE_OPTIONS results: typing.MutableMapping[str, KgtkValue] = { } idx: int = 0 field: str @@ -741,7 +800,7 @@ def to_kgtk_value_dict(self, row: typing.List[str], validate: bool=False, concis if concise and len(field) == 0: pass # Skip the empty field. else: - kv = KgtkValue(field, options=options) + kv = KgtkValue(field, options=self.value_options) if validate: kv.validate() results[self.column_names[idx]] = kv @@ -771,20 +830,19 @@ def _ignore_invalid_values(self, values: typing.List[str], line: str)->bool: Returns True to indicate that the row should be ignored (skipped). """ - options: KgtkValueOptions = self.value_options if self.value_options is not None else DEFAULT_KGTK_VALUE_OPTIONS problems: typing.List[str] = [ ] # Build a list of problems. idx: int value: str for idx, value in enumerate(values): if len(value) > 0: # Optimize the common case of empty columns. 
- kv: KgtkValue = KgtkValue(value, options=options) + kv: KgtkValue = KgtkValue(value, options=self.value_options) if not kv.is_valid(): problems.append("col %d (%s) value '%s'is an %s" % (idx, self.column_names[idx], value, kv.describe())) if len(problems) == 0: return False - return self.exclude_line(self.invalid_value_action, + return self.exclude_line(self.options.invalid_value_action, "; ".join(problems), line) @@ -822,7 +880,7 @@ def merge_columns(self, additional_columns: typing.List[str])->typing.List[str]: return merged_columns @classmethod - def add_operation_arguments(cls, parser: ArgumentParser): + def add_debug_arguments(cls, parser: ArgumentParser): errors_to = parser.add_mutually_exclusive_group() errors_to.add_argument( "--errors-to-stdout", dest="errors_to_stdout", help="Send errors to stdout instead of stderr", action="store_true") @@ -833,129 +891,6 @@ def add_operation_arguments(cls, parser: ArgumentParser): parser.add_argument( "--very-verbose", dest="very_verbose", help="Print additional progress messages.", action='store_true') - @classmethod - def add_file_arguments(cls, - parser: ArgumentParser, - node_options: bool = False, - edge_options: bool = False, - mode_options: bool = False, - optional_file: bool = True, - who: str = ""): - prefix1: str = "--" if len(who) == 0 else "--" + who + "-" - prefix2: str = "" if len(who) == 0 else who + "_" - prefix3: str = "" if len(who) == 0 else who + " " - - if optional_file: - parser.add_argument(dest=prefix2 + "kgtk_file", help="The " + who + " KGTK file to read", type=Path, nargs="?") - else: - parser.add_argument(dest=prefix2 + "kgtk_file", help="The " + who + " KGTK file to read", type=Path) - - fgroup: _ArgumentGroup = parser.add_argument_group(prefix3 + "File options", - "Options affecting " + prefix3 + "processing") - fgroup.add_argument(prefix1 + "column-separator", - dest=prefix2 + "column_separator", - help="Column separator.", type=str, default=cls.COLUMN_SEPARATOR) - - fgroup.add_argument(prefix1 + "compression-type", - dest=prefix2 + "compression_type", help="Specify the compression type.") - - fgroup.add_argument(prefix1 + "error-limit", - dest=prefix2 + "error_limit", - help="The maximum number of errors to report before failing", type=int, default=cls.ERROR_LIMIT_DEFAULT) - - fgroup.add_argument(prefix1 + "gzip-in-parallel", - dest=prefix2 + "gzip_in_parallel", help="Execute gzip in parallel.", action='store_true') - - fgroup.add_argument(prefix1 + "gzip-queue-size", - dest=prefix2 + "gzip_queue_size", - help="Queue size for parallel gzip.", type=int, default=cls.GZIP_QUEUE_SIZE_DEFAULT) - - if mode_options: - fgroup.add_argument(prefix1 + "mode", - dest=prefix2 + "mode", - help="Determine the KGTK file mode.", - type=KgtkReader.Mode, action=EnumNameAction, default=KgtkReader.Mode.AUTO) - - hgroup: _ArgumentGroup = parser.add_argument_group(prefix3 + "Header parsing", "Options affecting header parsing") - - hgroup.add_argument(prefix1 + "force-column-names", - dest=prefix2 + "force_column_names", help="Force the column names.", nargs='+') - - hgroup.add_argument(prefix1 + "header-error-action", - dest=prefix2 + "header_error_action", - help="The action to take when a header error is detected Only ERROR or EXIT are supported.", - type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXIT) - - hgroup.add_argument(prefix1 + "skip-first-record", - dest=prefix2 + "skip_first_record", - help="Skip the first record when forcing column names.", action='store_true') - - hgroup.add_argument(prefix1 + 
"unsafe-column-name-action", - dest=prefix2 + "unsafe_column_name_action", - help="The action to take when a column name is unsafe.", - type=ValidationAction, action=EnumNameAction, default=ValidationAction.REPORT) - - lgroup: _ArgumentGroup = parser.add_argument_group("Line parsing", "Options affecting data line parsing") - - if node_options: - lgroup.add_argument(prefix1 + "blank-id-line-action", - dest=prefix2 + "blank_id_line_action", - help="The action to take when a blank id field is detected.", - type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXCLUDE) - - if edge_options: - lgroup.add_argument(prefix1 + "blank-node1-line-action", - dest=prefix2 + "blank_node1_line_action", - help="The action to take when a blank node1 field is detected.", - type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXCLUDE) - - lgroup.add_argument(prefix1 + "blank-node2-line-action", - dest=prefix2 + "blank_node2_line_action", - help="The action to take when a blank node2 field is detected.", - type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXCLUDE) - lgroup.add_argument(prefix1 + "blank-required-field-line-action", - dest=prefix2 + "blank_line_action", - help="The action to take when a line with a blank node1, node2, or id field (per mode) is detected.", - type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXCLUDE) - - lgroup.add_argument(prefix1 + "comment-line-action", - dest=prefix2 + "comment_line_action", - help="The action to take when a comment line is detected.", - type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXCLUDE) - - lgroup.add_argument(prefix1 + "empty-line-action", - dest=prefix2 + "empty_line_action", - help="The action to take when an empty line is detected.", - type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXCLUDE) - - lgroup.add_argument(prefix1 + "fill-short-lines", - dest=prefix2 + "fill_short_lines", - help="Fill missing trailing columns in short lines with empty values.", action='store_true') - - lgroup.add_argument(prefix1 + "invalid-value-action", - dest=prefix2 + "invalid_value_action", - help="The action to take when a data cell value is invalid.", - type=ValidationAction, action=EnumNameAction, default=ValidationAction.REPORT) - - lgroup.add_argument(prefix1 + "long-line-action", - dest=prefix2 + "long_line_action", - help="The action to take when a long line is detected.", - type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXCLUDE) - - lgroup.add_argument(prefix1 + "short-line-action", - dest=prefix2 + "short_line_action", - help="The action to take when a short line is detected.", - type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXCLUDE) - - lgroup.add_argument(prefix1 + "truncate-long-lines", - dest=prefix2 + "truncate_long_lines", - help="Remove excess trailing columns in long lines.", action='store_true') - - lgroup.add_argument(prefix1 + "whitespace-line-action", - dest=prefix2 + "whitespace_line_action", - help="The action to take when a whitespace line is detected.", - type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXCLUDE) - def main(): """ Test the KGTK file reader. 
@@ -966,50 +901,32 @@ def main(): from kgtk.io.nodereader import NodeReader parser = ArgumentParser() - KgtkReader.add_operation_arguments(parser) - KgtkReader.add_file_arguments(parser, node_options=True, edge_options=True, mode_options=True) - KgtkValueOptions.add_arguments(parser) - + parser.add_argument(dest="kgtk_file", help="The KGTK file to read", type=Path, nargs="?") + KgtkReader.add_debug_arguments(parser) parser.add_argument( "--test", dest="test_method", help="The test to perform", choices=["rows", "concise-rows", "kgtk-values", "concise-kgtk-values", "dicts", "concise-dicts", "kgtk-value-dicts", "concise-kgtk-value-dicts"], default="rows") - parser.add_argument( "--test-valdate", dest="test_validate", help="Validate KgtkValue objects in test.", action='store_true') + parser.add_argument( "--test-validate", dest="test_validate", help="Validate KgtkValue objects in test.", action='store_true') + + KgtkReaderOptions.add_arguments(parser, mode_options=True) + KgtkValueOptions.add_arguments(parser) args = parser.parse_args() error_file: typing.TextIO = sys.stdout if args.errors_to_stdout else sys.stderr - # Build the value parsing option structure. + # Build the option structures. + reader_options: KgtkReaderOptions = KgtkReaderOptions.from_args(args) value_options: KgtkValueOptions = KgtkValueOptions.from_args(args) kr: KgtkReader = KgtkReader.open(args.kgtk_file, - force_column_names=args.force_column_names, - skip_first_record=args.skip_first_record, - fill_short_lines=args.fill_short_lines, - truncate_long_lines=args.truncate_long_lines, error_file = error_file, - error_limit=args.error_limit, - empty_line_action=args.empty_line_action, - comment_line_action=args.comment_line_action, - whitespace_line_action=args.whitespace_line_action, - blank_line_action=args.blank_line_action, - blank_node1_line_action=args.blank_node1_line_action, - blank_node2_line_action=args.blank_node2_line_action, - blank_id_line_action=args.blank_id_line_action, - short_line_action=args.short_line_action, - long_line_action=args.long_line_action, - invalid_value_action=args.invalid_value_action, - header_error_action=args.header_error_action, - unsafe_column_name_action=args.unsafe_column_name_action, + options=reader_options, value_options=value_options, - compression_type=args.compression_type, - gzip_in_parallel=args.gzip_in_parallel, - gzip_queue_size=args.gzip_queue_size, - column_separator=args.column_separator, - mode=args.mode, - verbose=args.verbose, very_verbose=args.very_verbose) + verbose=args.verbose, + very_verbose=args.very_verbose) line_count: int = 0 row: typing.List[str] @@ -1019,49 +936,49 @@ def main(): kgtk_value_dict: typing.Mapping[str, str] if args.test_method == "rows": if args.verbose: - print("Testing iterating over rows.", flush=True) + print("Testing iterating over rows.", file=error_file, flush=True) for row in kr: line_count += 1 elif args.test_method == "concise-rows": if args.verbose: - print("Testing iterating over concise rows.", flush=True) + print("Testing iterating over concise rows.", file=error_file, flush=True) for row in kr.concise_rows(): line_count += 1 elif args.test_method == "kgtk-values": if args.verbose: - print("Testing iterating over KgtkValue rows.", flush=True) + print("Testing iterating over KgtkValue rows.", file=error_file, flush=True) for kgtk_values in kr.kgtk_values(validate=args.test_validate): line_count += 1 elif args.test_method == "concise-kgtk-values": if args.verbose: - print("Testing iterating over concise KgtkValue rows.", flush=True) 
+ print("Testing iterating over concise KgtkValue rows.", file=error_file, flush=True) for kgtk_values in kr.concise_kgtk_values(validate=args.test_validate): line_count += 1 elif args.test_method == "dicts": if args.verbose: - print("Testing iterating over dicts.", flush=True) + print("Testing iterating over dicts.", file=error_file, flush=True) for dict_row in kr.dicts(): line_count += 1 elif args.test_method == "concise-dicts": if args.verbose: - print("Testing iterating over concise dicts.", flush=True) + print("Testing iterating over concise dicts.", file=error_file, flush=True) for dict_row in kr.dicts(concise=True): line_count += 1 elif args.test_method == "kgtk-value-dicts": if args.verbose: - print("Testing iterating over KgtkValue dicts.", flush=True) + print("Testing iterating over KgtkValue dicts.", file=error_file, flush=True) for kgtk_value_dict in kr.kgtk_value_dicts(validate=args.test_validate): line_count += 1 elif args.test_method == "concise-kgtk-value-dicts": if args.verbose: - print("Testing iterating over concise KgtkValue dicts.", flush=True) + print("Testing iterating over concise KgtkValue dicts.", file=error_file, flush=True) for kgtk_value_dict in kr.kgtk_value_dicts(concise=True, validate=args.test_validate): line_count += 1 @@ -1069,3 +986,4 @@ def main(): if __name__ == "__main__": main() + diff --git a/kgtk/io/nodereader.py b/kgtk/io/nodereader.py index b65d30a07..56702a73a 100644 --- a/kgtk/io/nodereader.py +++ b/kgtk/io/nodereader.py @@ -10,7 +10,7 @@ import sys import typing -from kgtk.io.kgtkreader import KgtkReader +from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions from kgtk.utils.closableiter import ClosableIter from kgtk.utils.enumnameaction import EnumNameAction from kgtk.utils.validationaction import ValidationAction @@ -22,60 +22,37 @@ class NodeReader(KgtkReader): @classmethod def open_node_file(cls, file_path: typing.Optional[Path], - force_column_names: typing.Optional[typing.List[str]] = None, # - skip_first_record: bool = False, - fill_short_lines: bool = False, - truncate_long_lines: bool = False, error_file: typing.TextIO = sys.stderr, - error_limit: int = KgtkReader.ERROR_LIMIT_DEFAULT, - empty_line_action: ValidationAction = ValidationAction.EXCLUDE, - comment_line_action: ValidationAction = ValidationAction.EXCLUDE, - whitespace_line_action: ValidationAction = ValidationAction.EXCLUDE, - blank_id_line_action: ValidationAction = ValidationAction.EXCLUDE, - short_line_action: ValidationAction = ValidationAction.EXCLUDE, - long_line_action: ValidationAction = ValidationAction.EXCLUDE, - invalid_value_action: ValidationAction = ValidationAction.REPORT, - header_error_action: ValidationAction = ValidationAction.EXIT, - unsafe_column_name_action: ValidationAction = ValidationAction.REPORT, + options: typing.Optional[KgtkReaderOptions] = None, value_options: typing.Optional[KgtkValueOptions] = None, - compression_type: typing.Optional[str] = None, - gzip_in_parallel: bool = False, - gzip_queue_size: int = KgtkReader.GZIP_QUEUE_SIZE_DEFAULT, - column_separator: str = KgtkReader.COLUMN_SEPARATOR, verbose: bool = False, very_verbose: bool = False)->"NodeReader": - source: ClosableIter[str] = cls._openfile(file_path, - compression_type=compression_type, - gzip_in_parallel=gzip_in_parallel, - gzip_queue_size=gzip_queue_size, - error_file=error_file, - verbose=verbose) + # Supply the default reader and value options: + (options, value_options) = cls._default_options(options, value_options) - # Read the node file header and split it into 
column names.
+        source: ClosableIter[str] = cls._openfile(file_path, options=options, error_file=error_file, verbose=verbose)
+
+        # Read the node file header and split it into column names.
         header: str
         column_names: typing.List[str]
-        (header, column_names) = cls._build_column_names(source,
-                                                         force_column_names=force_column_names,
-                                                         skip_first_record=skip_first_record,
-                                                         column_separator=column_separator,
-                                                         error_file=error_file,
-                                                         verbose=verbose)
+        (header, column_names) = cls._build_column_names(source, options=options, error_file=error_file, verbose=verbose)
+
         # Check for unsafe column names.
         cls.check_column_names(column_names,
                                header_line=header,
-                               error_action=unsafe_column_name_action,
+                               error_action=options.unsafe_column_name_action,
                                error_file=error_file)
 
         # Build a map from column name to column index.
         column_name_map: typing.Mapping[str, int] = cls.build_column_name_map(column_names,
                                                                               header_line=header,
-                                                                              error_action=header_error_action,
+                                                                              error_action=options.header_error_action,
                                                                               error_file=error_file)
 
         # Get the index of the required column.
         id_column_idx: int = cls.required_node_column(column_name_map,
                                                       header_line=header,
-                                                      error_action=header_error_action,
+                                                      error_action=options.header_error_action,
                                                       error_file=error_file)
 
         if verbose:
@@ -83,44 +60,27 @@ def open_node_file(cls,
 
         return cls(file_path=file_path,
                    source=source,
-                   column_separator=column_separator,
                    column_names=column_names,
                    column_name_map=column_name_map,
                    column_count=len(column_names),
                    id_column_idx=id_column_idx,
-                   force_column_names=force_column_names,
-                   skip_first_record=skip_first_record,
-                   fill_short_lines=fill_short_lines,
-                   truncate_long_lines=truncate_long_lines,
                    error_file=error_file,
-                   error_limit=error_limit,
-                   empty_line_action=empty_line_action,
-                   comment_line_action=comment_line_action,
-                   whitespace_line_action=whitespace_line_action,
-                   blank_id_line_action=blank_id_line_action,
-                   short_line_action=short_line_action,
-                   long_line_action=long_line_action,
-                   invalid_value_action=invalid_value_action,
-                   header_error_action=header_error_action,
-                   unsafe_column_name_action=unsafe_column_name_action,
+                   options=options,
                    value_options=value_options,
-                   compression_type=compression_type,
-                   gzip_in_parallel=gzip_in_parallel,
-                   gzip_queue_size=gzip_queue_size,
                    is_edge_file=False,
                    is_node_file=True,
                    verbose=verbose,
                    very_verbose=very_verbose,
                    )
 
-    def _ignore_if_blank_fields(self, values: typing.List[str], line: str)->bool:
+    def _ignore_if_blank_required_fields(self, values: typing.List[str], line: str)->bool:
         # Ignore line_action with blank id fields. This code comes after
         # filling missing trailing columns, although it could be reworked
         # to come first.
-        if self.blank_id_line_action != ValidationAction.PASS and self.id_column_idx >= 0 and len(values) > self.id_column_idx:
+        if self.options.blank_required_field_line_action != ValidationAction.PASS and self.id_column_idx >= 0 and len(values) > self.id_column_idx:
             id_value: str = values[self.id_column_idx]
             if len(id_value) == 0 or id_value.isspace():
-                return self.exclude_line(self.blank_id_line_action, "id is blank", line)
+                return self.exclude_line(self.options.blank_required_field_line_action, "id is blank", line)
         return False # Do not ignore this line
 
     def _skip_reserved_fields(self, column_name)->bool:
@@ -133,43 +93,28 @@
 def main():
     """
     Test the KGTK node file reader.
""" parser = ArgumentParser() - KgtkReader.add_operation_arguments(parser) - KgtkReader.add_file_arguments(parser, node_options=True) + parser.add_argument(dest="kgtk_file", help="The KGTK edge file to read", type=Path, nargs="?") + KgtkReader.add_debug_arguments(parser) + KgtkReaderOptions.add_arguments(parser) KgtkValueOptions.add_arguments(parser) args = parser.parse_args() error_file: typing.TextIO = sys.stdout if args.errors_to_stdout else sys.stderr - # Build the value parsing option structure. + # Build the option structures. + reader_options: KgtkReaderOptions = KgtkReaderOptions.from_args(args, mode=KgtkReaderMode.NODE) value_options: KgtkValueOptions = KgtkValueOptions.from_args(args) - er: NodeReader = NodeReader.open(args.kgtk_file, - force_column_names=args.force_column_names, - skip_first_record=args.skip_first_record, - fill_short_lines=args.fill_short_lines, - truncate_long_lines=args.truncate_long_lines, - error_file=error_file, - error_limit=args.error_limit, - empty_line_action=args.empty_line_action, - comment_line=args.comment_line_action, - whitespace_line_action=args.whitespace_line_action, - blank_id_line_action=args.blank_id_line_action, - short_line_action=args.short_line_action, - long_line_action=args.long_line_action, - invalid_value_action=args.invalid_value_action, - header_error_action=args.header_error_action, - unsafe_column_name_action=args.unsafe_column_name_action, - value_options=value_options, - compression_type=args.compression_type, - gzip_in_parallel=args.gzip_in_parallel, - gzip_queue_size=args.gzip_queue_size, - column_separator=args.column_separator, - mode=KgtkReader.Mode.NODE, - verbose=args.verbose, very_verbose=args.very_verbose) + nr: NodeReader = NodeReader.open_edge_file(args.kgtk_file, + error_file=error_file, + options=reader_options, + value_options=value_options, + column_separator=args.column_separator, + verbose=args.verbose, very_verbose=args.very_verbose) line_count: int = 0 row: typing.List[str] - for row in er: + for row in nr: line_count += 1 print("Read %d lines" % line_count) From 2dc66d82b3def5d3baee565f24cc3bbd2264fa4e Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Mon, 11 May 2020 12:17:50 -0700 Subject: [PATCH 136/278] Continuing conversion to KgtkReaderOptions. --- kgtk/cli/clean_data.py | 163 ++++++---------------------------------- kgtk/cli/ifexists.py | 84 +++++++++------------ kgtk/cli/ifnotexists.py | 88 +++++++++------------- kgtk/cli/validate.py | 6 +- kgtk/join/ifexists.py | 132 ++++++++++++++++---------------- kgtk/join/kgtkjoiner.py | 91 ++++++++-------------- 6 files changed, 197 insertions(+), 367 deletions(-) diff --git a/kgtk/cli/clean_data.py b/kgtk/cli/clean_data.py index bb4059d6b..e52a4f518 100644 --- a/kgtk/cli/clean_data.py +++ b/kgtk/cli/clean_data.py @@ -2,17 +2,17 @@ Copy a KGTK file, validating it and producing a clean KGTK file (no comments, whitespace lines, etc.) as output. +TODO: Need KgtkWriterOptions. 
+ """ from pathlib import Path import sys import typing -from kgtk.kgtkformat import KgtkFormat -from kgtk.io.kgtkreader import KgtkReader +from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions from kgtk.io.kgtkwriter import KgtkWriter -from kgtk.utils.enumnameaction import EnumNameAction -from kgtk.utils.validationaction import ValidationAction +from kgtk.value.kgtkvalueoptions import KgtkValueOptions def parser(): return { @@ -26,120 +26,21 @@ def add_arguments(parser): Args: parser (argparse.ArgumentParser) """ - parser.add_argument( "input_file", nargs="?", help="The KGTK file to read. May be omitted or '-' for stdin.", type=Path,) - + parser.add_argument( "input_file", nargs="?", help="The KGTK file to read. May be omitted or '-' for stdin.", type=Path) parser.add_argument( "output_file", nargs="?", help="The KGTK file to write. May be omitted or '-' for stdout.", type=Path) - parser.add_argument( "--blank-id-line-action", dest="blank_id_line_action", - help="The action to take when a blank id field is detected.", - type=ValidationAction, action=EnumNameAction, default=None) - - parser.add_argument( "--blank-node1-line-action", dest="blank_node1_line_action", - help="The action to take when a blank node1 field is detected.", - type=ValidationAction, action=EnumNameAction, default=None) - - parser.add_argument( "--blank-node2-line-action", dest="blank_node2_line_action", - help="The action to take when a blank node2 field is detected.", - type=ValidationAction, action=EnumNameAction, default=None) - - parser.add_argument( "--blank-required-field-line-action", dest="blank_line_action", - help="The action to take when a line with a blank node1, node2, or id field (per mode) is detected.", - type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXCLUDE) - - parser.add_argument( "--column-separator", dest="column_separator", - help="Column separator.", type=str, default=KgtkFormat.COLUMN_SEPARATOR) - - parser.add_argument( "--comment-line-action", dest="comment_line_action", - help="The action to take when a comment line is detected.", - type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXCLUDE) - - parser.add_argument( "--empty-line-action", dest="empty_line_action", - help="The action to take when an empty line is detected.", - type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXCLUDE) - - errors_to = parser.add_mutually_exclusive_group() - errors_to.add_argument( "--errors-to-stdout", dest="errors_to_stdout", - help="Send errors to stdout instead of stderr (default)", action="store_true") - errors_to.add_argument( "--errors-to-stderr", dest="errors_to_stderr", - help="Send errors to stderr instead of stdout", action="store_true") - - parser.add_argument( "--error-limit", dest="error_limit", - help="The maximum number of errors to report before failing", type=int, default=KgtkReader.ERROR_LIMIT_DEFAULT) - - parser.add_argument( "--fill-short-lines", dest="fill_short_lines", - help="Fill missing trailing columns in short lines with empty values.", action='store_true') - - parser.add_argument( "--force-column-names", dest="force_column_names", help="Force the column names.", nargs='+') - - parser.add_argument( "--gzip-in-parallel", dest="gzip_in_parallel", help="Execute gzip in parallel.", action='store_true') - - parser.add_argument( "--gzip-queue-size", dest="gzip_queue_size", - help="Queue size for parallel gzip.", type=int, default=KgtkReader.GZIP_QUEUE_SIZE_DEFAULT) - - parser.add_argument( "--header-error-action", 
dest="header_error_action", - help="The action to take when a header error is detected Only ERROR or EXIT are supported.", - type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXIT) - - parser.add_argument( "--input-compression", dest="input_compression_type", help="Specify the input file compression type, otherwise use the extension.") - - parser.add_argument( "--input-mode", dest="input_mode", - help="Determine the KGTK input file mode.", type=KgtkReader.Mode, action=EnumNameAction, default=KgtkReader.Mode.AUTO) - - parser.add_argument( "--long-line-action", dest="long_line_action", - help="The action to take when a long line is detected.", - type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXCLUDE) - - # Not yet implemented: - # parser.add_argument( "--output-compression", dest="input_compression_type", help="Specify the input file compression type, otherwise use the extension.") - - parser.add_argument( "--output-mode", dest="output_mode", - help="Determine the KGTK output file mode.", type=KgtkWriter.Mode, action=EnumNameAction, default=KgtkWriter.Mode.AUTO) - - parser.add_argument( "--short-line-action", dest="short_line_action", - help="The action to take whe a short line is detected.", - type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXCLUDE) - - parser.add_argument( "--skip-first-record", dest="skip_first_record", help="Skip the first record when forcing column names.", action='store_true') - - parser.add_argument( "--truncate-long-lines", dest="truncate_long_lines", - help="Remove excess trailing columns in long lines.", action='store_true') - - parser.add_argument("-v", "--verbose", dest="verbose", help="Print additional progress messages.", action='store_true') - - parser.add_argument( "--very-verbose", dest="very_verbose", help="Print additional progress messages.", action='store_true') - - parser.add_argument( "--whitespace-line-action", dest="whitespace_line_action", - help="The action to take when a whitespace line is detected.", - type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXCLUDE) + KgtkReader.add_debug_arguments(parser) + KgtkReaderOptions.add_arguments(parser, mode_options=True) + KgtkValueOptions.add_arguments(parser) def run(input_file: typing.Optional[Path], output_file: typing.Optional[Path], - force_column_names: typing.Optional[typing.List[str]] = None, - skip_first_record: bool = False, - fill_short_lines: bool = False, - truncate_long_lines: bool = False, errors_to_stdout: bool = False, - error_limit: int = KgtkReader.ERROR_LIMIT_DEFAULT, - empty_line_action: ValidationAction = ValidationAction.EXCLUDE, - comment_line_action: ValidationAction = ValidationAction.EXCLUDE, - whitespace_line_action: ValidationAction = ValidationAction.EXCLUDE, - blank_line_action: ValidationAction = ValidationAction.EXCLUDE, - blank_node1_line_action: typing.Optional[ValidationAction] = None, - blank_node2_line_action: typing.Optional[ValidationAction] = None, - blank_id_line_action: typing.Optional[ValidationAction] = None, - short_line_action: ValidationAction = ValidationAction.EXCLUDE, - long_line_action: ValidationAction = ValidationAction.EXCLUDE, - header_error_action: ValidationAction = ValidationAction.EXIT, - input_compression_type: typing.Optional[str] = None, - # output_compression_type: typing.Optional[str] = None, # Not yet implemented - gzip_in_parallel: bool = False, - gzip_queue_size: int = KgtkReader.GZIP_QUEUE_SIZE_DEFAULT, - column_separator: str = 
KgtkFormat.COLUMN_SEPARATOR, - input_mode: KgtkReader.Mode = KgtkReader.Mode.AUTO, - output_mode: KgtkWriter.Mode = KgtkWriter.Mode.AUTO, + errors_to_stderr: bool = False, verbose: bool = False, very_verbose: bool = False, + **kwargs # Whatever KgtkReaderOptions and KgtkValueOptions want. )->int: # import modules locally from kgtk.exceptions import KGTKException @@ -147,48 +48,30 @@ def run(input_file: typing.Optional[Path], # Select where to send error messages, defaulting to stderr. error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr + # Build the option structures. + reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs) + value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs) + if verbose: if input_file is not None: - print("Cleaning data from '%s'" % str(input_file), file=error_file) + print("Cleaning data from '%s'" % str(input_file), file=error_file, flush=True) else: - print ("Cleaning data from stdin", file=error_file) + print ("Cleaning data from stdin", file=error_file, flush=True) if output_file is not None: - print("Writing data to '%s'" % str(output_file), file=error_file) + print("Writing data to '%s'" % str(output_file), file=error_file, flush=True) else: - print ("Writing data to stdin", file=error_file) + print ("Writing data to stdin", file=error_file, flush=True) try: kr: KgtkReader = KgtkReader.open(input_file, - force_column_names=force_column_names, - skip_first_record=skip_first_record, - fill_short_lines=fill_short_lines, - truncate_long_lines=truncate_long_lines, error_file=error_file, - error_limit=error_limit, - empty_line_action=empty_line_action, - comment_line_action=comment_line_action, - whitespace_line_action=whitespace_line_action, - blank_line_action=blank_line_action, - blank_node1_line_action=blank_node1_line_action, - blank_node2_line_action=blank_node2_line_action, - blank_id_line_action=blank_id_line_action, - short_line_action=short_line_action, - long_line_action=long_line_action, - compression_type=input_compression_type, - header_error_action=header_error_action, - gzip_in_parallel=gzip_in_parallel, - gzip_queue_size=gzip_queue_size, - column_separator=column_separator, - mode=input_mode, - verbose=verbose, very_verbose=very_verbose) + options=reader_options, + value_options=value_options, + verbose=verbose, + very_verbose=very_verbose) kw: KgtkWriter = KgtkWriter.open(kr.column_names, output_file, - header_error_action=header_error_action, - gzip_in_parallel=gzip_in_parallel, - gzip_queue_size=gzip_queue_size, - column_separator=column_separator, - mode=output_mode, verbose=verbose, very_verbose=very_verbose) line_count: int = 0 @@ -199,7 +82,7 @@ def run(input_file: typing.Optional[Path], kw.close() if verbose: - print("Copied %d clean data lines" % line_count, file=error_file) + print("Copied %d clean data lines" % line_count, file=error_file, flush=True) return 0 except Exception as e: diff --git a/kgtk/cli/ifexists.py b/kgtk/cli/ifexists.py index d6c2a8b23..112ff7eda 100644 --- a/kgtk/cli/ifexists.py +++ b/kgtk/cli/ifexists.py @@ -1,17 +1,16 @@ """Filter a KGTK file based on whether one or more records exist in a second KGTK file with matching values for one or more fields. 
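The cleaning command above reduces to a read-validate-write pass: open a reader, copy the rows that survive validation, count them, and report on the error stream. A minimal standalone sketch of that shape using only the standard library (the file names and the bare TSV handling are illustrative; the real command goes through KgtkReader and KgtkWriter):

    import csv
    import sys

    def copy_clean(in_path: str, out_path: str) -> int:
        # Copy TSV rows, excluding rows whose column count disagrees with the header.
        line_count: int = 0
        with open(in_path, newline="") as src, open(out_path, "w", newline="") as dst:
            reader = csv.reader(src, delimiter="\t")
            writer = csv.writer(dst, delimiter="\t")
            header = next(reader)
            writer.writerow(header)
            for row in reader:
                if len(row) != len(header):
                    continue  # drop the row, mirroring ValidationAction.EXCLUDE
                writer.writerow(row)
                line_count += 1
        print("Copied %d clean data lines" % line_count, file=sys.stderr, flush=True)
        return 0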
+ +TODO: Need KgtkWriterOptions """ from pathlib import Path import sys import typing -from kgtk.kgtkformat import KgtkFormat -from kgtk.io.kgtkreader import KgtkReader +from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions from kgtk.io.kgtkwriter import KgtkWriter from kgtk.join.ifexists import IfExists -from kgtk.utils.enumnameaction import EnumNameAction -from kgtk.utils.validationaction import ValidationAction from kgtk.value.kgtkvalueoptions import KgtkValueOptions def parser(): @@ -26,34 +25,21 @@ def add_arguments(parser): Args: parser (argparse.ArgumentParser) """ - parser.add_argument( "input_kgtk_file", nargs="?", help="The KGTK file to filter ('left' file). May be omitted or '-' for stdin.", type=Path) + parser.add_argument( "input_kgtk_file", nargs="?", help="The KGTK file to filter. May be omitted or '-' for stdin.", type=Path) - parser.add_argument( "--filter-on", dest="filter_kgtk_file", help="The KGTK file to filter against ('right' file).", type=Path, required=True) + parser.add_argument( "--filter-on", dest="_filter_kgtk_file", help="The KGTK file to filter against.", type=Path, required=True) parser.add_argument("-o", "--output-file", dest="output_kgtk_file", help="The KGTK file to write", type=Path, default=None) - parser.add_argument( "--left-keys", dest="left_keys", help="The key columns in the file being filtered.", nargs='*') - - parser.add_argument( "--right-keys", dest="right_keys", help="The key columns in the filter-on file.", nargs='*') - + parser.add_argument( "--input-keys", "--left-keys", dest="input_keys", help="The key columns in the file being filtered.", nargs='*') - # A subset of common arguments: - errors_to = parser.add_mutually_exclusive_group() - errors_to.add_argument( "--errors-to-stdout", dest="errors_to_stdout", - help="Send errors to stdout instead of stderr (default)", action="store_true") - errors_to.add_argument( "--errors-to-stderr", dest="errors_to_stderr", - help="Send errors to stderr instead of stdout", action="store_true") + parser.add_argument( "--filter-keys", "--right-keys", dest="filter_keys", help="The key columns in the filter-on file.", nargs='*') - parser.add_argument( "--error-limit", dest="error_limit", - help="The maximum number of errors to report before failing", type=int, default=KgtkReader.ERROR_LIMIT_DEFAULT) + parser.add_argument( "--field-separator", dest="field_separator", help="Separator for multifield keys", default=IfExists.FIELD_SEPARATOR_DEFAULT) - parser.add_argument( "--field-separator", dest="field_separator", - help="Field separator.", type=str, default=IfExists.FIELD_SEPARATOR_DEFAULT) - - parser.add_argument("-v", "--verbose", dest="verbose", help="Print additional progress messages.", action='store_true') - - parser.add_argument( "--very-verbose", dest="very_verbose", help="Print additional progress messages.", action='store_true') - + KgtkReader.add_file_arguments(parser, mode_options=True, who="input") + KgtkReader.add_file_arguments(parser, mode_options=True, who="filter") + KgtkValueOptions.add_arguments(parser) # Note: Any arguments described by KgtkValueOptions.add_arguments(...) 
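The who= parameter is what lets two readers (input and filter) share one command line: the same option set is registered twice, once per prefix. A rough sketch of that convention with plain argparse, mirroring the prefix1/prefix2 derivation that appears in KgtkReaderOptions.add_arguments later in this series (the --error-limit option and its default are illustrative):

    from argparse import ArgumentParser

    def add_reader_arguments(parser: ArgumentParser, who: str = "") -> None:
        # An empty who adds bare options; a non-empty who namespaces them.
        prefix1: str = "--" if len(who) == 0 else "--" + who + "-"
        prefix2: str = "" if len(who) == 0 else who + "_"
        parser.add_argument(prefix1 + "error-limit", dest=prefix2 + "error_limit",
                            type=int, default=1000)

    parser = ArgumentParser()
    add_reader_arguments(parser, who="input")
    add_reader_arguments(parser, who="filter")
    args = parser.parse_args(["--input-error-limit", "5"])
    assert args.input_error_limit == 5 and args.filter_error_limit == 1000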
@@ -64,45 +50,45 @@ def add_arguments(parser): def run(input_kgtk_file: typing.Optional[Path], filter_kgtk_file: Path, output_kgtk_file: typing.Optional[Path], - left_keys: typing.Optional[typing.List[str]], - right_keys: typing.Optional[typing.List[str]], + input_keys: typing.Optional[typing.List[str]], + filter_keys: typing.Optional[typing.List[str]], - # Some common arguments: - errors_to_stdout: bool = False, - errors_to_stderr: bool = False, - error_limit: int = KgtkReader.ERROR_LIMIT_DEFAULT, field_separator: str = IfExists.FIELD_SEPARATOR_DEFAULT, + + errors_to_stdout: bool = False, + errors_to_stderr: bool = True, verbose: bool = False, very_verbose: bool = False, - **kwargs # Whatever KgtkValueOptions wants. + **kwargs # Whatever KgtkFileOptions and KgtkValueOptions want. )->int: # import modules locally from kgtk.exceptions import KGTKException - if input_kgtk_file is None: - input_kgtk_file = Path("-") - # Select where to send error messages, defaulting to stderr. - # (Not used yet) - error_file: typing.TextIO = sys.stderr if errors_to_stderr else sys.stdout + error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr - # Build the value parsing option structure. + # Build the option structures. + input_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs, who="input") + filter_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs, who="filter") value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs) try: - ie: IfExists = IfExists(left_file_path=input_kgtk_file, - right_file_path=filter_kgtk_file, - output_path=output_kgtk_file, - left_keys=left_keys, - right_keys=right_keys, - field_separator=field_separator, - invalid_value_action=ValidationAction.PASS, - value_options=value_options, - error_limit=error_limit, - verbose=verbose, - very_verbose=very_verbose) + ie: IfExists = IfExists( + input_file_path=input_kgtk_file, + input_keys=input_keys, + filter_file_path=filter_kgtk_file, + filter_keys=filter_keys, + output_file_path=output_kgtk_file, + field_separator=field_separator, + input_reader_options=input_reader_options, + filter_reader_options=filter_reader_options, + value_options=value_options, + error_file=error_file, + verbose=verbose, + very_verbose=very_verbose, + ) ie.process() diff --git a/kgtk/cli/ifnotexists.py b/kgtk/cli/ifnotexists.py index bd9f5a52b..481f2ccbc 100644 --- a/kgtk/cli/ifnotexists.py +++ b/kgtk/cli/ifnotexists.py @@ -1,18 +1,16 @@ -"""Filter a KGTK file based on whether one or more records do not exist in a -second KGTK file with matching values for one or more fields. +"""Filter a KGTK file based on whether one or more records exist in a second +KGTK file with matching values for one or more fields. +TODO: Need KgtkWriterOptions """ from pathlib import Path import sys import typing -from kgtk.kgtkformat import KgtkFormat -from kgtk.io.kgtkreader import KgtkReader +from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions from kgtk.io.kgtkwriter import KgtkWriter from kgtk.join.ifexists import IfExists -from kgtk.utils.enumnameaction import EnumNameAction -from kgtk.utils.validationaction import ValidationAction from kgtk.value.kgtkvalueoptions import KgtkValueOptions def parser(): @@ -27,34 +25,21 @@ def add_arguments(parser): Args: parser (argparse.ArgumentParser) """ - parser.add_argument( "input_kgtk_file", nargs="?", help="The KGTK file to filter ('left' file). 
May be omitted or '-' for stdin.", type=Path) + parser.add_argument( "input_kgtk_file", nargs="?", help="The KGTK file to filter. May be omitted or '-' for stdin.", type=Path) - parser.add_argument( "--filter-on", dest="filter_kgtk_file", help="The KGTK file to filter against ('right' file).", type=Path, required=True) + parser.add_argument( "--filter-on", dest="_filter_kgtk_file", help="The KGTK file to filter against.", type=Path, required=True) parser.add_argument("-o", "--output-file", dest="output_kgtk_file", help="The KGTK file to write", type=Path, default=None) - parser.add_argument( "--left-keys", dest="left_keys", help="The key columns in the file being filtered.", nargs='*') + parser.add_argument( "--input-keys", "--left-keys", dest="input_keys", help="The key columns in the file being filtered.", nargs='*') - parser.add_argument( "--right-keys", dest="right_keys", help="The key columns in the filter-on file.", nargs='*') + parser.add_argument( "--filter-keys", "--right-keys", dest="filter_keys", help="The key columns in the filter-on file.", nargs='*') + parser.add_argument( "--field-separator", dest="field_separator", help="Separator for multifield keys", default=IfExists.FIELD_SEPARATOR_DEFAULT) - # A subset of common arguments: - errors_to = parser.add_mutually_exclusive_group() - errors_to.add_argument( "--errors-to-stdout", dest="errors_to_stdout", - help="Send errors to stdout instead of stderr (default)", action="store_true") - errors_to.add_argument( "--errors-to-stderr", dest="errors_to_stderr", - help="Send errors to stderr instead of stdout", action="store_true") - - parser.add_argument( "--error-limit", dest="error_limit", - help="The maximum number of errors to report before failing", type=int, default=KgtkReader.ERROR_LIMIT_DEFAULT) - - parser.add_argument( "--field-separator", dest="field_separator", - help="Field separator.", type=str, default=IfExists.FIELD_SEPARATOR_DEFAULT) - - parser.add_argument("-v", "--verbose", dest="verbose", help="Print additional progress messages.", action='store_true') - - parser.add_argument( "--very-verbose", dest="very_verbose", help="Print additional progress messages.", action='store_true') - + KgtkReader.add_file_arguments(parser, mode_options=True, who="input") + KgtkReader.add_file_arguments(parser, mode_options=True, who="filter") + KgtkValueOptions.add_arguments(parser) # Note: Any arguments described by KgtkValueOptions.add_arguments(...) @@ -65,45 +50,46 @@ def add_arguments(parser): def run(input_kgtk_file: typing.Optional[Path], filter_kgtk_file: Path, output_kgtk_file: typing.Optional[Path], - left_keys: typing.Optional[typing.List[str]], - right_keys: typing.Optional[typing.List[str]], + input_keys: typing.Optional[typing.List[str]], + filter_keys: typing.Optional[typing.List[str]], - # Some common arguments: - errors_to_stdout: bool = False, - errors_to_stderr: bool = False, - error_limit: int = KgtkReader.ERROR_LIMIT_DEFAULT, field_separator: str = IfExists.FIELD_SEPARATOR_DEFAULT, + + errors_to_stdout: bool = False, + errors_to_stderr: bool = True, verbose: bool = False, very_verbose: bool = False, - **kwargs # Whatever KgtkValueOptions wants. + **kwargs # Whatever KgtkFileOptions and KgtkValueOptions want. )->int: # import modules locally from kgtk.exceptions import KGTKException - if input_kgtk_file is None: - input_kgtk_file = Path("-") - # Select where to send error messages, defaulting to stderr. 
- # (Not used yet) - error_file: typing.TextIO = sys.stderr if errors_to_stderr else sys.stdout + error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr - # Build the value parsing option structure. + # Build the option structures. + input_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs, who="input") + filter_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs, who="filter") value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs) try: - ie: IfExists = IfExists(left_file_path=input_kgtk_file, - right_file_path=filter_kgtk_file, - output_path=output_kgtk_file, - invert=True, - left_keys=left_keys, - right_keys=right_keys, - field_separator=field_separator, - value_options=value_options, - error_limit=error_limit, - verbose=verbose, - very_verbose=very_verbose) + ie: IfExists = IfExists( + input_file_path=input_kgtk_file, + input_keys=input_keys, + filter_file_path=filter_kgtk_file, + filter_keys=filter_keys, + output_file_path=output_kgtk_file, + invert=True, + field_separator=field_separator, + input_reader_options=input_reader_options, + filter_reader_options=filter_reader_options, + value_options=value_options, + error_file=error_file, + verbose=verbose, + very_verbose=very_verbose, + ) ie.process() diff --git a/kgtk/cli/validate.py b/kgtk/cli/validate.py index 2b7c1a5dc..2fa013bfb 100644 --- a/kgtk/cli/validate.py +++ b/kgtk/cli/validate.py @@ -15,10 +15,7 @@ import sys import typing -from kgtk.kgtkformat import KgtkFormat from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions -from kgtk.utils.enumnameaction import EnumNameAction -from kgtk.utils.validationaction import ValidationAction from kgtk.value.kgtkvalueoptions import KgtkValueOptions def parser(): @@ -35,7 +32,6 @@ def add_arguments(parser): """ parser.add_argument( "kgtk_files", nargs="*", help="The KGTK file(s) to validate. May be omitted or '-' for stdin.", type=Path) - parser.add_argument( "--header-only", dest="header_only", help="Process the only the header of the input file.", action="store_true") @@ -50,7 +46,7 @@ def run(kgtk_files: typing.Optional[typing.List[typing.Optional[Path]]], header_only: bool = False, verbose: bool = False, very_verbose: bool = False, - **kwargs # Whatever KgtkValueOptions wants. + **kwargs # Whatever KgtkReaderOptions and KgtkValueOptions want. 
)->int: # import modules locally from kgtk.exceptions import KGTKException diff --git a/kgtk/join/ifexists.py b/kgtk/join/ifexists.py index 9a19e5ef8..1e85e44fa 100644 --- a/kgtk/join/ifexists.py +++ b/kgtk/join/ifexists.py @@ -25,7 +25,7 @@ import typing from kgtk.kgtkformat import KgtkFormat -from kgtk.io.kgtkreader import KgtkReader +from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions from kgtk.io.kgtkwriter import KgtkWriter from kgtk.utils.enumnameaction import EnumNameAction from kgtk.utils.validationaction import ValidationAction @@ -33,11 +33,11 @@ @attr.s(slots=True, frozen=True) class IfExists(KgtkFormat): - input_reader_args: typing.Mapping[str, typing.Any] = attr.ib() + input_file_path: typing.Optional[Path] = attr.ib(validator=attr.validators.optional(attr.validators.instance_of(Path))) input_keys: typing.Optional[typing.List[str]] = attr.ib(validator=attr.validators.optional(attr.validators.deep_iterable(member_validator=attr.validators.instance_of(str), iterable_validator=attr.validators.instance_of(list)))) - filter_reader_args: typing.Mapping[str, typing.Any] = attr.ib() + filter_file_path: Path = attr.ib(validator=attr.validators.instance_of(Path)) filter_keys: typing.Optional[typing.List[str]] = attr.ib(validator=attr.validators.optional(attr.validators.deep_iterable(member_validator=attr.validators.instance_of(str), iterable_validator=attr.validators.instance_of(list)))) @@ -48,10 +48,13 @@ class IfExists(KgtkFormat): invert: bool = attr.ib(validator=attr.validators.instance_of(bool), default=False) - # TODO: find a working validator + # TODO: find working validators # value_options: typing.Optional[KgtkValueOptions] = attr.ib(attr.validators.optional(attr.validators.instance_of(KgtkValueOptions)), default=None) + input_reader_options: typing.Optional[KgtkReaderOptions]= attr.ib(default=None) + filter_reader_options: typing.Optional[KgtkReaderOptions]= attr.ib(default=None) value_options: typing.Optional[KgtkValueOptions] = attr.ib(default=None) + error_file: typing.TextIO = attr.ib(default=sys.stderr) verbose: bool = attr.ib(validator=attr.validators.instance_of(bool), default=False) very_verbose: bool = attr.ib(validator=attr.validators.instance_of(bool), default=False) @@ -74,11 +77,11 @@ def get_edge_key_columns(self, kr: KgtkReader, who: str)-> typing.List[int]: if not kr.is_edge_file: raise ValueError("get_edge_keys called on %s at wrong time." % who) if kr.node1_column_idx < 0: - raise ValueError("The node1 column is missing from the %s node file." % who) + raise ValueError("The node1 column is missing from the %s edge file." % who) if kr.label_column_idx < 0: - raise ValueError("The label column is missing from the %s node file." % who) + raise ValueError("The label column is missing from the %s edge file." % who) if kr.node2_column_idx < 0: - raise ValueError("The node2 column is missing from the %s node file." % who) + raise ValueError("The node2 column is missing from the %s edge file." 
% who) return [ kr.node1_column_idx, kr.label_column_idx, kr.node2_column_idx ] def get_supplied_key_columns(self, supplied_keys: typing.List[str], kr: KgtkReader, who: str)->typing.List[int]: @@ -89,7 +92,7 @@ def get_supplied_key_columns(self, supplied_keys: typing.List[str], kr: KgtkRead raise ValueError("Column %s is not in the %s file" % (key, who)) result.append(kr.column_name_map[key]) return result - + def get_key_columns(self, supplied_keys: typing.Optional[typing.List[str]], kr: KgtkReader, other_kr: KgtkReader, who: str)->typing.List[int]: if supplied_keys is not None and len(supplied_keys) > 0: return self.get_supplied_key_columns(supplied_keys, kr, who) @@ -121,79 +124,76 @@ def extract_key_set(self, kr: KgtkReader, who: str, key_columns: typing.List[int def process(self): # Open the input files once. if self.verbose: - print("Opening the input file: %s" % self.left_file_path, flush=True) - left_kr: KgtkReader = KgtkReader.open(self.left_file_path, - short_line_action=self.short_line_action, - long_line_action=self.long_line_action, - fill_short_lines=self.fill_short_lines, - truncate_long_lines=self.truncate_long_lines, - invalid_value_action=self.invalid_value_action, + if self.left_file_path is not None: + print("Opening the input file: %s" % self.input_file_path, file=self.error_file, flush=True) + else: + print("Reading the input data from stdin", file=self.error_file, flush=True) + + input_kr: KgtkReader = KgtkReader.open(self.input_file_path, + error_file=self.error_file, + options=self.input_reader_options, value_options = self.value_options, - error_limit=self.error_limit, verbose=self.verbose, very_verbose=self.very_verbose, ) if self.verbose: - print("Opening the right input file: %s" % self.right_file_path, flush=True) - right_kr: KgtkReader = KgtkReader.open(self.right_file_path, - short_line_action=self.short_line_action, - long_line_action=self.long_line_action, - fill_short_lines=self.fill_short_lines, - truncate_long_lines=self.truncate_long_lines, - invalid_value_action=self.invalid_value_action, - value_options = self.value_options, - error_limit=self.error_limit, - verbose=self.verbose, - very_verbose=self.very_verbose, + print("Opening the filter input file: %s" % self.filter_file_path, flush=True) + filter_kr: KgtkReader = KgtkReader.open(self.filter_file_path, + error_file=self.error_file, + options=self.filter_reader_options, + value_options=self.value_options, + verbose=self.verbose, + very_verbose=self.very_verbose, ) - left_key_columns: typing.List[int] = self.get_key_columns(self.left_keys, left_kr, right_kr, "left") - right_key_columns: typing.List[int] = self.get_key_columns(self.right_keys, right_kr, left_kr, "right") + input_key_columns: typing.List[int] = self.get_key_columns(self.input_keys, input_kr, filter_kr, "input") + filter_key_columns: typing.List[int] = self.get_key_columns(self.filter_keys, filter_kr, input_kr, "filter") - if len(left_key_columns) != len(right_key_columns): - print("There are %d left key columns but %d right key columns. Exiting." % (len(left_key_columns), len(right_key_columns)), flush=True) + if len(input_key_columns) != len(filter_key_columns): + print("There are %d input key columns but %d filter key columns. Exiting." 
% (len(input_key_columns), len(filter_key_columns)),
+ file=self.error_file, flush=True)
 return
 
 if self.verbose:
- print("Building the input key set from %s" % self.right_file_path, flush=True)
- key_set: typint.Set[str] = self.extract_key_set(right_kr, "right", right_key_columns)
+ print("Building the filter key set from %s" % self.filter_file_path, file=self.error_file, flush=True)
+ key_set: typing.Set[str] = self.extract_key_set(filter_kr, "filter", filter_key_columns)
 if self.verbose or self.very_verbose:
- print("There are %d entries in the key set." % len(key_set))
+ print("There are %d entries in the filter key set." % len(key_set), file=self.error_file, flush=True)
 if self.very_verbose:
- print("Keys: %s" % " ".join(key_set))
+ print("Keys: %s" % " ".join(key_set), file=self.error_file, flush=True)
 
 if self.verbose:
- print("Opening the output file: %s" % self.output_path, flush=True)
+ print("Opening the output file: %s" % self.output_path, file=self.error_file, flush=True)
 ew: KgtkWriter = KgtkWriter.open(input_kr.column_names,
 self.output_path,
 require_all_columns=False,
 prohibit_extra_columns=True,
 fill_missing_columns=True,
- gzip_in_parallel=self.gzip_in_parallel,
+ gzip_in_parallel=False,
 verbose=self.verbose,
 very_verbose=self.very_verbose)
 
 if self.verbose:
- print("Filtering records from %s" % self.left_file_path, flush=True)
+ print("Filtering records from %s" % self.input_file_path, file=self.error_file, flush=True)
 input_line_count: int = 0
 output_line_count: int = 0
 
 row: typing.List[str]
- for row in left_kr:
+ for row in input_kr:
 input_line_count += 1
- left_key: str = self.build_key(row, left_key_columns)
+ input_key: str = self.build_key(row, input_key_columns)
 
 if self.invert:
- if left_key not in key_set:
+ if input_key not in key_set:
 ew.write(row)
 output_line_count += 1
 else:
- if left_key in key_set:
+ if input_key in key_set:
 ew.write(row)
 output_line_count += 1
 
 if self.verbose:
- print("Read %d records, wrote %d records." % (input_line_count, output_line_count), flush=True)
+ print("Read %d records, wrote %d records." % (input_line_count, output_line_count), file=self.error_file, flush=True)
 
 ew.close()
 
@@ -202,7 +202,11 @@ def main():
 Test the KGTK file joiner.
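The process() loop above is the whole filter algorithm: materialize the filter file's keys into a set in one pass, then stream the input once, keeping (or, with invert, dropping) each row whose composite key is in the set. A self-contained sketch of that core, with lists standing in for the two KGTK files (the separator constant and all the names here are stand-ins for illustration):

    import typing

    FIELD_SEPARATOR: str = "|"  # stand-in for IfExists.FIELD_SEPARATOR_DEFAULT

    def build_key(row: typing.List[str], key_columns: typing.List[int]) -> str:
        # A multi-column key is the selected fields joined by the separator.
        return FIELD_SEPARATOR.join(row[idx] for idx in key_columns)

    def if_exists(input_rows, filter_rows, input_cols, filter_cols, invert=False):
        # Pass 1: materialize the filter keys; pass 2: stream and test membership.
        key_set = {build_key(row, filter_cols) for row in filter_rows}
        for row in input_rows:
            if (build_key(row, input_cols) in key_set) != invert:
                yield row

    rows = [["a", "p", "1"], ["b", "p", "2"]]
    assert list(if_exists(rows, [["a"]], [0], [0])) == [["a", "p", "1"]]
    assert list(if_exists(rows, [["a"]], [0], [0], invert=True)) == [["b", "p", "2"]]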
""" parser: ArgumentParser = ArgumentParser() - KgtkReader.add_operation_arguments(parser) + KgtkReader.add_debug_arguments(parser) + + parser.add_argument(dest="input_file", help="The KGTK file with the input data", type=Path, nargs="?") + + parser.add_argument( "--filter-on", dest="filter_file_path", help="The KGTK file with the filter data", type=Path, required=True) parser.add_argument("-o", "--output-file", dest="output_file_path", help="The KGTK file to read", type=Path, default=None) @@ -210,34 +214,36 @@ def main(): parser.add_argument( "--invert", dest="invert", help="Invert the test (if not exists).", action='store_true') - parser.add_argument( "--input-keys", dest="_input_keys", help="The key columns in the input file.", nargs='*') - parser.add_argument( "--filter-keys", dest="_filter_keys", help="The key columns in the filter file.", nargs='*') + parser.add_argument( "--input-keys", dest="input_keys", help="The key columns in the input file.", nargs='*') + parser.add_argument( "--filter-keys", dest="filter_keys", help="The key columns in the filter file.", nargs='*') KgtkReader.add_file_arguments(parser, mode_options=True, who="input") - - # TODO: Find a way to use "--filter-on" - KgtkReader.add_file_arguments(parser, mode_options=True, who="filter", optional_file=True) - + KgtkReader.add_file_arguments(parser, mode_options=True, who="filter") KgtkValueOptions.add_arguments(parser) args: Namespace = parser.parse_args() - input_args: typing.Mapping[str, typing.Any] = dict(((item[0][len("input_"):], item[1]) for item in vars(args) if item[0].startswith("input_"))) - filter_args: typing.Mapping[str, typing.Any] = dict(((item[0][len("filter_"):], item[1]) for item in vars(args) if item[0].startswith("filter_"))) + error_file: typing.TextIO = sys.stdout if args.errors_to_stdout else sys.stderr - # Build the value parsing option structure. + # Build the option structures. 
+ input_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_args(args, who="input") + filter_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_args(args, who="filter") value_options: KgtkValueOptions = KgtkValueOptions.from_args(args) - ie: IfExists = IfExists(input_reader_args=input_args, - input_keys=args._input_keys, - filter_reader_args=filter_args, - filter_keys=args._filter_keys, - output_file_path=args.output_file_path, - field_separator=args.field_separator, - invert=args.invert, - value_options=value_options, - verbose=args.verbose, - very_verbose=args.very_verbose) + ie: IfExists = IfExists( + input_file_path=args.input_file_path, + input_keys=args.input_keys, + filter_file_path=args.filter_file_path, + filter_keys=args.filter_keys, + output_file_path=args.output_file_path, + field_separator=args.field_separator, + invert=args.invert, + input_reader_options=input_reader_options, + filter_reader_options=filter_reader_options, + value_options=value_options, + error_file=error_file, + verbose=args.verbose, + very_verbose=args.very_verbose) ie.process() diff --git a/kgtk/join/kgtkjoiner.py b/kgtk/join/kgtkjoiner.py index 4855ac774..58f597740 100644 --- a/kgtk/join/kgtkjoiner.py +++ b/kgtk/join/kgtkjoiner.py @@ -15,7 +15,7 @@ import typing from kgtk.kgtkformat import KgtkFormat -from kgtk.io.kgtkreader import KgtkReader +from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions from kgtk.io.kgtkwriter import KgtkWriter from kgtk.utils.enumnameaction import EnumNameAction from kgtk.utils.validationaction import ValidationAction @@ -45,31 +45,24 @@ class KgtkJoiner(KgtkFormat): # The prefix applied to right file column names in the output file: prefix: typing.Optional[str] = attr.ib(validator=attr.validators.optional(attr.validators.instance_of(str)), default=None) - # The field separator used in multifield joins. The KGHT list character should be safe. + # The field separator used in multifield joins. The KGTK list character should be safe. # TODO: USE THE COLUMN SEPARATOR !!!!! field_separator: str = attr.ib(validator=attr.validators.instance_of(str), default=KgtkFormat.LIST_SEPARATOR) - # Ignore records with too many or too few fields? - short_line_action: ValidationAction = attr.ib(validator=attr.validators.instance_of(ValidationAction), default=ValidationAction.EXCLUDE) - long_line_action: ValidationAction = attr.ib(validator=attr.validators.instance_of(ValidationAction), default=ValidationAction.EXCLUDE) - - # Require or fill trailing fields? 
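KgtkJoiner now carries one reader-options structure per side, and the LEFT/RIGHT constants introduced below are what route each file open to the right structure (see the extract_join_key_set change further down). A tiny sketch of that routing, with dicts standing in for the option objects:

    import typing

    LEFT: str = "left"
    RIGHT: str = "right"

    def reader_options_for(who: str,
                           left_options: typing.Mapping[str, int],
                           right_options: typing.Mapping[str, int]) -> typing.Mapping[str, int]:
        # Each side of the join opens its file with its own option structure.
        return left_options if who == LEFT else right_options

    assert reader_options_for(LEFT, {"error_limit": 5}, {"error_limit": 9})["error_limit"] == 5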
- fill_short_lines: bool = attr.ib(validator=attr.validators.instance_of(bool), default=False) - truncate_long_lines: bool = attr.ib(validator=attr.validators.instance_of(bool), default=False) - - # TODO: find a working validator + # TODO: find working validators: + left_reader_options: typing.Optional[KgtkReaderOptions] = attr.ib(default=None) + right_reader_options: typing.Optional[KgtkReaderOptions] = attr.ib(default=None) # value_options: typing.Optional[KgtkValueOptions] = attr.ib(attr.validators.optional(attr.validators.instance_of(KgtkValueOptions)), default=None) value_options: typing.Optional[KgtkValueOptions] = attr.ib(default=None) - gzip_in_parallel: bool = attr.ib(validator=attr.validators.instance_of(bool), default=False) - - error_limit: int = attr.ib(validator=attr.validators.instance_of(int), default=KgtkReader.ERROR_LIMIT_DEFAULT) - verbose: bool = attr.ib(validator=attr.validators.instance_of(bool), default=False) very_verbose: bool = attr.ib(validator=attr.validators.instance_of(bool), default=False) FIELD_SEPARATOR_DEFAULT: str = KgtkFormat.LIST_SEPARATOR + LEFT: str = "left" + RIGHT: str = "right" + def node1_column_idx(self, kr: KgtkReader, who: str)->int: idx: int = kr.node1_column_idx if idx < 0: @@ -163,14 +156,15 @@ def build_join_idx_list(self, kr: KgtkReader, who: str, join_columns: typing.Opt def extract_join_key_set(self, file_path: Path, who: str, join_idx_list: typing.List[int])->typing.Set[str]: if self.verbose: print("Extracting the join key set from the %s input file: %s" % (who, str(file_path)), flush=True) + reader_options: typing.Optional[KgtkReaderOptions] + if who == self.LEFT: + reader_options = self.left_reader_options + else: + reader_options = self.right_reader_options + kr: KgtkReader = KgtkReader.open(file_path, - short_line_action=self.short_line_action, - long_line_action=self.long_line_action, - fill_short_lines=self.fill_short_lines, - truncate_long_lines=self.truncate_long_lines, + options=reader_options, value_options = self.value_options, - gzip_in_parallel=self.gzip_in_parallel, - error_limit=self.error_limit, verbose=self.verbose, very_verbose=self.very_verbose) @@ -196,7 +190,7 @@ def join_key_sets(self, left_join_idx_list: typing.List[int], right_join_idx_lis elif self.left_join and not self.right_join: if self.verbose: print("Computing the left join key set", flush=True) - join_key_set = self.extract_join_key_set(self.left_file_path, "left", left_join_idx_list).copy() + join_key_set = self.extract_join_key_set(self.left_file_path, self.LEFT, left_join_idx_list).copy() if self.verbose: print("There are %d keys in the left join key set." % len(join_key_set), flush=True) return join_key_set @@ -204,7 +198,7 @@ def join_key_sets(self, left_join_idx_list: typing.List[int], right_join_idx_lis elif self.right_join and not self.left_join: if self.verbose: print("Computing the right join key set", flush=True) - join_key_set = self.extract_join_key_set(self.right_file_path, "right", right_join_idx_list).copy() + join_key_set = self.extract_join_key_set(self.right_file_path, self.RIGHT, right_join_idx_list).copy() if self.verbose: print("There are %d keys in the right join key set." 
% len(join_key_set), flush=True) return join_key_set @@ -212,10 +206,10 @@ def join_key_sets(self, left_join_idx_list: typing.List[int], right_join_idx_lis else: if self.verbose: print("Computing the inner join key set", flush=True) - left_join_key_set: typing.Set[str] = self.extract_join_key_set(self.left_file_path, "left", left_join_idx_list) + left_join_key_set: typing.Set[str] = self.extract_join_key_set(self.left_file_path, self.LEFT, left_join_idx_list) if self.verbose: print("There are %d keys in the left file key set." % len(left_join_key_set), flush=True) - right_join_key_set: typing.Set[str] = self.extract_join_key_set(self.right_file_path, "right", right_join_idx_list) + right_join_key_set: typing.Set[str] = self.extract_join_key_set(self.right_file_path, self.RIGHT, right_join_idx_list) if self.verbose: print("There are %d keys in the right file key set." % len(right_join_key_set), flush=True) join_key_set = left_join_key_set.intersection(right_join_key_set) @@ -264,10 +258,7 @@ def process(self): if self.verbose: print("Opening the left edge file: %s" % str(self.left_file_path), flush=True) left_kr: KgtkReader = KgtkReader.open(self.left_file_path, - short_line_action=self.short_line_action, - long_line_action=self.long_line_action, - fill_short_lines=self.fill_short_lines, - truncate_long_lines=self.truncate_long_lines, + options=self.left_reader_options, value_options = self.value_options, error_limit=self.error_limit) @@ -275,10 +266,7 @@ def process(self): if self.verbose: print("Opening the right edge file: %s" % str(self.right_file_path), flush=True) right_kr: KgtkReader = KgtkReader.open(self.right_file_path, - short_line_action=self.short_line_action, - long_line_action=self.long_line_action, - fill_short_lines=self.fill_short_lines, - truncate_long_lines=self.truncate_long_lines, + options=self.right_reader_options, value_options = self.value_options, error_limit=self.error_limit) @@ -292,8 +280,8 @@ def process(self): print("Cannot join edge and node files.", flush=True) return - left_join_idx_list: typing.List[int] = self.build_join_idx_list(left_kr, "left", self.left_join_columns) - right_join_idx_list: typing.List[int] = self.build_join_idx_list(right_kr, "right", self.right_join_columns) + left_join_idx_list: typing.List[int] = self.build_join_idx_list(left_kr, self.LEFT, self.left_join_columns) + right_join_idx_list: typing.List[int] = self.build_join_idx_list(right_kr, self.RIGHT, self.right_join_columns) if len(left_join_idx_list) != len(right_join_idx_list): print("the left join key has %d components, the right join key has %d columns. Exiting." 
% (len(left_join_idx_list), len(right_join_idx_list)), flush=True) left_kr.close() @@ -322,7 +310,7 @@ def process(self): require_all_columns=False, prohibit_extra_columns=True, fill_missing_columns=True, - gzip_in_parallel=self.gzip_in_parallel, + gzip_in_parallel=False, verbose=self.verbose, very_verbose=self.very_verbose) @@ -385,41 +373,30 @@ def main(): parser = ArgumentParser() parser.add_argument(dest="left_file_path", help="The left KGTK file to join", type=Path) parser.add_argument(dest="right_file_path", help="The right KGTK file to join", type=Path) - parser.add_argument( "--error-limit", dest="error_limit", - help="The maximum number of errors to report before failing", type=int, default=KgtkReader.ERROR_LIMIT_DEFAULT) - parser.add_argument( "--field-separator", dest="field_separator", help="Separator for multifield keys", default=KgtkJoiner.FIELD_SEPARATOR_DEFAULT) - parser.add_argument( "--fill-short-lines", dest="fill_short_lines", - help="Fill missing trailing columns in short lines with empty values.", action='store_true') + parser.add_argument( "--join-on-label", dest="join_on_label", help="If both input files are edge files, include the label column in the join.", action='store_true') parser.add_argument( "--join-on-node2", dest="join_on_node2", help="If both input files are edge files, include the node2 column in the join.", action='store_true') - parser.add_argument( "--gzip-in-parallel", dest="gzip_in_parallel", help="Execute gzip in parallel.", action='store_true') parser.add_argument( "--left-file-join-columns", dest="left_join_columns", help="Left file join columns.", nargs='+') parser.add_argument( "--left-join", dest="left_join", help="Perform a left outer join.", action='store_true') - parser.add_argument( "--long-line-action", dest="long_line_action", - help="The action to take when a long line is detected.", - type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXCLUDE) - parser.add_argument("-o", "--output-file", dest="output_file_path", help="The KGTK file to read", type=Path, default=None) parser.add_argument( "--prefix", dest="prefix", help="The prefix applied to right file column names in the output file.") parser.add_argument( "--right-file-join-columns", dest="right_join_columns", help="Right file join columns.", nargs='+') parser.add_argument( "--right-join", dest="right_join", help="Perform a right outer join.", action='store_true') - parser.add_argument( "--short-line-action", dest="short_line_action", - help="The action to take whe a short line is detected.", - type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXCLUDE) - - parser.add_argument( "--truncate-long-lines", dest="truncate_long_lines", - help="Remove excess trailing columns in long lines.", action='store_true') parser.add_argument("-v", "--verbose", dest="verbose", help="Print additional progress messages.", action='store_true') parser.add_argument( "--very-verbose", dest="very_verbose", help="Print additional progress messages.", action='store_true') + KgtkReaderOptions.add_arguments(parser, mode_options=True, who=KgtkJoiner.LEFT) + KgtkReaderOptions.add_arguments(parser, mode_options=True, who=KgtkJoiner.RIGHT) KgtkValueOptions.add_arguments(parser) args = parser.parse_args() - # Build the value parsing option structure. + # Build the option structures. 
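Stripped of I/O, join_key_sets() above is set algebra: a left outer join keeps the left keys, a right outer join keeps the right keys, and an inner join keeps the intersection. A compact sketch over in-memory sets (the real code streams keys from the files; the full-outer branch is not shown in this hunk, so modeling it as None for "no key filtering" is an assumption):

    import typing

    def join_key_set(left_keys: typing.Set[str],
                     right_keys: typing.Set[str],
                     left_join: bool,
                     right_join: bool) -> typing.Optional[typing.Set[str]]:
        if left_join and right_join:
            return None  # full outer join: no key filtering needed (assumption)
        if left_join:
            return set(left_keys)   # left outer join keeps every left key
        if right_join:
            return set(right_keys)  # right outer join keeps every right key
        return left_keys & right_keys  # inner join keeps the intersection

    assert join_key_set({"a", "b"}, {"b", "c"}, False, False) == {"b"}
    assert join_key_set({"a", "b"}, {"b", "c"}, True, False) == {"a", "b"}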
+ left_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_args(args, who=KgtkJoiner.LEFT) + right_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_args(args, who=KgtkJoiner.RIGHT) value_options: KgtkValueOptions = KgtkValueOptions.from_args(args) ej: KgtkJoiner = KgtkJoiner(left_file_path=args.left_file_path, @@ -433,13 +410,9 @@ def main(): right_join_columns=args.right_join_columns, prefix=args.prefix, field_separator=args.field_separator, - short_line_action=args.short_line_action, - long_line_action=args.long_line_action, - fill_short_lines=args.fill_short_lines, - truncate_long_lines=args.truncate_long_lines, + left_reader_options=left_reader_options, + right_reader_options=right_reader_options, value_options=value_options, - gzip_in_parallel=args.gzip_in_parallel, - error_limit=args.error_limit, verbose=args.verbose, very_verbose=args.very_verbose) From 743590fba06199de06320cf01d633e5dc4e5c76b Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Mon, 11 May 2020 12:20:36 -0700 Subject: [PATCH 137/278] Fix a namespace issue. --- kgtk/io/kgtkreader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kgtk/io/kgtkreader.py b/kgtk/io/kgtkreader.py index 7a358d53b..6672ef05d 100644 --- a/kgtk/io/kgtkreader.py +++ b/kgtk/io/kgtkreader.py @@ -63,7 +63,7 @@ class KgtkReaderOptions(): skip_first_record: bool = attr.ib(validator=attr.validators.instance_of(bool), default=False) # How do we handle errors? - error_limit: int = attr.ib(validator=attr.validators.instance_of(int), default=KgtkReaderOptions.ERROR_LIMIT_DEFAULT) # >0 ==> limit error reports + error_limit: int = attr.ib(validator=attr.validators.instance_of(int), default=ERROR_LIMIT_DEFAULT) # >0 ==> limit error reports # Ignore empty lines, comments, and all whitespace lines, etc.? empty_line_action: ValidationAction = attr.ib(validator=attr.validators.instance_of(ValidationAction), default=ValidationAction.EXCLUDE) From 642f9bf6b5672c506cf054b3010c0835990e3674 Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Mon, 11 May 2020 12:30:21 -0700 Subject: [PATCH 138/278] Add defaults to help. --- kgtk/io/kgtkreader.py | 46 +++++++++++++++++++++++++------------------ 1 file changed, 27 insertions(+), 19 deletions(-) diff --git a/kgtk/io/kgtkreader.py b/kgtk/io/kgtkreader.py index 6672ef05d..9ddd0b33f 100644 --- a/kgtk/io/kgtkreader.py +++ b/kgtk/io/kgtkreader.py @@ -107,91 +107,99 @@ def add_arguments(cls, "Options affecting " + prefix4 + "processing") fgroup.add_argument(prefix1 + "column-separator", dest=prefix2 + "column_separator", - help=prefix3 + "Column separator.", type=str, default=KgtkFormat.COLUMN_SEPARATOR) + help=prefix3 + "Column separator (default=).", # TODO: provide the default with escapes, e.g. 
\t + type=str, default=KgtkFormat.COLUMN_SEPARATOR) fgroup.add_argument(prefix1 + "compression-type", - dest=prefix2 + "compression_type", help=prefix3 + "Specify the compression type.") + dest=prefix2 + "compression_type", + help=prefix3 + "Specify the compression type (default=%(default)s).") fgroup.add_argument(prefix1 + "error-limit", dest=prefix2 + "error_limit", - help=prefix3 + "The maximum number of errors to report before failing", type=int, default=cls.ERROR_LIMIT_DEFAULT) + help=prefix3 + "The maximum number of errors to report before failing (default=%(default)s)", + type=int, default=cls.ERROR_LIMIT_DEFAULT) fgroup.add_argument(prefix1 + "gzip-in-parallel", - dest=prefix2 + "gzip_in_parallel", help=prefix3 + "Execute gzip in parallel.", action='store_true') + dest=prefix2 + "gzip_in_parallel", + help=prefix3 + "Execute gzip in parallel (default=%(default)s).", action='store_true') fgroup.add_argument(prefix1 + "gzip-queue-size", dest=prefix2 + "gzip_queue_size", - help=prefix3 + "Queue size for parallel gzip.", type=int, default=cls.GZIP_QUEUE_SIZE_DEFAULT) + help=prefix3 + "Queue size for parallel gzip (default=%(default)s).", + type=int, default=cls.GZIP_QUEUE_SIZE_DEFAULT) if mode_options: fgroup.add_argument(prefix1 + "mode", dest=prefix2 + "mode", - help=prefix3 + "Determine the KGTK file mode.", + help=prefix3 + "Determine the KGTK file mode (default=%(default)s).", type=KgtkReaderMode, action=EnumNameAction, default=KgtkReaderMode.AUTO) hgroup: _ArgumentGroup = parser.add_argument_group(prefix3 + "Header parsing", "Options affecting " + prefix4 + "header parsing") hgroup.add_argument(prefix1 + "force-column-names", dest=prefix2 + "force_column_names", - help=prefix3 + "Force the column names.", nargs='+') + help=prefix3 + "Force the column names (default=None).", + nargs='+') hgroup.add_argument(prefix1 + "header-error-action", dest=prefix2 + "header_error_action", - help=prefix3 + "The action to take when a header error is detected Only ERROR or EXIT are supported.", + help=prefix3 + "The action to take when a header error is detected. 
Only ERROR or EXIT are supported (default=%(default)s).", type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXIT) hgroup.add_argument(prefix1 + "skip-first-record", dest=prefix2 + "skip_first_record", - help=prefix3 + "Skip the first record when forcing column names.", action='store_true') + help=prefix3 + "Skip the first record when forcing column names (default=%(default)s).", action='store_true') hgroup.add_argument(prefix1 + "unsafe-column-name-action", dest=prefix2 + "unsafe_column_name_action", - help=prefix3 + "The action to take when a column name is unsafe.", + help=prefix3 + "The action to take when a column name is unsafe (default=%(default)s).", type=ValidationAction, action=EnumNameAction, default=ValidationAction.REPORT) lgroup: _ArgumentGroup = parser.add_argument_group("Line parsing", "Options affecting " + prefix4 + "data line parsing") lgroup.add_argument(prefix1 + "blank-required-field-line-action", dest=prefix2 + "blank_required_field_line_action", - help=prefix3 + "The action to take when a line with a blank node1, node2, or id field (per mode) is detected.", + help=prefix3 + "The action to take when a line with a blank node1, node2, or id field (per mode) is detected (default=%(default)s).", type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXCLUDE) lgroup.add_argument(prefix1 + "comment-line-action", dest=prefix2 + "comment_line_action", - help=prefix3 + "The action to take when a comment line is detected.", + help=prefix3 + "The action to take when a comment line is detected (default=%(default)s).", type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXCLUDE) lgroup.add_argument(prefix1 + "empty-line-action", dest=prefix2 + "empty_line_action", - help=prefix3 + "The action to take when an empty line is detected.", + help=prefix3 + "The action to take when an empty line is detected (default=%(default)s).", type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXCLUDE) lgroup.add_argument(prefix1 + "fill-short-lines", dest=prefix2 + "fill_short_lines", - help=prefix3 + "Fill missing trailing columns in short lines with empty values.", action='store_true') + help=prefix3 + "Fill missing trailing columns in short lines with empty values (default=%(default)s).", + action='store_true') lgroup.add_argument(prefix1 + "invalid-value-action", dest=prefix2 + "invalid_value_action", - help=prefix3 + "The action to take when a data cell value is invalid.", + help=prefix3 + "The action to take when a data cell value is invalid (default=%(default)s).", type=ValidationAction, action=EnumNameAction, default=ValidationAction.REPORT) lgroup.add_argument(prefix1 + "long-line-action", dest=prefix2 + "long_line_action", - help=prefix3 + "The action to take when a long line is detected.", + help=prefix3 + "The action to take when a long line is detected (default=%(default)s).", type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXCLUDE) lgroup.add_argument(prefix1 + "short-line-action", dest=prefix2 + "short_line_action", - help=prefix3 + "The action to take when a short line is detected.", + help=prefix3 + "The action to take when a short line is detected (default=%(default)s).", type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXCLUDE) lgroup.add_argument(prefix1 + "truncate-long-lines", dest=prefix2 + "truncate_long_lines", - help=prefix3 + "Remove excess trailing columns in long lines.", action='store_true') + help=prefix3 + "Remove excess trailing columns in long 
lines (default=%(default)s).", + action='store_true') lgroup.add_argument(prefix1 + "whitespace-line-action", dest=prefix2 + "whitespace_line_action", - help=prefix3 + "The action to take when a whitespace line is detected.", + help=prefix3 + "The action to take when a whitespace line is detected (default=%(default)s).", type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXCLUDE) @classmethod From c50bb9cb92c793885b0e6d30d87bbdff7a67cfeb Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Mon, 11 May 2020 12:38:46 -0700 Subject: [PATCH 139/278] Add default values to help message. --- kgtk/value/kgtkvalueoptions.py | 46 ++++++++++++++++++++++------------ 1 file changed, 30 insertions(+), 16 deletions(-) diff --git a/kgtk/value/kgtkvalueoptions.py b/kgtk/value/kgtkvalueoptions.py index a51b16dc7..b54046b29 100644 --- a/kgtk/value/kgtkvalueoptions.py +++ b/kgtk/value/kgtkvalueoptions.py @@ -77,25 +77,30 @@ def add_arguments(cls, parser: ArgumentParser, who: str = "", desc: str = "."): vgroup = parser.add_argument_group(prefix3 + "Data value parsing", "Options controlling the parsing and processing of KGTK data values" + desc) vgroup.add_argument( prefix1 + "additional-language-codes", dest=prefix2 + "additional_language_codes", - help=prefix3 + "Additional language codes.", nargs="*", default=None) + help=prefix3 + "Additional language codes (default=None).", + nargs="*", default=None) lsgroup= vgroup.add_mutually_exclusive_group() lsgroup.add_argument( prefix1 + "allow-language-suffixes", dest=prefix2 + "allow_language_suffixes", - help=prefix3 + "Allow language identifier suffixes starting with a dash.", action='store_true', default=True) + help=prefix3 + "Allow language identifier suffixes starting with a dash (default=%(default)s).", + action='store_true', default=True) lsgroup.add_argument( prefix1 + "disallow-language-suffixes", dest=prefix2 + "allow_language_suffixes", - help=prefix3 + "Disallow language identifier suffixes starting with a dash.", action='store_false') + help=prefix3 + "Disallow language identifier suffixes starting with a dash.", + action='store_false') laxgroup= vgroup.add_mutually_exclusive_group() laxgroup.add_argument( prefix1 + "allow-lax-strings", dest=prefix2 + "allow_lax_strings", - help=prefix3 + "Do not check if double quotes are backslashed inside strings.", action='store_true', default=False) + help=prefix3 + "Do not check if double quotes are backslashed inside strings (default=%(default)s).", + action='store_true', default=False) laxgroup.add_argument( prefix1 + "disallow-lax-strings", dest=prefix2 + "allow_lax_strings", - help=prefix3 + "Check if double quotes are backslashed inside strings.", action='store_false') + help=prefix3 + "Check if double quotes are backslashed inside strings.", + action='store_false') lqgroup= vgroup.add_mutually_exclusive_group() lqgroup.add_argument( prefix1 + "allow-lax-lq-strings", dest=prefix2 + "allow_lax_lq_strings", - help=prefix3 + "Do not check if single quotes are backslashed inside language qualified strings.", + help=prefix3 + "Do not check if single quotes are backslashed inside language qualified strings (default=%(default)s).", action='store_true', default=False) lqgroup.add_argument( prefix1 + "disallow-lax-lq-strings", dest=prefix2 + "allow_lax_lq_strings", @@ -104,39 +109,48 @@ def add_arguments(cls, parser: ArgumentParser, who: str = "", desc: str = "."): amd0group= vgroup.add_mutually_exclusive_group() amd0group.add_argument( prefix1 + "allow-month-or-day-zero", dest=prefix2 + 
"allow_month_or_day_zero", - help=prefix3 + "Allow month or day zero in dates.", action='store_true', default=False) + help=prefix3 + "Allow month or day zero in dates (default=%(default)s).", action='store_true', default=False) amd0group.add_argument( prefix1 + "disallow-month-or-day-zero", dest=prefix2 + "allow_month_or_day_zero", - help=prefix3 + "Allow month or day zero in dates.", action='store_false') + help=prefix3 + "Allow month or day zero in dates.", + action='store_false') rmd0group= vgroup.add_mutually_exclusive_group() rmd0group.add_argument( prefix1 + "repair-month-or-day-zero", dest=prefix2 + "repair_month_or_day_zero", - help=prefix3 + "Repair month or day zero in dates.", action='store_true', default=False) + help=prefix3 + "Repair month or day zero in dates (default=%(default)s).", + action='store_true', default=False) rmd0group.add_argument( prefix1 + "no-repair-month-or-day-zero", dest=prefix2 + "repair_month_or_day_zero", help=prefix3 + "Do not repair month or day zero in dates.", action='store_false') vgroup.add_argument( prefix1 + "minimum-valid-year", dest=prefix2 + "minimum_valid_year", - help=prefix3 + "The minimum valid year in dates.", type=int, default=cls.MINIMUM_VALID_YEAR) + help=prefix3 + "The minimum valid year in dates (default=%(default)d).", + type=int, default=cls.MINIMUM_VALID_YEAR) vgroup.add_argument( prefix1 + "maximum-valid-year", dest=prefix2 + "maximum_valid_year", - help=prefix3 + "The maximum valid year in dates.", type=int, default=cls.MAXIMUM_VALID_YEAR) + help=prefix3 + "The maximum valid year in dates (default=%(default)d).", + type=int, default=cls.MAXIMUM_VALID_YEAR) vgroup.add_argument( prefix1 + "minimum-valid-lat", dest=prefix2 + "minimum_valid_lat", - help=prefix3 + "The minimum valid latitude.", type=int, default=cls.MINIMUM_VALID_LAT) + help=prefix3 + "The minimum valid latitude (default=%(default)d).", + type=int, default=cls.MINIMUM_VALID_LAT) vgroup.add_argument( prefix1 + "maximum-valid-lat", dest=prefix2 + "maximum_valid_lat", - help=prefix3 + "The maximum valid latitude.", type=int, default=cls.MAXIMUM_VALID_LAT) + help=prefix3 + "The maximum valid latitude (default=%(default)d).", + type=int, default=cls.MAXIMUM_VALID_LAT) vgroup.add_argument( prefix1 + "minimum-valid-lon", dest=prefix2 + "minimum_valid_lon", - help=prefix3 + "The minimum valid longitude.", type=int, default=cls.MINIMUM_VALID_LON) + help=prefix3 + "The minimum valid longitude (default=%(default)d).", + type=int, default=cls.MINIMUM_VALID_LON) vgroup.add_argument( prefix1 + "maximum-valid-lon", dest=prefix2 + "maximum_valid_lon", - help=prefix3 + "The maximum valid longitude.", type=int, default=cls.MAXIMUM_VALID_LON) + help=prefix3 + "The maximum valid longitude (default=%(default)d).", + type=int, default=cls.MAXIMUM_VALID_LON) elsgroup= vgroup.add_mutually_exclusive_group() elsgroup.add_argument( prefix1 + "escape-list-separators", dest=prefix2 + "escape_list_separators", - help=prefix3 + "Escape all list separators instead of splitting on them.", action='store_true', default=False) + help=prefix3 + "Escape all list separators instead of splitting on them (default=%(default)s).", + action='store_true', default=False) elsgroup.add_argument( prefix1 + "no-escape-list-separators", dest=prefix2 + "escape_list_separators", help=prefix3 + "Do not escape list separators.", action='store_false') From 2f01aae704f843ec33608788b5557005d4ddffae Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Mon, 11 May 2020 13:40:05 -0700 Subject: [PATCH 140/278] SHow 
defaults. Fixed some bugs. --- kgtk/cli/ifexists.py | 10 +-- kgtk/cli/ifnotexists.py | 10 +-- kgtk/cli/validate.py | 2 +- kgtk/io/edgereader.py | 2 +- kgtk/io/kgtkreader.py | 136 +++++++++++++++++++++++++--------------- kgtk/io/nodereader.py | 2 +- kgtk/join/ifexists.py | 4 +- 7 files changed, 95 insertions(+), 71 deletions(-) diff --git a/kgtk/cli/ifexists.py b/kgtk/cli/ifexists.py index 112ff7eda..eb7d12801 100644 --- a/kgtk/cli/ifexists.py +++ b/kgtk/cli/ifexists.py @@ -37,16 +37,10 @@ def add_arguments(parser): parser.add_argument( "--field-separator", dest="field_separator", help="Separator for multifield keys", default=IfExists.FIELD_SEPARATOR_DEFAULT) - KgtkReader.add_file_arguments(parser, mode_options=True, who="input") - KgtkReader.add_file_arguments(parser, mode_options=True, who="filter") + KgtkReaderOptions.add_arguments(parser, mode_options=True, who="input") + KgtkReaderOptions.add_arguments(parser, mode_options=True, who="filter") KgtkValueOptions.add_arguments(parser) - - # Note: Any arguments described by KgtkValueOptions.add_arguments(...) - # need to be included in the arguments to run(...), below. - KgtkValueOptions.add_arguments(parser) - - def run(input_kgtk_file: typing.Optional[Path], filter_kgtk_file: Path, output_kgtk_file: typing.Optional[Path], diff --git a/kgtk/cli/ifnotexists.py b/kgtk/cli/ifnotexists.py index 481f2ccbc..c3789e984 100644 --- a/kgtk/cli/ifnotexists.py +++ b/kgtk/cli/ifnotexists.py @@ -37,16 +37,10 @@ def add_arguments(parser): parser.add_argument( "--field-separator", dest="field_separator", help="Separator for multifield keys", default=IfExists.FIELD_SEPARATOR_DEFAULT) - KgtkReader.add_file_arguments(parser, mode_options=True, who="input") - KgtkReader.add_file_arguments(parser, mode_options=True, who="filter") + KgtkReaderOptions.add_arguments(parser, mode_options=True, who="input") + KgtkReaderOptions.add_arguments(parser, mode_options=True, who="filter") KgtkValueOptions.add_arguments(parser) - - # Note: Any arguments described by KgtkValueOptions.add_arguments(...) - # need to be included in the arguments to run(...), below. - KgtkValueOptions.add_arguments(parser) - - def run(input_kgtk_file: typing.Optional[Path], filter_kgtk_file: Path, output_kgtk_file: typing.Optional[Path], diff --git a/kgtk/cli/validate.py b/kgtk/cli/validate.py index 2fa013bfb..f61db403a 100644 --- a/kgtk/cli/validate.py +++ b/kgtk/cli/validate.py @@ -36,7 +36,7 @@ def add_arguments(parser): help="Process the only the header of the input file.", action="store_true") KgtkReader.add_debug_arguments(parser) - KgtkReaderOptions.add_arguments(parser, mode_options=True) + KgtkReaderOptions.add_arguments(parser, mode_options=True, validate=True) KgtkValueOptions.add_arguments(parser) diff --git a/kgtk/io/edgereader.py b/kgtk/io/edgereader.py index 837085269..835177a1d 100644 --- a/kgtk/io/edgereader.py +++ b/kgtk/io/edgereader.py @@ -111,7 +111,7 @@ def main(): parser = ArgumentParser() parser.add_argument(dest="kgtk_file", help="The KGTK edge file to read", type=Path, nargs="?") KgtkReader.add_debug_arguments(parser) - KgtkReaderOptions.add_arguments(parser) + KgtkReaderOptions.add_arguments(parser, validate=True) KgtkValueOptions.add_arguments(parser) args = parser.parse_args() diff --git a/kgtk/io/kgtkreader.py b/kgtk/io/kgtkreader.py index 9ddd0b33f..153447899 100644 --- a/kgtk/io/kgtkreader.py +++ b/kgtk/io/kgtkreader.py @@ -65,6 +65,10 @@ class KgtkReaderOptions(): # How do we handle errors? 
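The options added in this hunk all follow one attrs recipe: a typed attr.ib with an instance_of validator and an explicit default, on a frozen, slotted class. A minimal self-contained version of the recipe (the class name is illustrative):

    import attr

    @attr.s(slots=True, frozen=True)
    class MiniReaderOptions:
        # Top-level validation controls, defaulting to off.
        repair_and_validate_lines: bool = attr.ib(
            validator=attr.validators.instance_of(bool), default=False)
        repair_and_validate_values: bool = attr.ib(
            validator=attr.validators.instance_of(bool), default=False)

    opts = MiniReaderOptions(repair_and_validate_lines=True)
    assert opts.repair_and_validate_lines and not opts.repair_and_validate_values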
error_limit: int = attr.ib(validator=attr.validators.instance_of(int), default=ERROR_LIMIT_DEFAULT) # >0 ==> limit error reports
 
+ # Top-level validation controls:
+ repair_and_validate_lines: bool = attr.ib(validator=attr.validators.instance_of(bool), default=False)
+ repair_and_validate_values: bool = attr.ib(validator=attr.validators.instance_of(bool), default=False)
+
 # Ignore empty lines, comments, and all whitespace lines, etc.?
 empty_line_action: ValidationAction = attr.ib(validator=attr.validators.instance_of(ValidationAction), default=ValidationAction.EXCLUDE)
 comment_line_action: ValidationAction = attr.ib(validator=attr.validators.instance_of(ValidationAction), default=ValidationAction.EXCLUDE)
@@ -97,6 +101,7 @@ class KgtkReaderOptions():
 def add_arguments(cls,
 parser: ArgumentParser,
 mode_options: bool = False,
+ validate: bool = False,
 who: str = ""):
 prefix1: str = "--" if len(who) == 0 else "--" + who + "-"
 prefix2: str = "" if len(who) == 0 else who + "_"
@@ -157,6 +162,26 @@ def add_arguments(cls,
 lgroup: _ArgumentGroup = parser.add_argument_group("Line parsing",
 "Options affecting " + prefix4 + "data line parsing")
 
+ lgroup.add_argument(prefix1 + "repair-and-validate-lines",
+ dest=prefix2 + "repair_and_validate_lines",
+ help=prefix3 + "Repair and validate lines (default=%(default)s).",
+ action='store_true', default=validate)
+
+ lgroup.add_argument(prefix1 + "do-not-repair-and-validate-lines",
+ dest=prefix2 + "repair_and_validate_lines",
+ help=prefix3 + "Do not repair and validate lines.",
+ action='store_false')
+
+ lgroup.add_argument(prefix1 + "repair-and-validate-values",
+ dest=prefix2 + "repair_and_validate_values",
+ help=prefix3 + "Repair and validate values (default=%(default)s).",
+ action='store_true', default=validate)
+
+ lgroup.add_argument(prefix1 + "do-not-repair-and-validate-values",
+ dest=prefix2 + "repair_and_validate_values",
+ help=prefix3 + "Do not repair and validate values.",
+ action='store_false')
+
 lgroup.add_argument(prefix1 + "blank-required-field-line-action",
 dest=prefix2 + "blank_required_field_line_action",
 help=prefix3 + "The action to take when a line with a blank node1, node2, or id field (per mode) is detected (default=%(default)s).",
@@ -234,6 +259,8 @@ def from_dict(cls,
 invalid_value_action=d.get(prefix + "invalid_value_action", ValidationAction.REPORT),
 long_line_action=d.get(prefix + "long_line_action", ValidationAction.EXCLUDE),
 mode=reader_mode,
+ repair_and_validate_lines=d.get(prefix + "repair_and_validate_lines", False),
+ repair_and_validate_values=d.get(prefix + "repair_and_validate_values", False),
 short_line_action=d.get(prefix + "short_line_action", ValidationAction.EXCLUDE),
 skip_first_record=d.get(prefix + "skip_first_record", False),
 truncate_long_lines=d.get(prefix + "truncate_long_lines", False),
@@ -258,7 +285,9 @@ class KgtkReader(KgtkBase, ClosableIter[typing.List[str]]):
 file_path: typing.Optional[Path] = attr.ib(validator=attr.validators.optional(attr.validators.instance_of(Path)))
 source: ClosableIter[str] = attr.ib() # Todo: validate
- options: KgtkReaderOptions = attr.ib(validator=attr.validators.instance_of(KgtkReaderOptions))
+ # TODO: Fix this validator:
+ # options: KgtkReaderOptions = attr.ib(validator=attr.validators.instance_of(KgtkReaderOptions))
+ options: KgtkReaderOptions = attr.ib()
 value_options: KgtkValueOptions = attr.ib(validator=attr.validators.instance_of(KgtkValueOptions))
@@ -581,6 +610,9 @@ def exclude_line(self, action: ValidationAction, msg: str, line: str)->bool:
        row: typing.List[str]

+        repair_and_validate_lines: bool = self.options.repair_and_validate_lines
+        repair_and_validate_values: bool = self.options.repair_and_validate_values
+
         # This loop accommodates lines that are ignored.
         while (True):
             line: str
@@ -600,69 +632,73 @@ def nextrow(self)-> typing.List[str]:
             # Strip the end-of-line characters:
             line = line.rstrip("\r\n")

-            if self.very_verbose:
-                print("'%s'" % line, file=self.error_file, flush=True)
+            if repair_and_validate_lines:
+                # TODO: Use a separate option to control this.
+                if self.very_verbose:
+                    print("'%s'" % line, file=self.error_file, flush=True)
+
+                # Ignore empty lines.
+                if self.options.empty_line_action != ValidationAction.PASS and len(line) == 0:
+                    if self.exclude_line(self.options.empty_line_action, "saw an empty line", line):
+                        continue

-            # Ignore empty lines.
-            if self.options.empty_line_action != ValidationAction.PASS and len(line) == 0:
-                if self.exclude_line(self.options.empty_line_action, "saw an empty line", line):
-                    continue
+                # Ignore comment lines:
+                if self.options.comment_line_action != ValidationAction.PASS and line[0] == self.COMMENT_INDICATOR:
+                    if self.exclude_line(self.options.comment_line_action, "saw a comment line", line):
+                        continue

-            # Ignore comment lines:
-            if self.options.comment_line_action != ValidationAction.PASS and line[0] == self.COMMENT_INDICATOR:
-                if self.exclude_line(self.options.comment_line_action, "saw a comment line", line):
-                    continue
-
-            # Ignore whitespace lines
-            if self.options.whitespace_line_action != ValidationAction.PASS and line.isspace():
-                if self.exclude_line(self.options.whitespace_line_action, "saw a whitespace line", line):
-                    continue
+                # Ignore whitespace lines
+                if self.options.whitespace_line_action != ValidationAction.PASS and line.isspace():
+                    if self.exclude_line(self.options.whitespace_line_action, "saw a whitespace line", line):
+                        continue

             row = line.split(self.options.column_separator)

-            # Optionally fill missing trailing columns with empty row:
-            if self.options.fill_short_lines and len(row) < self.column_count:
-                while len(row) < self.column_count:
-                    row.append("")
+            if repair_and_validate_lines:
+                # Optionally fill missing trailing columns with empty row:
+                if self.options.fill_short_lines and len(row) < self.column_count:
+                    while len(row) < self.column_count:
+                        row.append("")

-            # Optionally remove extra trailing columns:
-            if self.options.truncate_long_lines and len(row) > self.column_count:
-                row = row[:self.column_count]
-
-            # Optionally validate that the line contained the right number of columns:
-            #
-            # When we report line numbers in error messages, line 1 is the first line after the header line.
-            if self.options.short_line_action != ValidationAction.PASS and len(row) < self.column_count:
-                if self.exclude_line(self.options.short_line_action,
-                                     "Required %d columns, saw %d: '%s'" % (self.column_count,
-                                                                            len(row),
-                                                                            line),
-                                     line):
-                    continue
+                # Optionally remove extra trailing columns:
+                if self.options.truncate_long_lines and len(row) > self.column_count:
+                    row = row[:self.column_count]
+
+                # Optionally validate that the line contained the right number of columns:
+                #
+                # When we report line numbers in error messages, line 1 is the first line after the header line.
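                # An illustrative example (editorial sketch, with hypothetical
                # values): if column_count == 3 and fill_short_lines is enabled,
                # the short row ["a", "b"] is padded to ["a", "b", ""]; if
                # truncate_long_lines is enabled, ["a", "b", "c", "d"] is cut
                # back to ["a", "b", "c"].  Rows that are still the wrong length
                # are then handled by the short/long line actions below.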
+                if self.options.short_line_action != ValidationAction.PASS and len(row) < self.column_count:
+                    if self.exclude_line(self.options.short_line_action,
+                                         "Required %d columns, saw %d: '%s'" % (self.column_count,
+                                                                                len(row),
+                                                                                line),
+                                         line):
+                        continue

-            if self.options.long_line_action != ValidationAction.PASS and len(row) > self.column_count:
-                if self.exclude_line(self.options.long_line_action,
-                                     "Required %d columns, saw %d (%d extra): '%s'" % (self.column_count,
-                                                                                       len(row),
-                                                                                       len(row) - self.column_count,
-                                                                                       line),
-                                     line):
+                if self.options.long_line_action != ValidationAction.PASS and len(row) > self.column_count:
+                    if self.exclude_line(self.options.long_line_action,
+                                         "Required %d columns, saw %d (%d extra): '%s'" % (self.column_count,
+                                                                                           len(row),
+                                                                                           len(row) - self.column_count,
+                                                                                           line),
+                                         line):
+                        continue
+
+                if self._ignore_if_blank_fields(row, line):
                     continue

-            if self._ignore_if_blank_fields(row, line):
-                continue
-
-            if self.options.invalid_value_action != ValidationAction.PASS:
+            if repair_and_validate_values and self.options.invalid_value_action != ValidationAction.PASS:
                 # TODO: find a way to optionally cache the KgtkValue objects
                 # so we don't have to create them a second time in the conversion
                 # and iterator methods below.
                 if self._ignore_invalid_values(row, line):
                     continue

-            self.data_lines_passed += 1
-            if self.very_verbose:
-                sys.stdout.write(".")
-                sys.stdout.flush()
+            self.data_lines_passed += 1
+            # TODO: Use a separate option to control this.
+            # if self.very_verbose:
+            #     self.error_file.write(".")
+            #     self.error_file.flush()

             return row

@@ -919,7 +955,7 @@ def main():
                         default="rows")
     parser.add_argument( "--test-validate", dest="test_validate", help="Validate KgtkValue objects in test.", action='store_true')

-    KgtkReaderOptions.add_arguments(parser, mode_options=True)
+    KgtkReaderOptions.add_arguments(parser, mode_options=True, validate=True)
     KgtkValueOptions.add_arguments(parser)
     args = parser.parse_args()

diff --git a/kgtk/io/nodereader.py b/kgtk/io/nodereader.py
index 56702a73a..8fee4bd03 100644
--- a/kgtk/io/nodereader.py
+++ b/kgtk/io/nodereader.py
@@ -94,7 +94,7 @@ def main():
     """
     parser = ArgumentParser()
     parser.add_argument(dest="kgtk_file", help="The KGTK edge file to read", type=Path, nargs="?")
-    KgtkReader.add_debug_arguments(parser)
+    KgtkReader.add_debug_arguments(parser, validate=True)
     KgtkReaderOptions.add_arguments(parser)
     KgtkValueOptions.add_arguments(parser)
     args = parser.parse_args()

diff --git a/kgtk/join/ifexists.py b/kgtk/join/ifexists.py
index 1e85e44fa..7ec3878d6 100644
--- a/kgtk/join/ifexists.py
+++ b/kgtk/join/ifexists.py
@@ -217,8 +217,8 @@ def main():
     parser.add_argument( "--input-keys", dest="input_keys", help="The key columns in the input file.", nargs='*')
     parser.add_argument( "--filter-keys", dest="filter_keys", help="The key columns in the filter file.", nargs='*')

-    KgtkReader.add_file_arguments(parser, mode_options=True, who="input")
-    KgtkReader.add_file_arguments(parser, mode_options=True, who="filter")
+    KgtkReaderOptions.add_arguments(parser, mode_options=True, who="input")
+    KgtkReaderOptions.add_arguments(parser, mode_options=True, who="filter")
     KgtkValueOptions.add_arguments(parser)

     args: Namespace = parser.parse_args()

From 401fbde11515a78d09570f7a579f6586d49fd96e Mon Sep 17 00:00:00 2001
From: Craig Milo Rogers
Date: Mon, 11 May 2020 13:44:39 -0700
Subject: [PATCH 141/278] Finish conversion to more consistent names.
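This completes the renaming: kgtk/join/ifexists.py now speaks of an "input"
file and a "filter" file instead of "left" and "right", so attributes such as
left_file_path and output_path become input_file_path and output_file_path,
and the test script switches from --left-keys/--right-keys to --input-keys
and --filter-keys.  A hedged sketch of the renamed interface (the file names
are placeholders and the keyword list is abbreviated, not part of this patch):

    # Hypothetical invocation illustrating the renamed attributes:
    ie = IfExists(input_file_path=Path("input.tsv"),
                  filter_file_path=Path("filter.tsv"),
                  output_file_path=Path("output.tsv"),
                  input_keys=["node1"],
                  filter_keys=["node1"])
    ie.process()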
--- kgtk/join/ifexists.py | 10 +++++----- kgtk/join/test/ifexists-test1-node1.sh | 6 +++--- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/kgtk/join/ifexists.py b/kgtk/join/ifexists.py index 7ec3878d6..2242a0935 100644 --- a/kgtk/join/ifexists.py +++ b/kgtk/join/ifexists.py @@ -124,7 +124,7 @@ def extract_key_set(self, kr: KgtkReader, who: str, key_columns: typing.List[int def process(self): # Open the input files once. if self.verbose: - if self.left_file_path is not None: + if self.input_file_path is not None: print("Opening the input file: %s" % self.input_file_path, file=self.error_file, flush=True) else: print("Reading the input data from stdin", file=self.error_file, flush=True) @@ -164,9 +164,9 @@ def process(self): print("Keys: %s" % " ".join(key_set), file=self.error_file, flush=True) if self.verbose: - print("Opening the output file: %s" % self.output_path, file=self.error_file, flush=True) - ew: KgtkWriter = KgtkWriter.open(left_kr.column_names, - self.output_path, + print("Opening the output file: %s" % self.output_file_path, file=self.error_file, flush=True) + ew: KgtkWriter = KgtkWriter.open(input_kr.column_names, + self.output_file_path, require_all_columns=False, prohibit_extra_columns=True, fill_missing_columns=True, @@ -204,7 +204,7 @@ def main(): parser: ArgumentParser = ArgumentParser() KgtkReader.add_debug_arguments(parser) - parser.add_argument(dest="input_file", help="The KGTK file with the input data", type=Path, nargs="?") + parser.add_argument(dest="input_file_path", help="The KGTK file with the input data", type=Path, nargs="?") parser.add_argument( "--filter-on", dest="filter_file_path", help="The KGTK file with the filter data", type=Path, required=True) diff --git a/kgtk/join/test/ifexists-test1-node1.sh b/kgtk/join/test/ifexists-test1-node1.sh index 7ff31e58f..b11a66710 100755 --- a/kgtk/join/test/ifexists-test1-node1.sh +++ b/kgtk/join/test/ifexists-test1-node1.sh @@ -1,7 +1,7 @@ #! /bin/sh python3 kgtk/join/ifexists.py \ kgtk/join/test/ifexists-test1-file1.tsv \ - kgtk/join/test/ifexists-test1-file2.tsv \ - --left-keys node1 \ - --right-keys node1 \ + --input-keys node1 \ + --filter-on kgtk/join/test/ifexists-test1-file2.tsv \ + --filter-keys node1 \ --output-file kgtk/join/test/ifexists-test1-node1-output.tsv From c71fb1fb2916b277ef2f514b9ca416205805dd21 Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Mon, 11 May 2020 13:50:48 -0700 Subject: [PATCH 142/278] Use the debug options. --- kgtk/cli/ifexists.py | 3 ++- kgtk/cli/ifnotexists.py | 1 + kgtk/join/ifexists.py | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/kgtk/cli/ifexists.py b/kgtk/cli/ifexists.py index eb7d12801..1588f1d4e 100644 --- a/kgtk/cli/ifexists.py +++ b/kgtk/cli/ifexists.py @@ -27,7 +27,7 @@ def add_arguments(parser): """ parser.add_argument( "input_kgtk_file", nargs="?", help="The KGTK file to filter. 
May be omitted or '-' for stdin.", type=Path)

-    parser.add_argument( "--filter-on", dest="_filter_kgtk_file", help="The KGTK file to filter against.", type=Path, required=True)
+    parser.add_argument( "--filter-on", dest="filter_kgtk_file", help="The KGTK file to filter against.", type=Path, required=True)

     parser.add_argument("-o", "--output-file", dest="output_kgtk_file", help="The KGTK file to write", type=Path, default=None)

@@ -37,6 +37,7 @@ def add_arguments(parser):
     parser.add_argument( "--field-separator", dest="field_separator", help="Separator for multifield keys", default=IfExists.FIELD_SEPARATOR_DEFAULT)

+    KgtkReader.add_debug_arguments(parser)
     KgtkReaderOptions.add_arguments(parser, mode_options=True, who="input")
     KgtkReaderOptions.add_arguments(parser, mode_options=True, who="filter")
     KgtkValueOptions.add_arguments(parser)

diff --git a/kgtk/cli/ifnotexists.py b/kgtk/cli/ifnotexists.py
index c3789e984..971bdb3d8 100644
--- a/kgtk/cli/ifnotexists.py
+++ b/kgtk/cli/ifnotexists.py
@@ -37,6 +37,7 @@ def add_arguments(parser):
     parser.add_argument( "--field-separator", dest="field_separator", help="Separator for multifield keys", default=IfExists.FIELD_SEPARATOR_DEFAULT)

+    KgtkReader.add_debug_arguments(parser)
     KgtkReaderOptions.add_arguments(parser, mode_options=True, who="input")
     KgtkReaderOptions.add_arguments(parser, mode_options=True, who="filter")
     KgtkValueOptions.add_arguments(parser)

diff --git a/kgtk/join/ifexists.py b/kgtk/join/ifexists.py
index 2242a0935..9e5a788c0 100644
--- a/kgtk/join/ifexists.py
+++ b/kgtk/join/ifexists.py
@@ -202,7 +202,6 @@ def main():
     Test the KGTK file joiner.
     """
     parser: ArgumentParser = ArgumentParser()
-    KgtkReader.add_debug_arguments(parser)

     parser.add_argument(dest="input_file_path", help="The KGTK file with the input data", type=Path, nargs="?")

     parser.add_argument( "--filter-on", dest="filter_file_path", help="The KGTK file with the filter data", type=Path, required=True)

@@ -217,6 +216,7 @@ def main():
     parser.add_argument( "--input-keys", dest="input_keys", help="The key columns in the input file.", nargs='*')
     parser.add_argument( "--filter-keys", dest="filter_keys", help="The key columns in the filter file.", nargs='*')

+    KgtkReader.add_debug_arguments(parser)
     KgtkReaderOptions.add_arguments(parser, mode_options=True, who="input")
     KgtkReaderOptions.add_arguments(parser, mode_options=True, who="filter")
     KgtkValueOptions.add_arguments(parser)

From 155af7bbfc3c5605c95b421b97afb78c1b82463d Mon Sep 17 00:00:00 2001
From: Craig Milo Rogers
Date: Mon, 11 May 2020 13:52:32 -0700
Subject: [PATCH 143/278] Add a missing prefix for a --help feedback message.
--- kgtk/io/kgtkreader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kgtk/io/kgtkreader.py b/kgtk/io/kgtkreader.py index 153447899..2d7e1de45 100644 --- a/kgtk/io/kgtkreader.py +++ b/kgtk/io/kgtkreader.py @@ -160,7 +160,7 @@ def add_arguments(cls, help=prefix3 + "The action to take when a column name is unsafe (default=%(default)s).", type=ValidationAction, action=EnumNameAction, default=ValidationAction.REPORT) - lgroup: _ArgumentGroup = parser.add_argument_group("Line parsing", "Options affecting " + prefix4 + "data line parsing") + lgroup: _ArgumentGroup = parser.add_argument_group(prefix3 + "Line parsing", "Options affecting " + prefix4 + "data line parsing") lgroup.add_argument(prefix1 + "repair-and-validate-lines", dest=prefix2 + "repair_and_validate_lines", From 50b732a092ce951a060803a33a298400e855d720 Mon Sep 17 00:00:00 2001 From: saggu Date: Mon, 11 May 2020 16:33:41 -0700 Subject: [PATCH 144/278] remove unused function --- kgtk/cli/gt_loader.py | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/kgtk/cli/gt_loader.py b/kgtk/cli/gt_loader.py index 00fa84404..400ea8fab 100644 --- a/kgtk/cli/gt_loader.py +++ b/kgtk/cli/gt_loader.py @@ -9,20 +9,6 @@ def parser(): } -def convert_scientific_notation(num): - if isinstance(num, float): - num = str(num) - if 'e' in num: - vals = num.split('e') - formatter = int(vals[1].replace('-', '')) + 2 - try: - return "{:.{formatter}f}".format(float(num), formatter=formatter) - except: - print(num, vals, formatter) - raise - return num - - def add_arguments(parser): """ Parse arguments @@ -177,7 +163,7 @@ def infer_predicate(h, options=[]): for vprop in G2.vertex_properties.keys(): if vprop == id_col: continue sys.stdout.write( - '%s\t%s\t%s\t%s\n' % (v_id, v_prop_dict[vprop], convert_scientific_notation(G2.vp[vprop][v]), + '%s\t%s\t%s\t%s\n' % (v_id, v_prop_dict[vprop], G2.vp[vprop][v], '{}-{}-{}'.format(v_id, v_prop_dict[vprop], id_count))) id_count += 1 From 0bb56ec96c84bba19770df79f2c4343c360b0b1c Mon Sep 17 00:00:00 2001 From: saggu Date: Mon, 11 May 2020 16:33:59 -0700 Subject: [PATCH 145/278] choose type wisely --- kgtk/triple_generator.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/kgtk/triple_generator.py b/kgtk/triple_generator.py index 1ed5b5414..2afaff4e9 100644 --- a/kgtk/triple_generator.py +++ b/kgtk/triple_generator.py @@ -19,6 +19,7 @@ ExternalIdentifier, URLValue ) +from etk.knowledge_graph.node import LiteralType BAD_CHARS = [":", "-", "&", ",", " ", "(", ")", "\'", '\"', "/", "\\", "[", "]", ";", "|"] @@ -199,6 +200,12 @@ def generate_prop_declaration_triple(self, node1: str, label: str, node2: str) - self.doc.kg.add_subject(prop) return True + @staticmethod + def xsd_number_type(num): + if isinstance(num, float) and 'e' in str(num).lower(): + return LiteralType.double + return LiteralType.decimal + def generate_normal_triple( self, node1: str, label: str, node2: str, is_qualifier_edge: bool, e_id: str) -> bool: if self.use_id: @@ -268,20 +275,22 @@ def generate_normal_triple( amount, lower_bound, upper_bound, unit = res amount = TripleGenerator.clean_number_string(amount) + num_type = self.xsd_number_type(amount) + print(amount, num_type) lower_bound = TripleGenerator.clean_number_string(lower_bound) upper_bound = TripleGenerator.clean_number_string(upper_bound) if unit != None: if upper_bound != None and lower_bound != None: object = QuantityValue(amount, unit=Item( - unit), upper_bound=upper_bound, lower_bound=lower_bound) + unit), 
upper_bound=upper_bound, lower_bound=lower_bound, type=num_type) else: - object = QuantityValue(amount, unit=Item(unit)) + object = QuantityValue(amount, unit=Item(unit), type=num_type) else: if upper_bound != None and lower_bound != None: object = QuantityValue( - amount, upper_bound=upper_bound, lower_bound=lower_bound) + amount, upper_bound=upper_bound, lower_bound=lower_bound, type=num_type) else: - object = QuantityValue(amount) + object = QuantityValue(amount, type=num_type) elif edge_type == MonolingualText: text_string, lang = TripleGenerator.process_text_string(node2) From 1a4b38cc209165845bb4d77ae43bf395c3361c7c Mon Sep 17 00:00:00 2001 From: saggu Date: Mon, 11 May 2020 16:34:25 -0700 Subject: [PATCH 146/278] remove strict version for etk and rdflib --- requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 27e52a2c2..54ba2f65e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,8 +8,8 @@ sh sklearn SPARQLWrapper tqdm -rdflib==5.0.0 -etk==2.2.1 +rdflib +etk simplejson pyrallel.lib attrs From 6173cbaac5a3a1298e2defa80ae1dd3fa4aa50b1 Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Mon, 11 May 2020 17:07:47 -0700 Subject: [PATCH 147/278] Implement expert mode. --- kgtk/cli/clean_data.py | 12 ++-- kgtk/cli/ifexists.py | 29 ++++++--- kgtk/cli/ifnotexists.py | 26 ++++++-- kgtk/cli/validate.py | 12 ++-- kgtk/cli_entry.py | 6 +- kgtk/io/edgereader.py | 6 +- kgtk/io/kgtkreader.py | 106 +++++++++++++++++++++------------ kgtk/io/nodereader.py | 6 +- kgtk/value/kgtkvalueoptions.py | 68 +++++++++++++-------- 9 files changed, 180 insertions(+), 91 deletions(-) diff --git a/kgtk/cli/clean_data.py b/kgtk/cli/clean_data.py index e52a4f518..bb497eea4 100644 --- a/kgtk/cli/clean_data.py +++ b/kgtk/cli/clean_data.py @@ -6,10 +6,12 @@ """ +from argparse import Namespace, SUPPRESS from pathlib import Path import sys import typing +from kgtk.cli_argparse import KGTKArgumentParser from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions from kgtk.io.kgtkwriter import KgtkWriter from kgtk.value.kgtkvalueoptions import KgtkValueOptions @@ -20,18 +22,20 @@ def parser(): } -def add_arguments(parser): +def add_arguments_extended(parser: KGTKArgumentParser, parsed_shared_args: Namespace): """ Parse arguments Args: parser (argparse.ArgumentParser) """ + _expert: bool = parsed_shared_args._expert + parser.add_argument( "input_file", nargs="?", help="The KGTK file to read. May be omitted or '-' for stdin.", type=Path) parser.add_argument( "output_file", nargs="?", help="The KGTK file to write. 
May be omitted or '-' for stdout.", type=Path) - KgtkReader.add_debug_arguments(parser) - KgtkReaderOptions.add_arguments(parser, mode_options=True) - KgtkValueOptions.add_arguments(parser) + KgtkReader.add_debug_arguments(parser, expert=_expert) + KgtkReaderOptions.add_arguments(parser, mode_options=True, validate_by_default=True, expert=True) + KgtkValueOptions.add_arguments(parser, expert=True) def run(input_file: typing.Optional[Path], diff --git a/kgtk/cli/ifexists.py b/kgtk/cli/ifexists.py index 1588f1d4e..9e62ec60e 100644 --- a/kgtk/cli/ifexists.py +++ b/kgtk/cli/ifexists.py @@ -4,10 +4,12 @@ TODO: Need KgtkWriterOptions """ +from argparse import Namespace, SUPPRESS from pathlib import Path import sys import typing +from kgtk.cli_argparse import KGTKArgumentParser from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions from kgtk.io.kgtkwriter import KgtkWriter from kgtk.join.ifexists import IfExists @@ -15,16 +17,29 @@ def parser(): return { - 'help': 'Filter a KGTK file based on whether one or more records exist in a second KGTK file with matching values for one or more fields.' + 'help': 'Filter a KGTK file', + 'description': 'Filter a KGTK file based on whether one or more records exist in a second KGTK file with matching values for one or more fields.' } -def add_arguments(parser): +def add_arguments_extended(parser: KGTKArgumentParser, parsed_shared_args: Namespace): """ Parse arguments Args: parser (argparse.ArgumentParser) """ + + _expert: bool = parsed_shared_args._expert + + # This helper function makes it easy to suppress options from + # The help message. The options are still there, and initialize + # what they need to initialize. + def h(msg: str)->str: + if _expert: + return msg + else: + return SUPPRESS + parser.add_argument( "input_kgtk_file", nargs="?", help="The KGTK file to filter. 
May be omitted or '-' for stdin.", type=Path) parser.add_argument( "--filter-on", dest="filter_kgtk_file", help="The KGTK file to filter against.", type=Path, required=True) @@ -35,12 +50,12 @@ def add_arguments(parser): parser.add_argument( "--filter-keys", "--right-keys", dest="filter_keys", help="The key columns in the filter-on file.", nargs='*') - parser.add_argument( "--field-separator", dest="field_separator", help="Separator for multifield keys", default=IfExists.FIELD_SEPARATOR_DEFAULT) + parser.add_argument( "--field-separator", dest="field_separator", help=h("Separator for multifield keys"), default=IfExists.FIELD_SEPARATOR_DEFAULT) - KgtkReader.add_debug_arguments(parser) - KgtkReaderOptions.add_arguments(parser, mode_options=True, who="input") - KgtkReaderOptions.add_arguments(parser, mode_options=True, who="filter") - KgtkValueOptions.add_arguments(parser) + KgtkReader.add_debug_arguments(parser, expert=_expert) + KgtkReaderOptions.add_arguments(parser, mode_options=True, who="input", expert=_expert) + KgtkReaderOptions.add_arguments(parser, mode_options=True, who="filter", expert=_expert) + KgtkValueOptions.add_arguments(parser, expert=_expert) def run(input_kgtk_file: typing.Optional[Path], filter_kgtk_file: Path, diff --git a/kgtk/cli/ifnotexists.py b/kgtk/cli/ifnotexists.py index 971bdb3d8..964f77dc0 100644 --- a/kgtk/cli/ifnotexists.py +++ b/kgtk/cli/ifnotexists.py @@ -4,10 +4,12 @@ TODO: Need KgtkWriterOptions """ +from argparse import Namespace, SUPPRESS from pathlib import Path import sys import typing +from kgtk.cli_argparse import KGTKArgumentParser from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions from kgtk.io.kgtkwriter import KgtkWriter from kgtk.join.ifexists import IfExists @@ -19,12 +21,24 @@ def parser(): } -def add_arguments(parser): +def add_arguments_extended(parser: KGTKArgumentParser, parsed_shared_args: Namespace): """ Parse arguments Args: parser (argparse.ArgumentParser) """ + _expert: bool = parsed_shared_args._expert + + # This helper function makes it easy to suppress options from + # The help message. The options are still there, and initialize + # what they need to initialize. + def h(msg: str)->str: + if not _expert: + return SUPPRESS + else: + return msg + + parser.add_argument( "input_kgtk_file", nargs="?", help="The KGTK file to filter. 
May be omitted or '-' for stdin.", type=Path) parser.add_argument( "--filter-on", dest="_filter_kgtk_file", help="The KGTK file to filter against.", type=Path, required=True) @@ -35,12 +49,12 @@ def add_arguments(parser): parser.add_argument( "--filter-keys", "--right-keys", dest="filter_keys", help="The key columns in the filter-on file.", nargs='*') - parser.add_argument( "--field-separator", dest="field_separator", help="Separator for multifield keys", default=IfExists.FIELD_SEPARATOR_DEFAULT) + parser.add_argument( "--field-separator", dest="field_separator", help=h("Separator for multifield keys"), default=IfExists.FIELD_SEPARATOR_DEFAULT) - KgtkReader.add_debug_arguments(parser) - KgtkReaderOptions.add_arguments(parser, mode_options=True, who="input") - KgtkReaderOptions.add_arguments(parser, mode_options=True, who="filter") - KgtkValueOptions.add_arguments(parser) + KgtkReader.add_debug_arguments(parser, expert=_expert) + KgtkReaderOptions.add_arguments(parser, mode_options=True, who="input", expert=_expert) + KgtkReaderOptions.add_arguments(parser, mode_options=True, who="filter", expert=_expert) + KgtkValueOptions.add_arguments(parser, expert=_expert) def run(input_kgtk_file: typing.Optional[Path], filter_kgtk_file: Path, diff --git a/kgtk/cli/validate.py b/kgtk/cli/validate.py index f61db403a..9e16bd90d 100644 --- a/kgtk/cli/validate.py +++ b/kgtk/cli/validate.py @@ -11,10 +11,12 @@ This program does not validate individual fields. """ +from argparse import Namespace from pathlib import Path import sys import typing +from kgtk.cli_argparse import KGTKArgumentParser from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions from kgtk.value.kgtkvalueoptions import KgtkValueOptions @@ -24,20 +26,22 @@ def parser(): } -def add_arguments(parser): +def add_arguments_extended(parser: KGTKArgumentParser, parsed_shared_args: Namespace): """ Parse arguments Args: parser (argparse.ArgumentParser) """ + _expert: bool = parsed_shared_args._expert + parser.add_argument( "kgtk_files", nargs="*", help="The KGTK file(s) to validate. 
May be omitted or '-' for stdin.", type=Path) parser.add_argument( "--header-only", dest="header_only", help="Process the only the header of the input file.", action="store_true") - KgtkReader.add_debug_arguments(parser) - KgtkReaderOptions.add_arguments(parser, mode_options=True, validate=True) - KgtkValueOptions.add_arguments(parser) + KgtkReader.add_debug_arguments(parser, expert=_expert) + KgtkReaderOptions.add_arguments(parser, mode_options=True, validate_by_default=True, expert=True) + KgtkValueOptions.add_arguments(parser, expert=True) def run(kgtk_files: typing.Optional[typing.List[typing.Optional[Path]]], diff --git a/kgtk/cli_entry.py b/kgtk/cli_entry.py index 5baa9976b..007bca01f 100644 --- a/kgtk/cli_entry.py +++ b/kgtk/cli_entry.py @@ -49,6 +49,7 @@ def cli_entry(*args): ) shared_args = base_parser.add_argument_group('shared optional arguments') shared_args.add_argument('--debug', dest='_debug', action='store_true', default=False, help='enable debug mode') + shared_args.add_argument('--expert', dest='_expert', action='store_true', default=False, help='enable expert mode') add_shared_arguments(shared_args) # parse shared arguments @@ -70,7 +71,10 @@ def cli_entry(*args): mod = importlib.import_module('.{}'.format(h), 'kgtk.cli') sub_parser = sub_parsers.add_parser(h, **mod.parser()) add_default_arguments(sub_parser) # call this before adding other arguments - mod.add_arguments(sub_parser) + if hasattr(mod, "add_arguments_extended"): + mod.add_arguments_extended(sub_parser, parsed_shared_args) + else: + mod.add_arguments(sub_parser) # add root level usage after sub-parsers are created # this won't pollute help info in sub-parsers diff --git a/kgtk/io/edgereader.py b/kgtk/io/edgereader.py index 835177a1d..c1234ea78 100644 --- a/kgtk/io/edgereader.py +++ b/kgtk/io/edgereader.py @@ -110,9 +110,9 @@ def main(): """ parser = ArgumentParser() parser.add_argument(dest="kgtk_file", help="The KGTK edge file to read", type=Path, nargs="?") - KgtkReader.add_debug_arguments(parser) - KgtkReaderOptions.add_arguments(parser, validate=True) - KgtkValueOptions.add_arguments(parser) + KgtkReader.add_debug_arguments(parser, expert=True) + KgtkReaderOptions.add_arguments(parser, validate_by_default=True, expert=True) + KgtkValueOptions.add_arguments(parser, expert=True) args = parser.parse_args() error_file: typing.TextIO = sys.stdout if args.errors_to_stdout else sys.stderr diff --git a/kgtk/io/kgtkreader.py b/kgtk/io/kgtkreader.py index 2d7e1de45..80518a5e8 100644 --- a/kgtk/io/kgtkreader.py +++ b/kgtk/io/kgtkreader.py @@ -16,7 +16,7 @@ """ -from argparse import ArgumentParser, _ArgumentGroup, Namespace +from argparse import ArgumentParser, _ArgumentGroup, Namespace, SUPPRESS import attr import bz2 from enum import Enum @@ -101,130 +101,145 @@ class KgtkReaderOptions(): def add_arguments(cls, parser: ArgumentParser, mode_options: bool = False, - validate: bool = False, + validate_by_default: bool = False, + expert: bool = False, who: str = ""): + + # This helper function makes it easy to suppress options from + # The help message. The options are still there, and initialize + # what they need to initialize. 
+ def h(msg: str)->str: + if expert: + return msg + else: + return SUPPRESS + prefix1: str = "--" if len(who) == 0 else "--" + who + "-" prefix2: str = "" if len(who) == 0 else who + "_" prefix3: str = "" if len(who) == 0 else who + ": " prefix4: str = "" if len(who) == 0 else who + " file " - fgroup: _ArgumentGroup = parser.add_argument_group(prefix3 + "File options", - "Options affecting " + prefix4 + "processing") + fgroup: _ArgumentGroup = parser.add_argument_group(h(prefix3 + "File options"), + h("Options affecting " + prefix4 + "processing")) fgroup.add_argument(prefix1 + "column-separator", dest=prefix2 + "column_separator", - help=prefix3 + "Column separator (default=).", # TODO: provide the default with escapes, e.g. \t + help=h(prefix3 + "Column separator (default=)."), # TODO: provide the default with escapes, e.g. \t type=str, default=KgtkFormat.COLUMN_SEPARATOR) fgroup.add_argument(prefix1 + "compression-type", dest=prefix2 + "compression_type", - help=prefix3 + "Specify the compression type (default=%(default)s).") + help=h(prefix3 + "Specify the compression type (default=%(default)s).")) fgroup.add_argument(prefix1 + "error-limit", dest=prefix2 + "error_limit", - help=prefix3 + "The maximum number of errors to report before failing (default=%(default)s)", + help=h(prefix3 + "The maximum number of errors to report before failing (default=%(default)s)"), type=int, default=cls.ERROR_LIMIT_DEFAULT) fgroup.add_argument(prefix1 + "gzip-in-parallel", dest=prefix2 + "gzip_in_parallel", - help=prefix3 + "Execute gzip in parallel (default=%(default)s).", action='store_true') + help=h(prefix3 + "Execute gzip in parallel (default=%(default)s)."), + action='store_true') fgroup.add_argument(prefix1 + "gzip-queue-size", dest=prefix2 + "gzip_queue_size", - help=prefix3 + "Queue size for parallel gzip (default=%(default)s).", + help=h(prefix3 + "Queue size for parallel gzip (default=%(default)s)."), type=int, default=cls.GZIP_QUEUE_SIZE_DEFAULT) if mode_options: fgroup.add_argument(prefix1 + "mode", dest=prefix2 + "mode", - help=prefix3 + "Determine the KGTK file mode (default=%(default)s).", + help=h(prefix3 + "Determine the KGTK file mode (default=%(default)s)."), type=KgtkReaderMode, action=EnumNameAction, default=KgtkReaderMode.AUTO) - hgroup: _ArgumentGroup = parser.add_argument_group(prefix3 + "Header parsing", "Options affecting " + prefix4 + "header parsing") + hgroup: _ArgumentGroup = parser.add_argument_group(h(prefix3 + "Header parsing"), + h("Options affecting " + prefix4 + "header parsing")) hgroup.add_argument(prefix1 + "force-column-names", dest=prefix2 + "force_column_names", - help=prefix3 + "Force the column names (default=None).", + help=h(prefix3 + "Force the column names (default=None)."), nargs='+') hgroup.add_argument(prefix1 + "header-error-action", dest=prefix2 + "header_error_action", - help=prefix3 + "The action to take when a header error is detected. Only ERROR or EXIT are supported (default=%(default)s).", + help=h(prefix3 + "The action to take when a header error is detected. 
Only ERROR or EXIT are supported (default=%(default)s)."), type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXIT) hgroup.add_argument(prefix1 + "skip-first-record", dest=prefix2 + "skip_first_record", - help=prefix3 + "Skip the first record when forcing column names (default=%(default)s).", action='store_true') + help=h(prefix3 + "Skip the first record when forcing column names (default=%(default)s)."), + action='store_true') hgroup.add_argument(prefix1 + "unsafe-column-name-action", dest=prefix2 + "unsafe_column_name_action", - help=prefix3 + "The action to take when a column name is unsafe (default=%(default)s).", + help=h(prefix3 + "The action to take when a column name is unsafe (default=%(default)s)."), type=ValidationAction, action=EnumNameAction, default=ValidationAction.REPORT) - lgroup: _ArgumentGroup = parser.add_argument_group(prefix3 + "Line parsing", "Options affecting " + prefix4 + "data line parsing") + lgroup: _ArgumentGroup = parser.add_argument_group(h(prefix3 + "Line parsing"), + h("Options affecting " + prefix4 + "data line parsing")) lgroup.add_argument(prefix1 + "repair-and-validate-lines", dest=prefix2 + "repair_and_validate_lines", - help=prefix3 + "Repair and validate lines (default=%(default)s).", - action='store_true', default=validate) + help=h(prefix3 + "Repair and validate lines (default=%(default)s)."), + action='store_true', default=validate_by_default) lgroup.add_argument(prefix1 + "do-not-repair-and-validate-lines", dest=prefix2 + "repair_and_validate_lines", - help=prefix3 + "Do not repair and validate lines.", + help=h(prefix3 + "Do not repair and validate lines."), action='store_false') lgroup.add_argument(prefix1 + "repair-and-validate-values", dest=prefix2 + "repair_and_validate_values", - help=prefix3 + "Repair and validate values (default=%(default)s).", - action='store_true', default=validate) + help=h(prefix3 + "Repair and validate values (default=%(default)s)."), + action='store_true', default=validate_by_default) lgroup.add_argument(prefix1 + "do-not-repair-and-validate-values", dest=prefix2 + "repair-and-validate_values", - help=prefix3 + "Do not repair and validate values.", + help=h(prefix3 + "Do not repair and validate values."), action='store_false') lgroup.add_argument(prefix1 + "blank-required-field-line-action", dest=prefix2 + "blank_required_field_line_action", - help=prefix3 + "The action to take when a line with a blank node1, node2, or id field (per mode) is detected (default=%(default)s).", + help=h(prefix3 + "The action to take when a line with a blank node1, node2, or id field (per mode) is detected (default=%(default)s)."), type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXCLUDE) lgroup.add_argument(prefix1 + "comment-line-action", dest=prefix2 + "comment_line_action", - help=prefix3 + "The action to take when a comment line is detected (default=%(default)s).", + help=h(prefix3 + "The action to take when a comment line is detected (default=%(default)s)."), type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXCLUDE) lgroup.add_argument(prefix1 + "empty-line-action", dest=prefix2 + "empty_line_action", - help=prefix3 + "The action to take when an empty line is detected (default=%(default)s).", + help=h(prefix3 + "The action to take when an empty line is detected (default=%(default)s)."), type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXCLUDE) lgroup.add_argument(prefix1 + "fill-short-lines", dest=prefix2 + "fill_short_lines", - help=prefix3 + 
"Fill missing trailing columns in short lines with empty values (default=%(default)s).", + help=h(prefix3 + "Fill missing trailing columns in short lines with empty values (default=%(default)s)."), action='store_true') lgroup.add_argument(prefix1 + "invalid-value-action", dest=prefix2 + "invalid_value_action", - help=prefix3 + "The action to take when a data cell value is invalid (default=%(default)s).", + help=h(prefix3 + "The action to take when a data cell value is invalid (default=%(default)s)."), type=ValidationAction, action=EnumNameAction, default=ValidationAction.REPORT) lgroup.add_argument(prefix1 + "long-line-action", dest=prefix2 + "long_line_action", - help=prefix3 + "The action to take when a long line is detected (default=%(default)s).", + help=h(prefix3 + "The action to take when a long line is detected (default=%(default)s)."), type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXCLUDE) lgroup.add_argument(prefix1 + "short-line-action", dest=prefix2 + "short_line_action", - help=prefix3 + "The action to take when a short line is detected (default=%(default)s).", + help=h(prefix3 + "The action to take when a short line is detected (default=%(default)s)."), type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXCLUDE) lgroup.add_argument(prefix1 + "truncate-long-lines", dest=prefix2 + "truncate_long_lines", - help=prefix3 + "Remove excess trailing columns in long lines (default=%(default)s).", + help=h(prefix3 + "Remove excess trailing columns in long lines (default=%(default)s)."), action='store_true') lgroup.add_argument(prefix1 + "whitespace-line-action", dest=prefix2 + "whitespace_line_action", - help=prefix3 + "The action to take when a whitespace line is detected (default=%(default)s).", + help=h(prefix3 + "The action to take when a whitespace line is detected (default=%(default)s)."), type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXCLUDE) @classmethod @@ -924,16 +939,31 @@ def merge_columns(self, additional_columns: typing.List[str])->typing.List[str]: return merged_columns @classmethod - def add_debug_arguments(cls, parser: ArgumentParser): + def add_debug_arguments(cls, parser: ArgumentParser, expert: bool = False): + # This helper function makes it easy to suppress options from + # The help message. The options are still there, and initialize + # what they need to initialize. + def h(msg: str)->str: + if expert: + return msg + else: + return SUPPRESS + + # TODO: Fix the argparse bug that prevents these two arguments from + # having their help messages suppressed. 
errors_to = parser.add_mutually_exclusive_group() errors_to.add_argument( "--errors-to-stdout", dest="errors_to_stdout", - help="Send errors to stdout instead of stderr", action="store_true") + help="Send errors to stdout instead of stderr", + action="store_true") errors_to.add_argument( "--errors-to-stderr", dest="errors_to_stderr", - help="Send errors to stderr instead of stdout", action="store_true") + help="Send errors to stderr instead of stdout", + action="store_true") parser.add_argument("-v", "--verbose", dest="verbose", help="Print additional progress messages.", action='store_true') - parser.add_argument( "--very-verbose", dest="very_verbose", help="Print additional progress messages.", action='store_true') + parser.add_argument( "--very-verbose", dest="very_verbose", + help=h("Print additional progress messages."), + action='store_true') def main(): """ @@ -946,7 +976,7 @@ def main(): parser = ArgumentParser() parser.add_argument(dest="kgtk_file", help="The KGTK file to read", type=Path, nargs="?") - KgtkReader.add_debug_arguments(parser) + KgtkReader.add_debug_arguments(parser, expert=True) parser.add_argument( "--test", dest="test_method", help="The test to perform", choices=["rows", "concise-rows", "kgtk-values", "concise-kgtk-values", @@ -955,8 +985,8 @@ def main(): default="rows") parser.add_argument( "--test-validate", dest="test_validate", help="Validate KgtkValue objects in test.", action='store_true') - KgtkReaderOptions.add_arguments(parser, mode_options=True, validate=True) - KgtkValueOptions.add_arguments(parser) + KgtkReaderOptions.add_arguments(parser, mode_options=True, validate_by_default=True, expert=True) + KgtkValueOptions.add_arguments(parser, expert=True) args = parser.parse_args() error_file: typing.TextIO = sys.stdout if args.errors_to_stdout else sys.stderr diff --git a/kgtk/io/nodereader.py b/kgtk/io/nodereader.py index 8fee4bd03..4d73ca3a6 100644 --- a/kgtk/io/nodereader.py +++ b/kgtk/io/nodereader.py @@ -94,9 +94,9 @@ def main(): """ parser = ArgumentParser() parser.add_argument(dest="kgtk_file", help="The KGTK edge file to read", type=Path, nargs="?") - KgtkReader.add_debug_arguments(parser, validate=True) - KgtkReaderOptions.add_arguments(parser) - KgtkValueOptions.add_arguments(parser) + KgtkReader.add_debug_arguments(parser, expert=True) + KgtkReaderOptions.add_arguments(parser, validate_by_default=True, expert=True) + KgtkValueOptions.add_arguments(parser, expert=True) args = parser.parse_args() error_file: typing.TextIO = sys.stdout if args.errors_to_stdout else sys.stderr diff --git a/kgtk/value/kgtkvalueoptions.py b/kgtk/value/kgtkvalueoptions.py index b54046b29..5cb8e7526 100644 --- a/kgtk/value/kgtkvalueoptions.py +++ b/kgtk/value/kgtkvalueoptions.py @@ -2,7 +2,7 @@ KGTK value processing options. """ -from argparse import ArgumentParser, Namespace +from argparse import ArgumentParser, Namespace, SUPPRESS import attr import typing @@ -59,7 +59,12 @@ class KgtkValueOptions: @classmethod - def add_arguments(cls, parser: ArgumentParser, who: str = "", desc: str = "."): + def add_arguments(cls, + parser: ArgumentParser, + who: str = "", + desc: str = ".", + expert: bool = False, + ): """Add arguments for KgtkValue option processing. 
When "who" is not empty, it prefixes the options, destinations, and @@ -75,85 +80,98 @@ def add_arguments(cls, parser: ArgumentParser, who: str = "", desc: str = "."): prefix2 = who + "_" prefix3 = who + ": " - vgroup = parser.add_argument_group(prefix3 + "Data value parsing", "Options controlling the parsing and processing of KGTK data values" + desc) + # This helper function makes it easy to suppress options from + # The help message. The options are still there, and initialize + # what they need to initialize. + def h(msg: str)->str: + if expert: + return msg + else: + return SUPPRESS + + vgroup = parser.add_argument_group(h(prefix3 + "Data value parsing"), + h("Options controlling the parsing and processing of KGTK data values" + desc)) vgroup.add_argument( prefix1 + "additional-language-codes", dest=prefix2 + "additional_language_codes", - help=prefix3 + "Additional language codes (default=None).", + help=h(prefix3 + "Additional language codes (default=None)."), nargs="*", default=None) lsgroup= vgroup.add_mutually_exclusive_group() lsgroup.add_argument( prefix1 + "allow-language-suffixes", dest=prefix2 + "allow_language_suffixes", - help=prefix3 + "Allow language identifier suffixes starting with a dash (default=%(default)s).", + help=h(prefix3 + "Allow language identifier suffixes starting with a dash (default=%(default)s)."), action='store_true', default=True) lsgroup.add_argument( prefix1 + "disallow-language-suffixes", dest=prefix2 + "allow_language_suffixes", - help=prefix3 + "Disallow language identifier suffixes starting with a dash.", + help=h(prefix3 + "Disallow language identifier suffixes starting with a dash."), action='store_false') laxgroup= vgroup.add_mutually_exclusive_group() laxgroup.add_argument( prefix1 + "allow-lax-strings", dest=prefix2 + "allow_lax_strings", - help=prefix3 + "Do not check if double quotes are backslashed inside strings (default=%(default)s).", + help=h(prefix3 + "Do not check if double quotes are backslashed inside strings (default=%(default)s)."), action='store_true', default=False) laxgroup.add_argument( prefix1 + "disallow-lax-strings", dest=prefix2 + "allow_lax_strings", - help=prefix3 + "Check if double quotes are backslashed inside strings.", + help=h(prefix3 + "Check if double quotes are backslashed inside strings."), action='store_false') lqgroup= vgroup.add_mutually_exclusive_group() lqgroup.add_argument( prefix1 + "allow-lax-lq-strings", dest=prefix2 + "allow_lax_lq_strings", - help=prefix3 + "Do not check if single quotes are backslashed inside language qualified strings (default=%(default)s).", + help=h(prefix3 + "Do not check if single quotes are backslashed inside language qualified strings (default=%(default)s)."), action='store_true', default=False) lqgroup.add_argument( prefix1 + "disallow-lax-lq-strings", dest=prefix2 + "allow_lax_lq_strings", - help=prefix3 + "Check if single quotes are backslashed inside language qualified strings.", + help=h(prefix3 + "Check if single quotes are backslashed inside language qualified strings."), action='store_false') amd0group= vgroup.add_mutually_exclusive_group() amd0group.add_argument( prefix1 + "allow-month-or-day-zero", dest=prefix2 + "allow_month_or_day_zero", - help=prefix3 + "Allow month or day zero in dates (default=%(default)s).", action='store_true', default=False) + help=h(prefix3 + "Allow month or day zero in dates (default=%(default)s)."), + action='store_true', default=False) amd0group.add_argument( prefix1 + "disallow-month-or-day-zero", dest=prefix2 + 
"allow_month_or_day_zero", - help=prefix3 + "Allow month or day zero in dates.", + help=h(prefix3 + "Allow month or day zero in dates."), action='store_false') rmd0group= vgroup.add_mutually_exclusive_group() rmd0group.add_argument( prefix1 + "repair-month-or-day-zero", dest=prefix2 + "repair_month_or_day_zero", - help=prefix3 + "Repair month or day zero in dates (default=%(default)s).", + help=h(prefix3 + "Repair month or day zero in dates (default=%(default)s)."), action='store_true', default=False) rmd0group.add_argument( prefix1 + "no-repair-month-or-day-zero", dest=prefix2 + "repair_month_or_day_zero", - help=prefix3 + "Do not repair month or day zero in dates.", action='store_false') + help=h(prefix3 + "Do not repair month or day zero in dates."), + action='store_false') vgroup.add_argument( prefix1 + "minimum-valid-year", dest=prefix2 + "minimum_valid_year", - help=prefix3 + "The minimum valid year in dates (default=%(default)d).", + help=h(prefix3 + "The minimum valid year in dates (default=%(default)d)."), type=int, default=cls.MINIMUM_VALID_YEAR) vgroup.add_argument( prefix1 + "maximum-valid-year", dest=prefix2 + "maximum_valid_year", - help=prefix3 + "The maximum valid year in dates (default=%(default)d).", + help=h(prefix3 + "The maximum valid year in dates (default=%(default)d)."), type=int, default=cls.MAXIMUM_VALID_YEAR) vgroup.add_argument( prefix1 + "minimum-valid-lat", dest=prefix2 + "minimum_valid_lat", - help=prefix3 + "The minimum valid latitude (default=%(default)d).", + help=h(prefix3 + "The minimum valid latitude (default=%(default)d)."), type=int, default=cls.MINIMUM_VALID_LAT) vgroup.add_argument( prefix1 + "maximum-valid-lat", dest=prefix2 + "maximum_valid_lat", - help=prefix3 + "The maximum valid latitude (default=%(default)d).", + help=h(prefix3 + "The maximum valid latitude (default=%(default)d)."), type=int, default=cls.MAXIMUM_VALID_LAT) vgroup.add_argument( prefix1 + "minimum-valid-lon", dest=prefix2 + "minimum_valid_lon", - help=prefix3 + "The minimum valid longitude (default=%(default)d).", + help=h(prefix3 + "The minimum valid longitude (default=%(default)d)."), type=int, default=cls.MINIMUM_VALID_LON) vgroup.add_argument( prefix1 + "maximum-valid-lon", dest=prefix2 + "maximum_valid_lon", - help=prefix3 + "The maximum valid longitude (default=%(default)d).", + help=h(prefix3 + "The maximum valid longitude (default=%(default)d)."), type=int, default=cls.MAXIMUM_VALID_LON) elsgroup= vgroup.add_mutually_exclusive_group() elsgroup.add_argument( prefix1 + "escape-list-separators", dest=prefix2 + "escape_list_separators", - help=prefix3 + "Escape all list separators instead of splitting on them (default=%(default)s).", + help=h(prefix3 + "Escape all list separators instead of splitting on them (default=%(default)s)."), action='store_true', default=False) elsgroup.add_argument( prefix1 + "no-escape-list-separators", dest=prefix2 + "escape_list_separators", - help=prefix3 + "Do not escape list separators.", action='store_false') + help=h(prefix3 + "Do not escape list separators."), + action='store_false') @classmethod # Build the value parsing option structure. @@ -184,9 +202,9 @@ def main(): Test the KGTK value options. 
""" parser: ArgumentParser = ArgumentParser() - KgtkValueOptions.add_arguments(parser) - KgtkValueOptions.add_arguments(parser, who="left", desc=" for the left file.") - KgtkValueOptions.add_arguments(parser, who="right", desc=" for the right file.") + KgtkValueOptions.add_arguments(parser, expert=True) + KgtkValueOptions.add_arguments(parser, who="left", desc=" for the left file.", expert=True) + KgtkValueOptions.add_arguments(parser, who="right", desc=" for the right file.", expert=True) args: Namespace = parser.parse_args() # Build the value parsing option structure. From 41b8544cdc35d2e6790bd9fd8f69b1e4e73de31b Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Mon, 11 May 2020 17:36:08 -0700 Subject: [PATCH 148/278] Implement lookup fallbacks to assist with maintaining APi compatability. --- kgtk/cli/ifexists.py | 19 ++++++++----- kgtk/cli/ifnotexists.py | 13 ++++++--- kgtk/io/kgtkreader.py | 60 ++++++++++++++++++++++++----------------- 3 files changed, 59 insertions(+), 33 deletions(-) diff --git a/kgtk/cli/ifexists.py b/kgtk/cli/ifexists.py index 9e62ec60e..69718fbc7 100644 --- a/kgtk/cli/ifexists.py +++ b/kgtk/cli/ifexists.py @@ -42,15 +42,22 @@ def h(msg: str)->str: parser.add_argument( "input_kgtk_file", nargs="?", help="The KGTK file to filter. May be omitted or '-' for stdin.", type=Path) + parser.add_argument( "--input-keys", "--left-keys", dest="input_keys", help="The key columns in the file being filtered.", nargs='*') + parser.add_argument( "--filter-on", dest="filter_kgtk_file", help="The KGTK file to filter against.", type=Path, required=True) - parser.add_argument("-o", "--output-file", dest="output_kgtk_file", help="The KGTK file to write", type=Path, default=None) + parser.add_argument( "--filter-keys", "--right-keys", dest="filter_keys", help="The key columns in the filter-on file.", nargs='*') - parser.add_argument( "--input-keys", "--left-keys", dest="input_keys", help="The key columns in the file being filtered.", nargs='*') + parser.add_argument("-o", "--output-file", dest="output_kgtk_file", help="The KGTK file to write", type=Path, default=None) - parser.add_argument( "--filter-keys", "--right-keys", dest="filter_keys", help="The key columns in the filter-on file.", nargs='*') + # This argument is retained for compatability with earlier versions of this command. + parser.add_argument( "--error-limit", dest="error_limit", + help=h("The maximum number of errors per input fule (default=%(default)s)"), + default=KgtkReaderOptions.ERROR_LIMIT_DEFAULT) - parser.add_argument( "--field-separator", dest="field_separator", help=h("Separator for multifield keys"), default=IfExists.FIELD_SEPARATOR_DEFAULT) + parser.add_argument( "--field-separator", dest="field_separator", + help=h("Separator for multifield keys (default=%(default)s)") + , default=IfExists.FIELD_SEPARATOR_DEFAULT) KgtkReader.add_debug_arguments(parser, expert=_expert) KgtkReaderOptions.add_arguments(parser, mode_options=True, who="input", expert=_expert) @@ -80,8 +87,8 @@ def run(input_kgtk_file: typing.Optional[Path], error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr # Build the option structures. 
- input_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs, who="input") - filter_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs, who="filter") + input_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs, who="input", fallback=True) + filter_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs, who="filter", fallback=True) value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs) try: diff --git a/kgtk/cli/ifnotexists.py b/kgtk/cli/ifnotexists.py index 964f77dc0..e49a481f7 100644 --- a/kgtk/cli/ifnotexists.py +++ b/kgtk/cli/ifnotexists.py @@ -49,7 +49,14 @@ def h(msg: str)->str: parser.add_argument( "--filter-keys", "--right-keys", dest="filter_keys", help="The key columns in the filter-on file.", nargs='*') - parser.add_argument( "--field-separator", dest="field_separator", help=h("Separator for multifield keys"), default=IfExists.FIELD_SEPARATOR_DEFAULT) + # This argument is retained for compatability with earlier versions of this command. + parser.add_argument( "--error-limit", dest="error_limit", + help=h("The maximum number of errors per input fule (default=%(default)s)"), + default=KgtkReaderOptions.ERROR_LIMIT_DEFAULT) + + parser.add_argument( "--field-separator", dest="field_separator", + help=h("Separator for multifield keys"), + default=IfExists.FIELD_SEPARATOR_DEFAULT) KgtkReader.add_debug_arguments(parser, expert=_expert) KgtkReaderOptions.add_arguments(parser, mode_options=True, who="input", expert=_expert) @@ -79,8 +86,8 @@ def run(input_kgtk_file: typing.Optional[Path], error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr # Build the option structures. - input_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs, who="input") - filter_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs, who="filter") + input_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs, who="input", fallback=True) + filter_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs, who="filter", fallback=True) value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs) try: diff --git a/kgtk/io/kgtkreader.py b/kgtk/io/kgtkreader.py index 80518a5e8..ed6de11d9 100644 --- a/kgtk/io/kgtkreader.py +++ b/kgtk/io/kgtkreader.py @@ -248,39 +248,50 @@ def from_dict(cls, d: dict, who: str = "", mode: typing.Optional[KgtkReaderMode] = None, + fallback: bool = False, )->'KgtkReaderOptions': prefix: str = "" # The destination name prefix. if len(who) > 0: prefix = who + "_" + # TODO: Figure out how to type check this method. 
+ def lookup(name: str, default): + prefixed_name = prefix + name + if prefixed_name in d: + return d[prefixed_name] + elif fallback and name in d: + return d[name] + else: + return default + reader_mode: KgtkReaderMode - if mode is None: - reader_mode = d.get(prefix + "mode", KgtkReaderMode.AUTO) - else: + if mode is not None: reader_mode = mode + else: + reader_mode = lookup("mode", KgtkReaderMode.AUTO) return cls( - blank_required_field_line_action=d.get(prefix + "blank_required_field_line_action", ValidationAction.EXCLUDE), - column_separator=d.get(prefix + "column_separator", KgtkFormat.COLUMN_SEPARATOR), - comment_line_action=d.get(prefix + "comment_line_action", ValidationAction.EXCLUDE), - compression_type=d.get(prefix + "compression_type", None), - empty_line_action=d.get(prefix + "empty_line_action", ValidationAction.EXCLUDE), - error_limit=d.get(prefix + "error_limit", cls.ERROR_LIMIT_DEFAULT), - fill_short_lines=d.get(prefix + "fill_short_lines", False), - force_column_names=d.get(prefix + "force_column_names", None), - gzip_in_parallel=d.get(prefix + "gzip_in_parallel", False), - gzip_queue_size=d.get(prefix + "gzip_queue_size", KgtkReaderOptions.GZIP_QUEUE_SIZE_DEFAULT), - header_error_action=d.get(prefix + "header_error_action", ValidationAction.EXCLUDE), - invalid_value_action=d.get(prefix + "invalid_value_action", ValidationAction.REPORT), - long_line_action=d.get(prefix + "long_line_action", ValidationAction.EXCLUDE), + blank_required_field_line_action=lookup("blank_required_field_line_action", ValidationAction.EXCLUDE), + column_separator=lookup("column_separator", KgtkFormat.COLUMN_SEPARATOR), + comment_line_action=lookup("comment_line_action", ValidationAction.EXCLUDE), + compression_type=lookup("compression_type", None), + empty_line_action=lookup("empty_line_action", ValidationAction.EXCLUDE), + error_limit=lookup("error_limit", cls.ERROR_LIMIT_DEFAULT), + fill_short_lines=lookup("fill_short_lines", False), + force_column_names=lookup("force_column_names", None), + gzip_in_parallel=lookup("gzip_in_parallel", False), + gzip_queue_size=lookup("gzip_queue_size", KgtkReaderOptions.GZIP_QUEUE_SIZE_DEFAULT), + header_error_action=lookup("header_error_action", ValidationAction.EXCLUDE), + invalid_value_action=lookup("invalid_value_action", ValidationAction.REPORT), + long_line_action=lookup("long_line_action", ValidationAction.EXCLUDE), mode=reader_mode, - repair_and_validate_lines=d.get(prefix + "repair_and_validate_lines", False), - repair_and_validate_values=d.get(prefix + "repair_and_validate_values", False), - short_line_action=d.get(prefix + "short_line_action", ValidationAction.EXCLUDE), - skip_first_record=d.get(prefix + "skip_first_recordb", False), - truncate_long_lines=d.get(prefix + "truncate_long_lines", False), - unsafe_column_name_action=d.get(prefix + "unsafe_column_name_action", ValidationAction.REPORT), - whitespace_line_action=d.get(prefix + "whitespace_line_action", ValidationAction.EXCLUDE), + repair_and_validate_lines=lookup("repair_and_validate_lines", False), + repair_and_validate_values=lookup("repair_and_validate_values", False), + short_line_action=lookup("short_line_action", ValidationAction.EXCLUDE), + skip_first_record=lookup("skip_first_recordb", False), + truncate_long_lines=lookup("truncate_long_lines", False), + unsafe_column_name_action=lookup("unsafe_column_name_action", ValidationAction.REPORT), + whitespace_line_action=lookup("whitespace_line_action", ValidationAction.EXCLUDE), ) @classmethod @@ -289,8 +300,9 @@ def from_args(cls, args: 
Namespace, who: str = "", mode: typing.Optional[KgtkReaderMode] = None, + fallback: bool = False, )->'KgtkReaderOptions': - return cls.from_dict(vars(args), who=who, mode=mode) + return cls.from_dict(vars(args), who=who, mode=mode, fallback=fallback) DEFAULT_KGTK_READER_OPTIONS: KgtkReaderOptions = KgtkReaderOptions() From b5a14ce1e59e7016666371e9f92492fc86b16e0d Mon Sep 17 00:00:00 2001 From: saggu Date: Mon, 11 May 2020 17:53:25 -0700 Subject: [PATCH 149/278] remove print --- kgtk/triple_generator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kgtk/triple_generator.py b/kgtk/triple_generator.py index 2afaff4e9..6eb538add 100644 --- a/kgtk/triple_generator.py +++ b/kgtk/triple_generator.py @@ -276,7 +276,7 @@ def generate_normal_triple( amount = TripleGenerator.clean_number_string(amount) num_type = self.xsd_number_type(amount) - print(amount, num_type) + lower_bound = TripleGenerator.clean_number_string(lower_bound) upper_bound = TripleGenerator.clean_number_string(upper_bound) if unit != None: From b03678c5f3e46e5e970c51277917f112e2078c49 Mon Sep 17 00:00:00 2001 From: Naren Date: Mon, 11 May 2020 18:19:15 -0700 Subject: [PATCH 150/278] improve speed of reachibility command --- kgtk/cli/reachable_nodes.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/kgtk/cli/reachable_nodes.py b/kgtk/cli/reachable_nodes.py index fa988ae5b..5998ab361 100644 --- a/kgtk/cli/reachable_nodes.py +++ b/kgtk/cli/reachable_nodes.py @@ -25,7 +25,7 @@ def add_arguments(parser): parser.add_argument("--subj", action="store", type=int, dest="sub", help='Column in which the subject is given, default 0', default=0) parser.add_argument("--obj", action="store", type=int, dest="obj", help='Column in which the subject is given, default 2', default=2) parser.add_argument("--pred",action="store" ,type=int, dest="pred",help='Column in which predicate is given, default 1',default=1) - parser.add_argument("--props", action="store", type=str, dest="props",help='Properties to consider while finding reachable nodes - comma-separated string',default=None) + parser.add_argument("--props", action="store", type=str, dest="props",help='Properties to consider while finding reachable nodes - comma-separated string,default all properties',default=None) parser.add_argument('--undirected', action='store_true', dest="undirected", help="Option to specify graph as undirected?") @@ -56,8 +56,8 @@ def get_edges_by_edge_prop(g, p, v): label='c'+str(find_pred_position(sub,pred,obj)) header=['node1','label','node2'] root_set=set() - root_list=[] property_list=[] + if (rootfile): tsv_file = open(rootfile) read_tsv = csv.reader(tsv_file, delimiter="\t") @@ -71,23 +71,24 @@ def get_edges_by_edge_prop(g, p, v): if (root): for r in root.split(','): root_set.add(r) - root_list=list(root_set) - property_list = [item for item in props.split(',')] + G = load_graph_from_csv(filename,not(undirected),skip_first=not(header_bool),hashed=True,csv_options={'delimiter': '\t'},ecols=(sub,obj)) name = G.vp["name"] index_list = [] for v in G.vertices(): - if name[v] in root_list: + if name[v] in root_set: index_list.append(v) edge_filter_set = set() - for prop in property_list: - edge_filter_set.update(get_edges_by_edge_prop(G, label,prop)); - - G.clear_edges() - G.add_edge_list(list(edge_filter_set)) + if props: + property_list = [item for item in props.split(',')] + for prop in property_list: + edge_filter_set.update(get_edges_by_edge_prop(G, label,prop)); + G.clear_edges() + 
G.add_edge_list(list(edge_filter_set)) + if output: f=open(output,'w') tsv_writer = csv.writer(f, quoting=csv.QUOTE_NONE,delimiter="\t",escapechar="\n",quotechar='') From f2fff9ce7e7d6a506733e0531feb43d5e1a54ba1 Mon Sep 17 00:00:00 2001 From: greatyyx Date: Mon, 11 May 2020 18:45:37 -0700 Subject: [PATCH 151/278] fix broken pipe while calling sh in filter --- kgtk/cli/filter.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/kgtk/cli/filter.py b/kgtk/cli/filter.py index 8eb4282e6..23fc3232f 100644 --- a/kgtk/cli/filter.py +++ b/kgtk/cli/filter.py @@ -62,5 +62,8 @@ def prepare_filter(property, prop_pattern): elif not sys.stdin.isatty(): sh.mlr('--%slite' % datatype, 'filter', filter_str, _in=sys.stdin, _out=sys.stdout, _err=sys.stderr) - except: - raise KGTKException + except sh.SignalException_SIGPIPE: + # handles SIGPIPE, if it raises to upper level, it will cause another error + pass + except Exception as e: + raise KGTKException(e) From fb0351e2276625f4327c18a3cbfbbce10d68dd90 Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Mon, 11 May 2020 20:06:54 -0700 Subject: [PATCH 152/278] Add error_file to kgtkjoiner. --- kgtk/join/kgtkjoiner.py | 86 ++++++++++--------- kgtk/join/test/edgejoiner-test1-inner.sh | 5 -- ...1-file1.tsv => kgtkjoiner-test1-file1.tsv} | 0 ...1-file2.tsv => kgtkjoiner-test1-file2.tsv} | 0 kgtk/join/test/kgtkjoiner-test1-inner.sh | 5 ++ 5 files changed, 49 insertions(+), 47 deletions(-) delete mode 100755 kgtk/join/test/edgejoiner-test1-inner.sh rename kgtk/join/test/{edgejoiner-test1-file1.tsv => kgtkjoiner-test1-file1.tsv} (100%) rename kgtk/join/test/{edgejoiner-test1-file2.tsv => kgtkjoiner-test1-file2.tsv} (100%) create mode 100755 kgtk/join/test/kgtkjoiner-test1-inner.sh diff --git a/kgtk/join/kgtkjoiner.py b/kgtk/join/kgtkjoiner.py index 58f597740..0e7c9dcc8 100644 --- a/kgtk/join/kgtkjoiner.py +++ b/kgtk/join/kgtkjoiner.py @@ -55,6 +55,7 @@ class KgtkJoiner(KgtkFormat): # value_options: typing.Optional[KgtkValueOptions] = attr.ib(attr.validators.optional(attr.validators.instance_of(KgtkValueOptions)), default=None) value_options: typing.Optional[KgtkValueOptions] = attr.ib(default=None) + error_file: typing.TextIO = attr.ib(default=sys.stderr) verbose: bool = attr.ib(validator=attr.validators.instance_of(bool), default=False) very_verbose: bool = attr.ib(validator=attr.validators.instance_of(bool), default=False) @@ -111,26 +112,26 @@ def build_join_idx_list(self, kr: KgtkReader, who: str, join_columns: typing.Opt col_num: int = 1 if join_columns is not None and len(join_columns) > 0: if self.verbose: - print("Using %s file join columns: %s" % (who, " ".join(join_columns)), flush=True) + print("Using %s file join columns: %s" % (who, " ".join(join_columns)), file=self.error_file, flush=True) join_column:str for join_column in join_columns: if join_column not in kr.column_name_map: raise ValueError("Join column %s not found in in the %s input file" % (join_column, who)) join_idx = kr.column_name_map[join_column] if self.verbose: - print("Join column %d: %s (index %d in the %s input file)" % (col_num, join_column, join_idx, who), flush=True) + print("Join column %d: %s (index %d in the %s input file)" % (col_num, join_column, join_idx, who), file=self.error_file, flush=True) join_idx_list.append(join_idx) return join_idx_list if kr.is_edge_file: join_idx = self.node1_column_idx(kr, who) if self.verbose: - print("Joining on node1 (index %s in the %s input file)" % (join_idx, who), flush=True) + print("Joining on node1 (index %s in 
the %s input file)" % (join_idx, who), file=self.error_file, flush=True) join_idx_list.append(join_idx) elif kr.is_node_file: join_idx = self.id_column_idx(kr, who) if self.verbose: - print("Joining on id (index %s in the %s input file)" % (join_idx, who), flush=True) + print("Joining on id (index %s in the %s input file)" % (join_idx, who), file=self.error_file, flush=True) join_idx_list.append(join_idx) else: raise ValueError("Unknown file type in build_join_idx_list(...)") @@ -141,21 +142,21 @@ def build_join_idx_list(self, kr: KgtkReader, who: str, join_columns: typing.Opt if kr.label_column_idx < 0: raise ValueError("join_on_label may not be used because the %s input file does not have a label column." % who) if self.verbose: - print("Joining on label (index %s in the %s input file)" % (kr.label_column_idx, who), flush=True) + print("Joining on label (index %s in the %s input file)" % (kr.label_column_idx, who), file=self.error_file, flush=True) join_idx_list.append(kr.label_column_idx) if self.join_on_node2: if kr.node2_column_idx < 0: raise ValueError("join_on_node2 may not be used because the %s input file does not have a node2 column." % who) if self.verbose: - print("Joining on node2 (index %s in the %s input file)" % (kr.node2_column_idx, who), flush=True) + print("Joining on node2 (index %s in the %s input file)" % (kr.node2_column_idx, who), file=self.error_file, flush=True) join_idx_list.append(kr.node2_column_idx) return join_idx_list def extract_join_key_set(self, file_path: Path, who: str, join_idx_list: typing.List[int])->typing.Set[str]: if self.verbose: - print("Extracting the join key set from the %s input file: %s" % (who, str(file_path)), flush=True) + print("Extracting the join key set from the %s input file: %s" % (who, str(file_path)), file=self.error_file, flush=True) reader_options: typing.Optional[KgtkReaderOptions] if who == self.LEFT: reader_options = self.left_reader_options @@ -185,36 +186,36 @@ def join_key_sets(self, left_join_idx_list: typing.List[int], right_join_idx_lis join_key_set: typing.Set[str] if self.left_join and self.right_join: if self.verbose: - print("Outer join, no need to compute join keys.", flush=True) + print("Outer join, no need to compute join keys.", file=self.error_file, flush=True) return None elif self.left_join and not self.right_join: if self.verbose: - print("Computing the left join key set", flush=True) + print("Computing the left join key set", file=self.error_file, flush=True) join_key_set = self.extract_join_key_set(self.left_file_path, self.LEFT, left_join_idx_list).copy() if self.verbose: - print("There are %d keys in the left join key set." % len(join_key_set), flush=True) + print("There are %d keys in the left join key set." % len(join_key_set), file=self.error_file, flush=True) return join_key_set elif self.right_join and not self.left_join: if self.verbose: - print("Computing the right join key set", flush=True) + print("Computing the right join key set", file=self.error_file, flush=True) join_key_set = self.extract_join_key_set(self.right_file_path, self.RIGHT, right_join_idx_list).copy() if self.verbose: - print("There are %d keys in the right join key set." % len(join_key_set), flush=True) + print("There are %d keys in the right join key set." 
% len(join_key_set), file=self.error_file, flush=True) return join_key_set else: if self.verbose: - print("Computing the inner join key set", flush=True) + print("Computing the inner join key set", file=self.error_file, flush=True) left_join_key_set: typing.Set[str] = self.extract_join_key_set(self.left_file_path, self.LEFT, left_join_idx_list) if self.verbose: - print("There are %d keys in the left file key set." % len(left_join_key_set), flush=True) + print("There are %d keys in the left file key set." % len(left_join_key_set), file=self.error_file, flush=True) right_join_key_set: typing.Set[str] = self.extract_join_key_set(self.right_file_path, self.RIGHT, right_join_idx_list) if self.verbose: - print("There are %d keys in the right file key set." % len(right_join_key_set), flush=True) + print("There are %d keys in the right file key set." % len(right_join_key_set), file=self.error_file, flush=True) join_key_set = left_join_key_set.intersection(right_join_key_set) if self.verbose: - print("There are %d keys in the inner join key set." % len(join_key_set), flush=True) + print("There are %d keys in the inner join key set." % len(join_key_set), file=self.error_file, flush=True) return join_key_set def merge_columns(self, left_kr: KgtkReader, right_kr: KgtkReader)->typing.Tuple[typing.List[str], typing.List[str]]: @@ -256,34 +257,34 @@ def merge_columns(self, left_kr: KgtkReader, right_kr: KgtkReader)->typing.Tuple def process(self): if self.verbose: - print("Opening the left edge file: %s" % str(self.left_file_path), flush=True) + print("Opening the left edge file: %s" % str(self.left_file_path), file=self.error_file, flush=True) left_kr: KgtkReader = KgtkReader.open(self.left_file_path, options=self.left_reader_options, value_options = self.value_options, - error_limit=self.error_limit) + ) if self.verbose: - print("Opening the right edge file: %s" % str(self.right_file_path), flush=True) + print("Opening the right edge file: %s" % str(self.right_file_path), file=self.error_file, flush=True) right_kr: KgtkReader = KgtkReader.open(self.right_file_path, options=self.right_reader_options, value_options = self.value_options, - error_limit=self.error_limit) + ) if left_kr.is_edge_file and right_kr.is_edge_file: if self.verbose: - print("Both input files are edge files.", flush=True) + print("Both input files are edge files.", file=self.error_file, flush=True) elif left_kr.is_node_file and right_kr.is_node_file: if self.verbose: - print("Both input files are node files.", flush=True) + print("Both input files are node files.", file=self.error_file, flush=True) else: - print("Cannot join edge and node files.", flush=True) + print("Cannot join edge and node files.", file=self.error_file, flush=True) return left_join_idx_list: typing.List[int] = self.build_join_idx_list(left_kr, self.LEFT, self.left_join_columns) right_join_idx_list: typing.List[int] = self.build_join_idx_list(right_kr, self.RIGHT, self.right_join_columns) if len(left_join_idx_list) != len(right_join_idx_list): - print("the left join key has %d components, the right join key has %d columns. Exiting." % (len(left_join_idx_list), len(right_join_idx_list)), flush=True) + print("the left join key has %d components, the right join key has %d columns. Exiting." 
% (len(left_join_idx_list), len(right_join_idx_list)), file=self.error_file, flush=True) left_kr.close() right_kr.close() return @@ -292,19 +293,19 @@ def process(self): joined_key_set: typing.Optional[typing.Set[str]] = self.join_key_sets(left_join_idx_list, right_join_idx_list) if self.verbose: - print("Mapping the column names for the join.", flush=True) + print("Mapping the column names for the join.", file=self.error_file, flush=True) joined_column_names: typing.List[str] right_column_names: typing.List[str] (joined_column_names, right_column_names) = self.merge_columns(left_kr, right_kr) if self.verbose: - print(" left columns: %s" % " ".join(left_kr.column_names), flush=True) - print(" right columns: %s" % " ".join(right_kr.column_names), flush=True) - print("mapped right columns: %s" % " ".join(right_column_names), flush=True) - print(" joined columns: %s" % " ".join(joined_column_names), flush=True) + print(" left columns: %s" % " ".join(left_kr.column_names), file=self.error_file, flush=True) + print(" right columns: %s" % " ".join(right_kr.column_names), file=self.error_file, flush=True) + print("mapped right columns: %s" % " ".join(right_column_names), file=self.error_file, flush=True) + print(" joined columns: %s" % " ".join(joined_column_names), file=self.error_file, flush=True) if self.verbose: - print("Opening the output edge file: %s" % str(self.output_path), flush=True) + print("Opening the output edge file: %s" % str(self.output_path), file=self.error_file, flush=True) ew: KgtkWriter = KgtkWriter.open(joined_column_names, self.output_path, require_all_columns=False, @@ -321,7 +322,7 @@ def process(self): right_data_lines_kept: int = 0 if self.verbose: - print("Processing the left input file: %s" % str(self.left_file_path), flush=True) + print("Processing the left input file: %s" % str(self.left_file_path), file=self.error_file, flush=True) row: typing.list[str] for row in left_kr: left_data_lines_read += 1 @@ -339,7 +340,7 @@ def process(self): ew.flush() if self.verbose: - print("Processing the right input file: %s" % str(self.right_file_path), flush=True) + print("Processing the right input file: %s" % str(self.right_file_path), file=self.error_file, flush=True) right_shuffle_list: typing.List[int] = ew.build_shuffle_list(right_column_names) for row in right_kr: right_data_lines_read += 1 @@ -356,10 +357,10 @@ def process(self): ew.close() if self.verbose: - print("The join is complete", flush=True) - print("%d left input data lines read, %d kept" % (left_data_lines_read, left_data_lines_kept), flush=True) - print("%d right input data lines read, %d kept" % (right_data_lines_read, right_data_lines_kept), flush=True) - print("%d data lines written." % output_data_lines, flush=True) + print("The join is complete", file=self.error_file, flush=True) + print("%d left input data lines read, %d kept" % (left_data_lines_read, left_data_lines_kept), file=self.error_file, flush=True) + print("%d right input data lines read, %d kept" % (right_data_lines_read, right_data_lines_kept), file=self.error_file, flush=True) + print("%d data lines written." 
% output_data_lines, file=self.error_file, flush=True) def main(): """ @@ -385,15 +386,15 @@ def main(): parser.add_argument( "--right-file-join-columns", dest="right_join_columns", help="Right file join columns.", nargs='+') parser.add_argument( "--right-join", dest="right_join", help="Perform a right outer join.", action='store_true') - parser.add_argument("-v", "--verbose", dest="verbose", help="Print additional progress messages.", action='store_true') - parser.add_argument( "--very-verbose", dest="very_verbose", help="Print additional progress messages.", action='store_true') - - KgtkReaderOptions.add_arguments(parser, mode_options=True, who=KgtkJoiner.LEFT) - KgtkReaderOptions.add_arguments(parser, mode_options=True, who=KgtkJoiner.RIGHT) - KgtkValueOptions.add_arguments(parser) + KgtkReader.add_debug_arguments(parser, expert=True) + KgtkReaderOptions.add_arguments(parser, mode_options=True, who=KgtkJoiner.LEFT, expert=True) + KgtkReaderOptions.add_arguments(parser, mode_options=True, who=KgtkJoiner.RIGHT, expert=True) + KgtkValueOptions.add_arguments(parser, expert=True) args = parser.parse_args() + error_file: typing.TextIO = sys.stdout if args.errors_to_stdout else sys.stderr + # Build the option structures. left_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_args(args, who=KgtkJoiner.LEFT) right_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_args(args, who=KgtkJoiner.RIGHT) @@ -413,6 +414,7 @@ def main(): left_reader_options=left_reader_options, right_reader_options=right_reader_options, value_options=value_options, + error_file=error_file, verbose=args.verbose, very_verbose=args.very_verbose) diff --git a/kgtk/join/test/edgejoiner-test1-inner.sh b/kgtk/join/test/edgejoiner-test1-inner.sh deleted file mode 100755 index a4461d54d..000000000 --- a/kgtk/join/test/edgejoiner-test1-inner.sh +++ /dev/null @@ -1,5 +0,0 @@ -#! /bin/sh -python3 kgtk/join/edgejoiner.py \ - kgtk/join/test/edgejoiner-test1-file1.tsv \ - kgtk/join/test/edgejoiner-test1-file2.tsv \ - --output-file kgtk/join/test/edgejoiner-test1-inner-output.tsv diff --git a/kgtk/join/test/edgejoiner-test1-file1.tsv b/kgtk/join/test/kgtkjoiner-test1-file1.tsv similarity index 100% rename from kgtk/join/test/edgejoiner-test1-file1.tsv rename to kgtk/join/test/kgtkjoiner-test1-file1.tsv diff --git a/kgtk/join/test/edgejoiner-test1-file2.tsv b/kgtk/join/test/kgtkjoiner-test1-file2.tsv similarity index 100% rename from kgtk/join/test/edgejoiner-test1-file2.tsv rename to kgtk/join/test/kgtkjoiner-test1-file2.tsv diff --git a/kgtk/join/test/kgtkjoiner-test1-inner.sh b/kgtk/join/test/kgtkjoiner-test1-inner.sh new file mode 100755 index 000000000..4b02bcd7e --- /dev/null +++ b/kgtk/join/test/kgtkjoiner-test1-inner.sh @@ -0,0 +1,5 @@ +#! /bin/sh +python3 kgtk/join/kgtkjoiner.py \ + kgtk/join/test/kgtkjoiner-test1-file1.tsv \ + kgtk/join/test/kgtkjoiner-test1-file2.tsv \ + --output-file kgtk/join/test/kgtkjoiner-test1-inner-output.tsv From 23a134ed142ae7bc68ba93368897549c5434ed53 Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Mon, 11 May 2020 20:46:46 -0700 Subject: [PATCH 153/278] Remove obsolete test. 
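
The obsolete test is the edge-file assertion in extract_join_key_set():
now that kgtkjoiner joins node files as well as edge files, insisting that
the join-key input be an edge file was wrong.  The old nodejoiner test data
is renamed to kgtkjoiner-test2-* so the existing scenarios exercise
kgtkjoiner directly (see kgtk/join/test/kgtkjoiner-test2-inner.sh below).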
--- kgtk/join/kgtkjoiner.py | 3 --- kgtk/join/test/edgejoiner-test1-left.sh | 6 ------ kgtk/join/test/kgtkjoiner-test1-left.sh | 6 ++++++ ...odejoiner-test1-file1.tsv => kgtkjoiner-test2-file1.tsv} | 0 ...odejoiner-test1-file2.tsv => kgtkjoiner-test2-file2.tsv} | 0 kgtk/join/test/kgtkjoiner-test2-inner.sh | 5 +++++ kgtk/join/test/nodejoiner-test1-inner.sh | 5 ----- 7 files changed, 11 insertions(+), 14 deletions(-) delete mode 100755 kgtk/join/test/edgejoiner-test1-left.sh create mode 100755 kgtk/join/test/kgtkjoiner-test1-left.sh rename kgtk/join/test/{nodejoiner-test1-file1.tsv => kgtkjoiner-test2-file1.tsv} (100%) rename kgtk/join/test/{nodejoiner-test1-file2.tsv => kgtkjoiner-test2-file2.tsv} (100%) create mode 100755 kgtk/join/test/kgtkjoiner-test2-inner.sh delete mode 100755 kgtk/join/test/nodejoiner-test1-inner.sh diff --git a/kgtk/join/kgtkjoiner.py b/kgtk/join/kgtkjoiner.py index 0e7c9dcc8..1f5aafd1d 100644 --- a/kgtk/join/kgtkjoiner.py +++ b/kgtk/join/kgtkjoiner.py @@ -169,9 +169,6 @@ def extract_join_key_set(self, file_path: Path, who: str, join_idx_list: typing. verbose=self.verbose, very_verbose=self.very_verbose) - if not kr.is_edge_file: - raise ValueError("The %s file is not an edge file" % who) - if len(join_idx_list) == 1: # This uses optimized code: return self.single_column_key_set(kr, join_idx_list[0]) # closes er file diff --git a/kgtk/join/test/edgejoiner-test1-left.sh b/kgtk/join/test/edgejoiner-test1-left.sh deleted file mode 100755 index 97ddc3f6f..000000000 --- a/kgtk/join/test/edgejoiner-test1-left.sh +++ /dev/null @@ -1,6 +0,0 @@ -#! /bin/sh -python3 kgtk/join/edgejoiner.py \ - kgtk/join/test/edgejoiner-test1-file1.tsv \ - kgtk/join/test/edgejoiner-test1-file2.tsv \ - --left-join \ - --output-file kgtk/join/test/edgejoiner-test1-left-output.tsv diff --git a/kgtk/join/test/kgtkjoiner-test1-left.sh b/kgtk/join/test/kgtkjoiner-test1-left.sh new file mode 100755 index 000000000..5db9e4c31 --- /dev/null +++ b/kgtk/join/test/kgtkjoiner-test1-left.sh @@ -0,0 +1,6 @@ +#! /bin/sh +python3 kgtk/join/kgtkjoiner.py \ + kgtk/join/test/kgtkjoiner-test1-file1.tsv \ + kgtk/join/test/kgtkjoiner-test1-file2.tsv \ + --left-join \ + --output-file kgtk/join/test/kgtkjoiner-test1-left-output.tsv diff --git a/kgtk/join/test/nodejoiner-test1-file1.tsv b/kgtk/join/test/kgtkjoiner-test2-file1.tsv similarity index 100% rename from kgtk/join/test/nodejoiner-test1-file1.tsv rename to kgtk/join/test/kgtkjoiner-test2-file1.tsv diff --git a/kgtk/join/test/nodejoiner-test1-file2.tsv b/kgtk/join/test/kgtkjoiner-test2-file2.tsv similarity index 100% rename from kgtk/join/test/nodejoiner-test1-file2.tsv rename to kgtk/join/test/kgtkjoiner-test2-file2.tsv diff --git a/kgtk/join/test/kgtkjoiner-test2-inner.sh b/kgtk/join/test/kgtkjoiner-test2-inner.sh new file mode 100755 index 000000000..299b4b089 --- /dev/null +++ b/kgtk/join/test/kgtkjoiner-test2-inner.sh @@ -0,0 +1,5 @@ +#! /bin/sh +python3 kgtk/join/kgtkjoiner.py \ + kgtk/join/test/kgtkjoiner-test2-file1.tsv \ + kgtk/join/test/kgtkjoiner-test2-file2.tsv \ + --output-file kgtk/join/test/kgtkjoiner-test2-inner-output.tsv diff --git a/kgtk/join/test/nodejoiner-test1-inner.sh b/kgtk/join/test/nodejoiner-test1-inner.sh deleted file mode 100755 index 6827347f2..000000000 --- a/kgtk/join/test/nodejoiner-test1-inner.sh +++ /dev/null @@ -1,5 +0,0 @@ -#! 
/bin/sh -python3 kgtk/join/nodejoiner.py \ - kgtk/join/test/nodejoiner-test1-file1.tsv \ - kgtk/join/test/nodejoiner-test1-file2.tsv \ - --output-file kgtk/join/test/nodejoiner-test1-inner-output.tsv From 1cc18e335356628c90662c71927d2db291f6b7d6 Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Mon, 11 May 2020 20:48:07 -0700 Subject: [PATCH 154/278] Updated scripts. --- kgtk/join/test/kgtkjoiner-test2-left.sh | 6 ++++++ kgtk/join/test/nodejoiner-test1-left.sh | 6 ------ 2 files changed, 6 insertions(+), 6 deletions(-) create mode 100755 kgtk/join/test/kgtkjoiner-test2-left.sh delete mode 100755 kgtk/join/test/nodejoiner-test1-left.sh diff --git a/kgtk/join/test/kgtkjoiner-test2-left.sh b/kgtk/join/test/kgtkjoiner-test2-left.sh new file mode 100755 index 000000000..37ade789e --- /dev/null +++ b/kgtk/join/test/kgtkjoiner-test2-left.sh @@ -0,0 +1,6 @@ +#! /bin/sh +python3 kgtk/join/kgtkjoiner.py \ + kgtk/join/test/kgtkjoiner-test2-file1.tsv \ + kgtk/join/test/kgtkjoiner-test2-file2.tsv \ + --left-join \ + --output-file kgtk/join/test/kgtkjoiner-test2-left-output.tsv diff --git a/kgtk/join/test/nodejoiner-test1-left.sh b/kgtk/join/test/nodejoiner-test1-left.sh deleted file mode 100755 index c5bee612c..000000000 --- a/kgtk/join/test/nodejoiner-test1-left.sh +++ /dev/null @@ -1,6 +0,0 @@ -#! /bin/sh -python3 kgtk/join/nodejoiner.py \ - kgtk/join/test/nodejoiner-test1-file1.tsv \ - kgtk/join/test/nodejoiner-test1-file2.tsv \ - --left-join \ - --output-file kgtk/join/test/nodejoiner-test1-left-output.tsv From dda432753ab59e40b191008f7f51d1d1841dc7c2 Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Mon, 11 May 2020 20:50:19 -0700 Subject: [PATCH 155/278] Update the command syntax. --- kgtk/join/test/ifexists-test1-default.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kgtk/join/test/ifexists-test1-default.sh b/kgtk/join/test/ifexists-test1-default.sh index a03a36cc0..3a3da6cd0 100755 --- a/kgtk/join/test/ifexists-test1-default.sh +++ b/kgtk/join/test/ifexists-test1-default.sh @@ -1,5 +1,5 @@ #! /bin/sh python3 kgtk/join/ifexists.py \ kgtk/join/test/ifexists-test1-file1.tsv \ - kgtk/join/test/ifexists-test1-file2.tsv \ + --filter-on kgtk/join/test/ifexists-test1-file2.tsv \ --output-file kgtk/join/test/ifexists-test1-default-output.tsv From 088b938864f4775f6d56077d645474a2e1b3dac4 Mon Sep 17 00:00:00 2001 From: Rongpeng Date: Mon, 11 May 2020 20:58:18 -0700 Subject: [PATCH 156/278] enable users to define properties in the input kgtk file --- kgtk/triple_generator.py | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/kgtk/triple_generator.py b/kgtk/triple_generator.py index 1ed5b5414..04db7d733 100644 --- a/kgtk/triple_generator.py +++ b/kgtk/triple_generator.py @@ -43,6 +43,16 @@ def __init__( ): from etk.wikidata.statement import Rank self.ignore = ignore + self.datatype_mapping = { + "item": Item, + "time": TimeValue, + "globe-coordinate": GlobeCoordinate, + "quantity": QuantityValue, + "monolingualtext": MonolingualText, + "string": StringValue, + "external-identifier": ExternalIdentifier, + "url": URLValue + } self.prop_types = self.set_properties(prop_file) self.label_set, self.alias_set, self.description_set = self.set_sets( label_set, alias_set, description_set @@ -68,6 +78,7 @@ def __init__( self.order_map = {} self.use_id = use_id + def _node_2_entity(self, node: str): ''' A node can be Qxxx or Pxxx, return the proper entity. 
@@ -79,23 +90,13 @@ def _node_2_entity(self, node: str): return entity def set_properties(self, prop_file: str): - datatype_mapping = { - "item": Item, - "time": TimeValue, - "globe-coordinate": GlobeCoordinate, - "quantity": QuantityValue, - "monolingualtext": MonolingualText, - "string": StringValue, - "external-identifier": ExternalIdentifier, - "url": URLValue - } with open(prop_file, "r") as fp: props = fp.readlines() prop_types = {} for line in props[1:]: node1, _, node2 = line.split("\t") try: - prop_types[node1] = datatype_mapping[node2.strip()] + prop_types[node1] = self.datatype_mapping[node2.strip()] except: if not self.ignore: raise KGTKException( @@ -195,7 +196,11 @@ def generate_alias_triple(self, node1: str, label: str, node2: str) -> bool: return True def generate_prop_declaration_triple(self, node1: str, label: str, node2: str) -> bool: - prop = WDProperty(node1, self.prop_types[node1]) + # update the known prop_types + if node1 in self.prop_types: + raise KGTKException("Duplicated property definition of {} found!".format(node1)) + self.prop_types[node1] = node2 + prop = WDProperty(node1, self.datatype_mapping[node2]) self.doc.kg.add_subject(prop) return True @@ -420,7 +425,7 @@ def entry_point(self, line_number: int, edge: str): success = self.generate_description_triple(node1, prop, node2) elif prop in self.alias_set: success = self.generate_alias_triple(node1, prop, node2) - elif prop == "type": + elif prop == "data_type": # special edge of prop declaration success = self.generate_prop_declaration_triple( node1, prop, node2) From b83a945cc1f44d6bff461c3a914788a986012a41 Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Mon, 11 May 2020 20:59:21 -0700 Subject: [PATCH 157/278] Demonstrate filterint on a quasi-KGTK file without a node1 column. --- kgtk/join/test/ifexists-test2-file1.tsv | 7 +++++++ kgtk/join/test/ifexists-test2-file2.tsv | 4 ++++ kgtk/join/test/ifexists-test2-label-and-node2.sh | 8 ++++++++ 3 files changed, 19 insertions(+) create mode 100644 kgtk/join/test/ifexists-test2-file1.tsv create mode 100644 kgtk/join/test/ifexists-test2-file2.tsv create mode 100755 kgtk/join/test/ifexists-test2-label-and-node2.sh diff --git a/kgtk/join/test/ifexists-test2-file1.tsv b/kgtk/join/test/ifexists-test2-file1.tsv new file mode 100644 index 000000000..0e648d2c3 --- /dev/null +++ b/kgtk/join/test/ifexists-test2-file1.tsv @@ -0,0 +1,7 @@ +node1 label node2 location +john zipcode 12345 home +john zipcode 12346 work +peter zipcode 12040 home +peter zipcode 12041 work +steve zipcode 45601 home +steve zipcode 45602 work diff --git a/kgtk/join/test/ifexists-test2-file2.tsv b/kgtk/join/test/ifexists-test2-file2.tsv new file mode 100644 index 000000000..721f644c0 --- /dev/null +++ b/kgtk/join/test/ifexists-test2-file2.tsv @@ -0,0 +1,4 @@ +label node2 +zipcode 12345 +zipcode 12040 +zipcode 45601 diff --git a/kgtk/join/test/ifexists-test2-label-and-node2.sh b/kgtk/join/test/ifexists-test2-label-and-node2.sh new file mode 100755 index 000000000..d2fea5d62 --- /dev/null +++ b/kgtk/join/test/ifexists-test2-label-and-node2.sh @@ -0,0 +1,8 @@ +#! 
/bin/sh +python3 kgtk/join/ifexists.py \ + kgtk/join/test/ifexists-test2-file1.tsv \ + --input-keys label node2 \ + --filter-on kgtk/join/test/ifexists-test2-file2.tsv \ + --filter-keys label node2 \ + --filter-mode NONE \ + --output-file kgtk/join/test/ifexists-test1-node1-output.tsv From ea67db5e16e7eeac2e3a9d3f2b7e24393e79cc1f Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Mon, 11 May 2020 21:03:17 -0700 Subject: [PATCH 158/278] Demonstrate kgtk ifexists with a quasi-KGTK filter file. --- kgtk/join/test/kgtk-ifexists-test2-label-and-node2.sh | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100755 kgtk/join/test/kgtk-ifexists-test2-label-and-node2.sh diff --git a/kgtk/join/test/kgtk-ifexists-test2-label-and-node2.sh b/kgtk/join/test/kgtk-ifexists-test2-label-and-node2.sh new file mode 100755 index 000000000..5e8c295ce --- /dev/null +++ b/kgtk/join/test/kgtk-ifexists-test2-label-and-node2.sh @@ -0,0 +1,8 @@ +#! /bin/sh +python3 -m kgtk ifexists \ + kgtk/join/test/ifexists-test2-file1.tsv \ + --input-keys label node2 \ + --filter-on kgtk/join/test/ifexists-test2-file2.tsv \ + --filter-keys label node2 \ + --filter-mode NONE \ + --output-file kgtk/join/test/ifexists-test1-node1-output.tsv From f4599401489684b75172d4f6e29a373f49dedc85 Mon Sep 17 00:00:00 2001 From: Rongpeng Date: Mon, 11 May 2020 21:09:14 -0700 Subject: [PATCH 159/278] moved datatype_mapping to class attributes --- kgtk/triple_generator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kgtk/triple_generator.py b/kgtk/triple_generator.py index 04db7d733..e646d6fa2 100644 --- a/kgtk/triple_generator.py +++ b/kgtk/triple_generator.py @@ -84,7 +84,7 @@ def _node_2_entity(self, node: str): A node can be Qxxx or Pxxx, return the proper entity. ''' if node in self.prop_types: - entity = WDProperty(node, self.prop_types[node]) + entity = WDProperty(node, self.datatype_mapping[self.prop_types[node]]) else: entity = WDItem(TripleGenerator.replace_illegal_string(node)) return entity From d3358c4ee052bfcfa7714533ebcd741af6623f21 Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Mon, 11 May 2020 21:15:47 -0700 Subject: [PATCH 160/278] Update column mapping to handle node files as well as edge files. --- kgtk/join/kgtkjoiner.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/kgtk/join/kgtkjoiner.py b/kgtk/join/kgtkjoiner.py index 1f5aafd1d..d0d2f0f4c 100644 --- a/kgtk/join/kgtkjoiner.py +++ b/kgtk/join/kgtkjoiner.py @@ -226,14 +226,12 @@ def merge_columns(self, left_kr: KgtkReader, right_kr: KgtkReader)->typing.Tuple idx: int = 0 for column_name in right_kr.column_names: - if idx == right_kr.node1_column_idx: - # The right file is an edge file and this is its node1 column index. - if left_kr.node1_column_idx >= 0: - # The left file has a node1 column. Map to that. - column_name = left_kr.column_names[left_kr.node1_column_idx] - else: - # Apparently we don't have a destination in the left file. Punt. - raise ValueError("Can't map right join column name to the left file #2.") + if idx == right_kr.id_column_idx and left_kr.id_column_idx >= 0: + # Map the id columns to the name used in the left file. 
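+                # (Left-file precedence is the design choice throughout this
+                # mapping: whichever alias the left file uses becomes the
+                # column name in the joined output.)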
+ column_name = left_kr.column_names[left_kr.id_column_idx] + elif idx == right_kr.node1_column_idx and left_kr.node1_column_idx >= 0: + # Map the node1 columns to the name used in the left file, + column_name = left_kr.column_names[left_kr.node1_column_idx] elif idx == right_kr.label_column_idx and left_kr.label_column_idx >= 0: # Map the right file's label column to the left file's label column. column_name = left_kr.column_names[left_kr.label_column_idx] From 9ad07e98c6fce54a32116338066777ecb3f2a6d1 Mon Sep 17 00:00:00 2001 From: saggu Date: Tue, 12 May 2020 11:04:48 -0700 Subject: [PATCH 161/278] update regex pattern --- kgtk/triple_generator.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/kgtk/triple_generator.py b/kgtk/triple_generator.py index 61133179c..ddc5d051c 100644 --- a/kgtk/triple_generator.py +++ b/kgtk/triple_generator.py @@ -73,13 +73,13 @@ def __init__( self.yyyy_mm_dd_pattern = re.compile( "[12]\d{3}-(0[1-9]|1[0-2])-(0[1-9]|[12]\d|3[01])") self.yyyy_pattern = re.compile("[12]\d{3}") + # self.quantity_pattern = re.compile("([\+|\-]?[0-9]+\.?[0-9]*)(?:\[([\+|\-]?[0-9]+\.?[0-9]*),([\+|\-]?[0-9]+\.?[0-9]*)\])?([U|Q](?:[0-9]+))?") self.quantity_pattern = re.compile( - "([\+|\-]?[0-9]+\.?[0-9]*)(?:\[([\+|\-]?[0-9]+\.?[0-9]*),([\+|\-]?[0-9]+\.?[0-9]*)\])?([U|Q](?:[0-9]+))?") + "([\+|\-]?[0-9]+\.?[0-9]*[e|E]?[\-]?[0-9]*)(?:\[([\+|\-]?[0-9]+\.?[0-9]*),([\+|\-]?[0-9]+\.?[0-9]*)\])?([U|Q](?:[0-9]+))?") # order map, know the column index of ["node1","property","node2",id] self.order_map = {} self.use_id = use_id - def _node_2_entity(self, node: str): ''' A node can be Qxxx or Pxxx, return the proper entity. @@ -276,12 +276,13 @@ def generate_normal_triple( elif edge_type == QuantityValue: # +70[+60,+80]Q743895 + res = self.quantity_pattern.match(node2).groups() amount, lower_bound, upper_bound, unit = res amount = TripleGenerator.clean_number_string(amount) num_type = self.xsd_number_type(amount) - + lower_bound = TripleGenerator.clean_number_string(lower_bound) upper_bound = TripleGenerator.clean_number_string(upper_bound) if unit != None: From 599ae0badc76926164eecfe349584b6af6124715 Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Tue, 12 May 2020 12:38:19 -0700 Subject: [PATCH 162/278] Ensure that all special column indexes are looked up. Share code better between KgtkReader and its subclasses. 
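
After this change, KgtkReader.open() resolves all four special columns
(node1/label/node2/id, including their aliases) in one place and then
dispatches to EdgeReader or NodeReader itself.  A minimal sketch of the
resulting calling convention ("edges.tsv" is a hypothetical input; absent
columns report index -1):

    from pathlib import Path
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderMode

    kr = KgtkReader.open(Path("edges.tsv"), mode=KgtkReaderMode.AUTO)
    print(kr.node1_column_idx, kr.label_column_idx,
          kr.node2_column_idx, kr.id_column_idx)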
--- kgtk/io/edgereader.py | 71 +++++++---------------- kgtk/io/kgtkbase.py | 51 +++++++++-------- kgtk/io/kgtkreader.py | 128 ++++++++++++++++++++---------------------- kgtk/io/kgtkwriter.py | 20 +++---- kgtk/io/nodereader.py | 67 +++++++--------------- 5 files changed, 135 insertions(+), 202 deletions(-) diff --git a/kgtk/io/edgereader.py b/kgtk/io/edgereader.py index c1234ea78..3225c4579 100644 --- a/kgtk/io/edgereader.py +++ b/kgtk/io/edgereader.py @@ -10,7 +10,7 @@ import sys import typing -from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions +from kgtk.io.kgtkreader import KgtkReader, KgtkReaderMode, KgtkReaderOptions from kgtk.utils.closableiter import ClosableIter from kgtk.utils.enumnameaction import EnumNameAction from kgtk.utils.validationaction import ValidationAction @@ -28,56 +28,24 @@ def open_edge_file(cls, verbose: bool = False, very_verbose: bool = False)->"EdgeReader": - # Supply the default reader and value options: - (options, value_options) = cls._default_options(options, value_options) - - source: ClosableIter[str] = cls._openfile(file_path, options=options, error_file=error_file, verbose=verbose) - - # Read the edge file header and split it into column names. - header: str - column_names: typing.List[str] - (header, column_names) = cls._build_column_names(source, options=options, error_file=error_file, verbose=verbose) - - # Check for unsafe column names. - cls.check_column_names(column_names, - header_line=header, - error_action=options.unsafe_column_name_action, - error_file=error_file) - - # Build a map from column name to column index. - column_name_map: typing.Mapping[str, int] = cls.build_column_name_map(column_names, - header_line=header, - error_action=options.header_error_action, - error_file=error_file) - # Get the indices of the required columns. - node1_column_idx: int - node2_column_idx: int - label_column_idx: int - (node1_column_idx, node2_column_idx, label_column_idx) = cls.required_edge_columns(column_name_map, - header_line=header, - error_action=options.header_error_action, - error_file=error_file) - - if verbose: - print("EdgeReader: Reading an edge file. node1=%d label=%d node2=%d" % (node1_column_idx, label_column_idx, node2_column_idx)) - - - return cls(file_path=file_path, - source=source, - column_names=column_names, - column_name_map=column_name_map, - column_count=len(column_names), - node1_column_idx=node1_column_idx, - node2_column_idx=node2_column_idx, - label_column_idx=label_column_idx, - error_file=error_file, - options=options, - value_options=value_options, - is_edge_file=True, - is_node_file=False, - verbose=verbose, - very_verbose=very_verbose, - ) + result: KgtkReader = cls.open(file_path=file_path, + error_file=error_file, + mode=KgtkReaderMode.EDGE, + options=options, + value_options=value_options, + verbose=verbose, + very_verbose=very_verbose) + # This doesn't work because the EdgeReader imported inside KgtkReader + # is a different class than this one! + # + # TODO: Fix this. + # + #if isinstance(result, cls): + # return result + #else: + # # TODO: throw a better exception + # raise ValueError("open_edge_file expected to produce an EdgeReader") + return typing.cast(EdgeReader, result) def _ignore_if_blank_required_fields(self, values: typing.List[str], line: str)->bool: # Ignore line_action with blank node1 fields. 
This code comes after @@ -126,7 +94,6 @@ def main(): error_file=error_file, options=reader_options, value_options=value_options, - column_separator=args.column_separator, verbose=args.verbose, very_verbose=args.very_verbose) line_count: int = 0 diff --git a/kgtk/io/kgtkbase.py b/kgtk/io/kgtkbase.py index 7cbcef44c..afa7093b9 100644 --- a/kgtk/io/kgtkbase.py +++ b/kgtk/io/kgtkbase.py @@ -145,34 +145,39 @@ def build_column_name_map(cls, return column_name_map @classmethod - def required_edge_columns(cls, - column_name_map: typing.Mapping[str, int], - header_line: str, - error_action: ValidationAction, - error_file: typing.TextIO = sys.stderr - )->typing.Tuple[int, int, int]: - # Ensure that the three required columns are present: + def get_special_columns(cls, + column_name_map: typing.Mapping[str, int], + header_line: str, + error_action: ValidationAction, + error_file: typing.TextIO = sys.stderr, + is_edge_file: bool = False, + is_node_file: bool = False, + )->typing.Tuple[int, int, int, int]: + """ + Four predefined column names are special: they may have name aliases, and + they may be required in ede or node files. + + """ + + # These three predefined columns columns are required for edge files: node1_column_idx: int = cls.get_column_idx(cls.NODE1_COLUMN_NAMES, column_name_map, - header_line=header_line, error_action=error_action, error_file=error_file) + header_line=header_line, error_action=error_action, error_file=error_file, + is_optional=not is_edge_file) - node2_column_idx: int = cls.get_column_idx(cls.NODE2_COLUMN_NAMES, column_name_map, - header_line=header_line, error_action=error_action, error_file=error_file) - label_column_idx: int = cls.get_column_idx(cls.LABEL_COLUMN_NAMES, column_name_map, - header_line=header_line, error_action=error_action, error_file=error_file) + header_line=header_line, error_action=error_action, error_file=error_file, + is_optional=not is_edge_file) - return (node1_column_idx, node2_column_idx, label_column_idx) + node2_column_idx: int = cls.get_column_idx(cls.NODE2_COLUMN_NAMES, column_name_map, + header_line=header_line, error_action=error_action, error_file=error_file, + is_optional=not is_edge_file) + + # This predefined column is required for node files: + id_column_idx: int = cls.get_column_idx(cls.ID_COLUMN_NAMES, column_name_map, + header_line=header_line, error_action=error_action, error_file=error_file, + is_optional=not is_node_file) - @classmethod - def required_node_column(cls, - column_name_map: typing.Mapping[str, int], - header_line: str, - error_action: ValidationAction, - error_file: typing.TextIO = sys.stderr - )->int: - # Ensure that the required column is present: - return cls.get_column_idx(cls.ID_COLUMN_NAMES, column_name_map, - header_line=header_line, error_action=error_action, error_file=error_file) + return (node1_column_idx, label_column_idx, node2_column_idx, id_column_idx) @classmethod def additional_edge_columns(cls, column_names: typing.List[str])->typing.List[str]: diff --git a/kgtk/io/kgtkreader.py b/kgtk/io/kgtkreader.py index ed6de11d9..abc5e76fa 100644 --- a/kgtk/io/kgtkreader.py +++ b/kgtk/io/kgtkreader.py @@ -326,10 +326,16 @@ class KgtkReader(KgtkBase, ClosableIter[typing.List[str]]): column_name_map: typing.Mapping[str, int] = attr.ib(validator=attr.validators.deep_mapping(key_validator=attr.validators.instance_of(str), value_validator=attr.validators.instance_of(int))) - # The index of the mandatory columns. -1 means missing: + # The actual mode used. + # + # TODO: fix the validator. 
+ # mode: KgtkReaderMode = attr.ib(validator=attr.validators.instance_of(KgtkReaderMode), default=KgtkReaderMode.NONE) + mode: KgtkReaderMode = attr.ib(default=KgtkReaderMode.NONE) + + # The index of the mandatory/aliased columns. -1 means missing: node1_column_idx: int = attr.ib(validator=attr.validators.instance_of(int), default=-1) # edge file - node2_column_idx: int = attr.ib(validator=attr.validators.instance_of(int), default=-1) # edge file label_column_idx: int = attr.ib(validator=attr.validators.instance_of(int), default=-1) # edge file + node2_column_idx: int = attr.ib(validator=attr.validators.instance_of(int), default=-1) # edge file id_column_idx: int = attr.ib(validator=attr.validators.instance_of(int), default=-1) # node file data_lines_read: int = attr.ib(validator=attr.validators.instance_of(int), default=0) @@ -364,6 +370,7 @@ def _default_options( def open(cls, file_path: typing.Optional[Path], error_file: typing.TextIO = sys.stderr, + mode: typing.Optional[KgtkReaderMode] = None, options: typing.Optional[KgtkReaderOptions] = None, value_options: typing.Optional[KgtkValueOptions] = None, verbose: bool = False, @@ -395,9 +402,11 @@ def open(cls, error_file=error_file) # Should we automatically determine if this is an edge file or a node file? + if mode is None: + mode = options.mode is_edge_file: bool = False is_node_file: bool = False - if options.mode is KgtkReaderMode.AUTO: + if mode is KgtkReaderMode.AUTO: # If we have a node1 (or alias) column, then this must be an edge file. Otherwise, assume it is a node file. node1_idx: int = cls.get_column_idx(cls.NODE1_COLUMN_NAMES, column_name_map, @@ -416,88 +425,71 @@ def open(cls, if verbose: print("node1 column not found, assuming this is a KGTK node file", file=error_file, flush=True) - elif options.mode is KgtkReaderMode.EDGE: + elif mode is KgtkReaderMode.EDGE: is_edge_file = True - elif options.mode is KgtkReaderMode.NODE: + elif mode is KgtkReaderMode.NODE: is_node_file = True - elif options.mode is KgtkReaderMode.NONE: + elif mode is KgtkReaderMode.NONE: pass + # Get the indices of the special columns. + node1_column_idx: int + label_column_idx: int + node2_column_idx: int + id_column_idx: int + (node1_column_idx, + label_column_idx, + node2_column_idx, + id_column_idx) = cls.get_special_columns(column_name_map, + header_line=header, + error_action=options.header_error_action, + error_file=error_file, + is_edge_file=is_edge_file, + is_node_file=is_node_file) + + if verbose: + print("KgtkReader: Special columns: node1=%d label=%d node2=%d id=%d" % (node1_column_idx, + label_column_idx, + node2_column_idx, + id_column_idx), file=error_file, flush=True) if is_edge_file: # We'll instantiate an EdgeReader, which is a subclass of KgtkReader. # The EdgeReader import is deferred to avoid circular imports. from kgtk.io.edgereader import EdgeReader - # Get the indices of the required columns. - node1_column_idx: int - node2_column_idx: int - label_column_idx: int - (node1_column_idx, node2_column_idx, label_column_idx) = cls.required_edge_columns(column_name_map, - header_line=header, - error_action=options.header_error_action, - error_file=error_file) - if verbose: - print("KgtkReader: Reading an edge file. 
node1=%d label=%d node2=%d" % (node1_column_idx, label_column_idx, node2_column_idx), file=error_file, flush=True) - - return EdgeReader(file_path=file_path, - source=source, - column_names=column_names, - column_name_map=column_name_map, - column_count=len(column_names), - node1_column_idx=node1_column_idx, - node2_column_idx=node2_column_idx, - label_column_idx=label_column_idx, - error_file=error_file, - options=options, - value_options=value_options, - is_edge_file=is_edge_file, - is_node_file=is_node_file, - verbose=verbose, - very_verbose=very_verbose) + print("KgtkReader: Reading an edge file.", file=error_file, flush=True) + + cls = EdgeReader elif is_node_file: # We'll instantiate an NodeReader, which is a subclass of KgtkReader. # The NodeReader import is deferred to avoid circular imports. from kgtk.io.nodereader import NodeReader - # Get the index of the required column: - id_column_idx: int = cls.required_node_column(column_name_map, - header_line=header, - error_action=options.header_error_action, - error_file=error_file) - if verbose: - print("KgtkReader: Reading an node file. id=%d" % (id_column_idx), file=error_file, flush=True) - - return NodeReader(file_path=file_path, - source=source, - column_names=column_names, - column_name_map=column_name_map, - column_count=len(column_names), - id_column_idx=id_column_idx, - error_file=error_file, - options=options, - value_options=value_options, - is_edge_file=is_edge_file, - is_node_file=is_node_file, - verbose=verbose, - very_verbose=very_verbose, - ) - else: - return cls(file_path=file_path, - source=source, - column_names=column_names, - column_name_map=column_name_map, - column_count=len(column_names), - error_file=error_file, - options=options, - value_options=value_options, - is_edge_file=is_edge_file, - is_node_file=is_node_file, - verbose=verbose, - very_verbose=very_verbose, - ) + print("KgtkReader: Reading an node file.", file=error_file, flush=True) + + cls = NodeReader + + return cls(file_path=file_path, + source=source, + column_names=column_names, + column_name_map=column_name_map, + column_count=len(column_names), + mode=mode, + node1_column_idx=node1_column_idx, + label_column_idx=label_column_idx, + node2_column_idx=node2_column_idx, + id_column_idx=id_column_idx, + error_file=error_file, + options=options, + value_options=value_options, + is_edge_file=is_edge_file, + is_node_file=is_node_file, + verbose=verbose, + very_verbose=very_verbose, + ) @classmethod def _open_compressed_file(cls, diff --git a/kgtk/io/kgtkwriter.py b/kgtk/io/kgtkwriter.py index d3d256c07..f740e4c55 100644 --- a/kgtk/io/kgtkwriter.py +++ b/kgtk/io/kgtkwriter.py @@ -202,18 +202,14 @@ def _setup(cls, elif mode is KgtkWriter.Mode.NONE: pass - if is_edge_file: - # Validate that we have the proper columns for an edge file. - cls.required_edge_columns(column_name_map, - header_line=header, - error_action=header_error_action, - error_file=error_file) - elif is_node_file: - # Validate that we have the proper columns for an node file. - cls.required_node_column(column_name_map, - header_line=header, - error_action=header_error_action, - error_file=error_file) + # Validate that we have the proper columns for an edge or node file, + # ignoring the result. + cls.get_special_columns(column_name_map, + header_line=header, + error_action=header_error_action, + error_file=error_file, + is_edge_file=is_edge_file, + is_node_file=is_node_file) # Write the column names to the first line. 
if verbose: diff --git a/kgtk/io/nodereader.py b/kgtk/io/nodereader.py index 4d73ca3a6..bf74beb85 100644 --- a/kgtk/io/nodereader.py +++ b/kgtk/io/nodereader.py @@ -10,7 +10,7 @@ import sys import typing -from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions +from kgtk.io.kgtkreader import KgtkReader, KgtkReaderMode, KgtkReaderOptions from kgtk.utils.closableiter import ClosableIter from kgtk.utils.enumnameaction import EnumNameAction from kgtk.utils.validationaction import ValidationAction @@ -28,50 +28,24 @@ def open_node_file(cls, verbose: bool = False, very_verbose: bool = False)->"NodeReader": - # Supply the default reader and value options: - (options, value_options) = cls._default_options(options, value_options) - - source: ClosableIter[str] = cls._openfile(file_path, options=options, error_file=error_file, verbose=verbose) - - # Read the edge file header and split it into column names. - header: str - column_names: typing.List[str] - (header, column_names) = cls._build_column_names(source, options=options, error_file=error_file, verbose=verbose) - - # Check for unsafe column names. - cls.check_column_names(column_names, - header_line=header, - error_action=options.unsafe_column_name_action, - error_file=error_file) - - # Build a map from column name to column index. - column_name_map: typing.Mapping[str, int] = cls.build_column_name_map(column_names, - header_line=header, - error_action=options.header_error_action, - error_file=error_file) - # Get the index of the required column. - id_column_idx: int = cls.required_node_column(column_name_map, - header_line=header, - error_action=options.header_error_action, - error_file=error_file) - - if verbose: - print("NodeReader: Reading an node file. id=%d" % (id_column_idx)) - - return cls(file_path=file_path, - source=source, - column_names=column_names, - column_name_map=column_name_map, - column_count=len(column_names), - id_column_idx=id_column_idx, - error_file=error_file, - options=options, - value_options=value_options, - is_edge_file=False, - is_node_file=True, - verbose=verbose, - very_verbose=very_verbose, - ) + result: KgtkReader = cls.open(file_path=file_path, + error_file=error_file, + mode=KgtkReaderMode.NODE, + options=options, + value_options=value_options, + verbose=verbose, + very_verbose=very_verbose) + # This doesn't work because the EdgeReader imported inside KgtkReader + # is a different class than this one! + # + # TODO: Fix this. + # + #if isinstance(result, cls): + # return result + #else: + # # TODO: throw a better exception + # raise ValueError("open_node_file expected to produce a NodeReader") + return typing.cast(NodeReader, result) def _ignore_if_blank_required_fields(self, values: typing.List[str], line: str)->bool: # Ignore line_action with blank id fields. This code comes after @@ -105,11 +79,10 @@ def main(): reader_options: KgtkReaderOptions = KgtkReaderOptions.from_args(args, mode=KgtkReaderMode.NODE) value_options: KgtkValueOptions = KgtkValueOptions.from_args(args) - nr: NodeReader = NodeReader.open_edge_file(args.kgtk_file, + nr: NodeReader = NodeReader.open_node_file(args.kgtk_file, error_file=error_file, options=reader_options, value_options=value_options, - column_separator=args.column_separator, verbose=args.verbose, very_verbose=args.very_verbose) line_count: int = 0 From fd78a76c3409d44608854a6c27ba49e91456e994 Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Tue, 12 May 2020 12:40:52 -0700 Subject: [PATCH 163/278] Use the proper output file. 
--- kgtk/join/test/ifexists-test2-label-and-node2.sh | 2 +- kgtk/join/test/kgtk-ifexists-test2-label-and-node2.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kgtk/join/test/ifexists-test2-label-and-node2.sh b/kgtk/join/test/ifexists-test2-label-and-node2.sh index d2fea5d62..7f4cb0850 100755 --- a/kgtk/join/test/ifexists-test2-label-and-node2.sh +++ b/kgtk/join/test/ifexists-test2-label-and-node2.sh @@ -5,4 +5,4 @@ python3 kgtk/join/ifexists.py \ --filter-on kgtk/join/test/ifexists-test2-file2.tsv \ --filter-keys label node2 \ --filter-mode NONE \ - --output-file kgtk/join/test/ifexists-test1-node1-output.tsv + --output-file kgtk/join/test/ifexists-test2-label-and-node2-output.tsv diff --git a/kgtk/join/test/kgtk-ifexists-test2-label-and-node2.sh b/kgtk/join/test/kgtk-ifexists-test2-label-and-node2.sh index 5e8c295ce..2a98c2b7f 100755 --- a/kgtk/join/test/kgtk-ifexists-test2-label-and-node2.sh +++ b/kgtk/join/test/kgtk-ifexists-test2-label-and-node2.sh @@ -5,4 +5,4 @@ python3 -m kgtk ifexists \ --filter-on kgtk/join/test/ifexists-test2-file2.tsv \ --filter-keys label node2 \ --filter-mode NONE \ - --output-file kgtk/join/test/ifexists-test1-node1-output.tsv + --output-file kgtk/join/test/ifexists-test2-label-and-node2-output.tsv From 6e969b301d469fad50326c870d86e9ef16abc621 Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Tue, 12 May 2020 13:01:28 -0700 Subject: [PATCH 164/278] Pass error_file and verbose flags in open. --- kgtk/join/kgtkjoiner.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/kgtk/join/kgtkjoiner.py b/kgtk/join/kgtkjoiner.py index d0d2f0f4c..f51e0eba7 100644 --- a/kgtk/join/kgtkjoiner.py +++ b/kgtk/join/kgtkjoiner.py @@ -166,6 +166,7 @@ def extract_join_key_set(self, file_path: Path, who: str, join_idx_list: typing. kr: KgtkReader = KgtkReader.open(file_path, options=reader_options, value_options = self.value_options, + error_file=self.error_file, verbose=self.verbose, very_verbose=self.very_verbose) @@ -256,6 +257,9 @@ def process(self): left_kr: KgtkReader = KgtkReader.open(self.left_file_path, options=self.left_reader_options, value_options = self.value_options, + error_file=self.error_file, + verbose=self.verbose, + very_verbose=self.very_verbose ) @@ -264,6 +268,9 @@ def process(self): right_kr: KgtkReader = KgtkReader.open(self.right_file_path, options=self.right_reader_options, value_options = self.value_options, + error_file=self.error_file, + verbose=self.verbose, + very_verbose=self.very_verbose ) if left_kr.is_edge_file and right_kr.is_edge_file: From 2ae42e79f3deb0b3f8b7c97f9268064aa81a8231 Mon Sep 17 00:00:00 2001 From: Rongpeng Date: Tue, 12 May 2020 14:46:23 -0700 Subject: [PATCH 165/278] fix a bug when property is defined in prop_file --- kgtk/triple_generator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kgtk/triple_generator.py b/kgtk/triple_generator.py index 61133179c..51aeb52bf 100644 --- a/kgtk/triple_generator.py +++ b/kgtk/triple_generator.py @@ -85,7 +85,7 @@ def _node_2_entity(self, node: str): A node can be Qxxx or Pxxx, return the proper entity. 
''' if node in self.prop_types: - entity = WDProperty(node, self.datatype_mapping[self.prop_types[node]]) + entity = WDProperty(node, self.prop_types[node]) else: entity = WDItem(TripleGenerator.replace_illegal_string(node)) return entity From b21d9e39048831ffded5cb19ae0edc25a79fd923 Mon Sep 17 00:00:00 2001 From: Rongpeng Date: Tue, 12 May 2020 15:07:41 -0700 Subject: [PATCH 166/278] current workaround of handling url --- kgtk/triple_generator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kgtk/triple_generator.py b/kgtk/triple_generator.py index 9c1bda570..3aa85ffa9 100644 --- a/kgtk/triple_generator.py +++ b/kgtk/triple_generator.py @@ -52,7 +52,7 @@ def __init__( "monolingualtext": MonolingualText, "string": StringValue, "external-identifier": ExternalIdentifier, - "url": URLValue + "url": StringValue } self.prop_types = self.set_properties(prop_file) self.label_set, self.alias_set, self.description_set = self.set_sets( From 3d88fa4cde1902c31034f9b0438c5e055a0f5887 Mon Sep 17 00:00:00 2001 From: Rongpeng Date: Tue, 12 May 2020 16:14:06 -0700 Subject: [PATCH 167/278] fixing the bug when there is trailing tab, the edge_list drops empty list members --- kgtk/triple_generator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kgtk/triple_generator.py b/kgtk/triple_generator.py index 9c1bda570..9ee2c0b64 100644 --- a/kgtk/triple_generator.py +++ b/kgtk/triple_generator.py @@ -374,7 +374,7 @@ def entry_point(self, line_number: int, edge: str): Call corresponding downstream functions """ - edge_list = edge.strip().split("\t") + edge_list = edge.strip("\n").split("\t") l = len(edge_list) if line_number == 1: # initialize the order_map From f0d546018e290b28464ea393d28cfdd8efcb273a Mon Sep 17 00:00:00 2001 From: ckxz105 Date: Tue, 12 May 2020 16:45:56 -0700 Subject: [PATCH 168/278] fix bug if meet empty node value for text embedding --- kgtk/gt/embedding_utils.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/kgtk/gt/embedding_utils.py b/kgtk/gt/embedding_utils.py index 8a4b2b833..a4bfda51a 100644 --- a/kgtk/gt/embedding_utils.py +++ b/kgtk/gt/embedding_utils.py @@ -430,10 +430,15 @@ def read_input(self, file_path: str, target_properties: dict, property_labels_di if "@" in node_value and node_value[0] != "@": node_value = node_value[:node_value.index("@")] + # in case we meet an empty value, skip it + if node_value == "": + self._logger.warning("""Skip line "{}" because of empty value.""".format(each_line)) + continue + # remove extra double quote " and single quote ' - while node_value[0] == '"' and node_value[-1] == '"': + while len(node_value) >= 3 and node_value[0] == '"' and node_value[-1] == '"': node_value = node_value[1:-1] - while node_value[0] == "'" and node_value[-1] == "'": + while len(node_value) >= 3 and node_value[0] == "'" and node_value[-1] == "'": node_value = node_value[1:-1] if current_process_node_id != node_id: From bf2c9a5cd6730ce171e21471b4cae41e71b6ce52 Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Tue, 12 May 2020 17:25:45 -0700 Subject: [PATCH 169/278] Fix type case in typing. 
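
The annotation used "typing.list", which does not exist; because local
variable annotations are not evaluated at runtime, the mistake only
surfaces under mypy.  A minimal illustration (hypothetical, standalone):

    import typing

    row: typing.List[str]    # accepted by mypy and at runtime
    # row: typing.list[str]  # flagged by mypy: typing has no "list"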
---
 kgtk/join/kgtkjoiner.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/kgtk/join/kgtkjoiner.py b/kgtk/join/kgtkjoiner.py
index f51e0eba7..c8416e10e 100644
--- a/kgtk/join/kgtkjoiner.py
+++ b/kgtk/join/kgtkjoiner.py
@@ -8,17 +8,13 @@
 from argparse import ArgumentParser
 import attr
-import gzip
 from pathlib import Path
-from multiprocessing import Queue
 import sys
 import typing
 
 from kgtk.kgtkformat import KgtkFormat
 from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
 from kgtk.io.kgtkwriter import KgtkWriter
-from kgtk.utils.enumnameaction import EnumNameAction
-from kgtk.utils.validationaction import ValidationAction
 from kgtk.value.kgtkvalueoptions import KgtkValueOptions
 
 @attr.s(slots=True, frozen=True)
 class KgtkJoiner(KgtkFormat):
@@ -325,7 +321,7 @@ def process(self):
 
         if self.verbose:
             print("Processing the left input file: %s" % str(self.left_file_path), file=self.error_file, flush=True)
-        row: typing.list[str]
+        row: typing.List[str]
         for row in left_kr:
             left_data_lines_read += 1
             if joined_key_set is None:

From 961240ade10e09100d7dfb40338897549c5434ed53 Mon Sep 17 00:00:00 2001
From: Craig Milo Rogers
Date: Tue, 12 May 2020 17:25:55 -0700
Subject: [PATCH 170/278] Initial cat support.

---
 kgtk/join/kgtkcat.py          | 185 ++++++++++++++++++++++++++++++++++
 kgtk/join/kgtkmergecolumns.py |  86 ++++++++++++++++
 2 files changed, 271 insertions(+)
 create mode 100644 kgtk/join/kgtkcat.py
 create mode 100644 kgtk/join/kgtkmergecolumns.py

diff --git a/kgtk/join/kgtkcat.py b/kgtk/join/kgtkcat.py
new file mode 100644
index 000000000..5c2c6bf56
--- /dev/null
+++ b/kgtk/join/kgtkcat.py
@@ -0,0 +1,185 @@
+"""
+Cat multiple KGTK files together.
+
+"""
+
+from argparse import ArgumentParser
+import attr
+from pathlib import Path
+import sys
+import typing
+
+from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
+from kgtk.io.kgtkwriter import KgtkWriter
+from kgtk.join.kgtkmergecolumns import KgtkMergeColumns
+from kgtk.value.kgtkvalueoptions import KgtkValueOptions
+
+@attr.s(slots=True, frozen=True)
+class KgtkCat():
+    input_file_paths: typing.List[Path] = attr.ib()
+    output_path: typing.Optional[Path] = attr.ib(validator=attr.validators.optional(attr.validators.instance_of(Path)))
+
+    # TODO: find working validators:
+    reader_options: typing.Optional[KgtkReaderOptions] = attr.ib(default=None)
+    # value_options: typing.Optional[KgtkValueOptions] = attr.ib(attr.validators.optional(attr.validators.instance_of(KgtkValueOptions)), default=None)
+    value_options: typing.Optional[KgtkValueOptions] = attr.ib(default=None)
+
+    error_file: typing.TextIO = attr.ib(default=sys.stderr)
+    verbose: bool = attr.ib(validator=attr.validators.instance_of(bool), default=False)
+    very_verbose: bool = attr.ib(validator=attr.validators.instance_of(bool), default=False)
+
+    def process(self):
+        kmc: KgtkMergeColumns = KgtkMergeColumns()
+
+        # Is the output file an edge file, a node file, or unknown?
+        is_edge_file: bool = False
+        is_node_file: bool = False
+
+        krs: typing.List[KgtkReader] = [ ]
+        kr: KgtkReader
+        idx: int
+
+        if self.verbose:
+            print("Opening the %d input files." % len(self.input_file_paths), file=self.error_file, flush=True)
% len(self.input_file_paths), file=self.error_file, flush=True) + + saw_stdin: bool = False + input_file_path: Path + for idx, input_file_path in enumerate(self.input_file_paths): + if str(input_file_path) == "-": + if saw_stdin: + raise ValueError("Duplicate standard input file %d" % idx + 1) + else: + saw_stdin = False + if self.verbose: + print("Opening file %d: standard input" % idx + 1, file=self.error_file, flush=True) + else: + if self.verbose: + print("Opening file %d: %s" % (idx + 1, str(input_file_path)), file=self.error_file, flush=True) + + kr = KgtkReader.open(input_file_path, + options=self.reader_options, + value_options = self.value_options, + error_file=self.error_file, + verbose=self.verbose, + very_verbose=self.very_verbose, + ) + krs.append(kr) + + # Unless directed otherwise, do not merge edge files with node + # files. If options.mode == KgtkReaderMode.NONE, then neither + # kr.is_edge_file nor kr.is_node_file will be set and the + # consistency check will be skipped. + if kr.is_edge_file: + if is_node_file: + raise ValueError("Cannot merge an edge file to a node file: %s" % input_file_path) + if is_edge_file == False and self.verbose: + print("The output file will be an edge file.") + is_edge_file = True + elif kr.is_node_file: + if is_edge_file: + raise ValueError("Cannot merge a node file to an edge file: %s" % input_file_path) + if is_node_file == False and self.verbose: + print("The output file will be an node file.") + is_node_file = True + + if self.verbose or self.very_verbose: + print("Mapping the %d column names in %s." % (len(kr.column_names), input_file_path), file=self.error_file, flush=True) + if self.very_verbose: + print(" ".join(kr.column_names)) + new_column_names: typing.List[str] = kmc.merge(kr) + if self.very_verbose: + print(" ".join(new_column_names)) + + if self.verbose or self.very_verbose: + print("There are %d merged columns." % len(kmc.column_names)) + if self.very_verbose: + print(" ".join(self.column_names)) + + output_mode: KgtkWriter.Mode = KgtkWriter.Mode.NONE + if is_edge_file: + output_mode = KgtkWriter.Mode.EDGE + if self.verbose: + print("Opening the output edge file: %s" % str(self.output_path), file=self.error_file, flush=True) + elif is_node_file: + output_mode = KgtkWriter.Mode.NODE + if self.verbose: + print("Opening the output node file: %s" % str(self.output_path), file=self.error_file, flush=True) + else: + if self.verbose: + print("Opening the output file: %s" % str(self.output_path), file=self.error_file, flush=True) + + ew: KgtkWriter = KgtkWriter.open(kmc.column_names, + self.output_path, + require_all_columns=False, + prohibit_extra_columns=True, + fill_missing_columns=True, + gzip_in_parallel=False, + mode=output_mode, + verbose=self.verbose, + very_verbose=self.very_verbose) + + output_data_lines: int = 0 + for idx, kr in enumerate(krs): + if kr.file_path is None: + # This shouldn't happen because we constrined all + # input_file_path elements to be not None. However, + # checking here keeps mypy happy. + # + # TODO: throw a better exception. 
+ raise ValueError("Missing file path.") + input_file_path = self.file_path + if self.verbose: + print("Copying data from file %d: %s" % (idx + 1, input_file_path)) + + shuffle_list: typing.List[int] = ew.build_shuffle_list(kmc.new_column_name_lists[idx]) + + input_data_lines: int = 0 + row: typing.List[str] + for row in kr: + input_data_lines += 1 + output_data_lines += 1 + ew.write(row, shuffle_list=shuffle_list) + + # Flush the output file so far: + ew.flush() + + if self.verbose: + print("Read %d data lines from file %d: %s" % (input_data_lines, idx + 1, input_file_path)) + + ew.close() + if self.verbose: + print("Wrote %d lines total from %d files" % (output_data_lines, len(krs))) + +def main(): + """ + Test the KGTK file concatenator. + """ + parser = ArgumentParser() + parser.add_argument(dest="input_file_paths", help="The KGTK files to concatenate", type=Path, nargs='+') + parser.add_argument("-o", "--output-file", dest="output_file_path", help="The KGTK file to read", type=Path, default=None) + + KgtkReader.add_debug_arguments(parser, expert=True) + KgtkReaderOptions.add_arguments(parser, mode_options=True, expert=True) + KgtkValueOptions.add_arguments(parser, expert=True) + + args = parser.parse_args() + + error_file: typing.TextIO = sys.stdout if args.errors_to_stdout else sys.stderr + + # Build the option structures. + reader_options: KgtkReaderOptions = KgtkReaderOptions.from_args(args) + value_options: KgtkValueOptions = KgtkValueOptions.from_args(args) + + ec: KgtkCat = KgtkCatr(input_file_paths=args.input_file_paths, + output_path=args.output_file_path, + reader_options=reader_options, + value_options=value_options, + error_file=error_file, + verbose=args.verbose, + very_verbose=args.very_verbose) + + ej.process() + +if __name__ == "__main__": + main() + diff --git a/kgtk/join/kgtkmergecolumns.py b/kgtk/join/kgtkmergecolumns.py new file mode 100644 index 000000000..955e947f6 --- /dev/null +++ b/kgtk/join/kgtkmergecolumns.py @@ -0,0 +1,86 @@ +import attr +import typing + + +from kgtk.io.kgtkreader import KgtkReader + +@attr.s(slots=True, frozen=False) +class KgtkMergeColumns: + """Merge columns from multiple KgtkReaders, respecting predefined column + names with aliases. + + """ + # For attrs 19.1.0 and later: + column_names: typing.List[str] = attr.ib(validator=attr.validators.deep_iterable(member_validator=attr.validators.instance_of(str), + iterable_validator=attr.validators.instance_of(list)), + factory=list) + + # Keep a record of the reserved columns with aliases as we encounter them. + # We will retain the first alias encountered of each group. + id_column_idx: int = attr.ib(validator=attr.validators.instance_of(int), default=-1) + node1_column_idx: int = attr.ib(validator=attr.validators.instance_of(int), default=-1) + label_column_idx: int = attr.ib(validator=attr.validators.instance_of(int), default=-1) + node2_column_idx: int = attr.ib(validator=attr.validators.instance_of(int), default=-1) + + # The column name map is a debugging convenience. It is not required for + # the merge algorithm. + column_name_map: typing.MutableMapping[str, int] = attr.ib(validator=attr.validators.deep_mapping(key_validator=attr.validators.instance_of(str), + value_validator=attr.validators.instance_of(int)), + factory=dict) + + # Maintain a list of the old and new column name lists as a convenience + # for debugging and feedback. 
+ old_column_name_lists: typing.List[typing.List[str]] = attr.ib(factory=list) + new_column_name_lists: typing.List[typing.List[str]] = attr.ib(factory=list) + + def merge(self, kr: KgtkReader): + """ + Add the columns from a KgtkReader into the merged column list, + respecting predefined column names with aliases. + + Return a list of new column names for the KgtkReader, with + predefined names replaced with the name first used in the + joint list of column names. + """ + new_column_names: typing.List[str] = [ ] + + # Record the old column names for debugging. + self.old_column_name_lists.append(kr.column_names) + + column_name: str + idx: int = 0 + for idx, column_name in enumerate(kr.column_names): + if idx == kr.id_column_idx: + if self.id_column_idx >= 0: + column_name = self.column_names[self.id_column_idx] + else: + self.idx_column_idx = len(self.column_names) + + elif idx == kr.node1_column_idx: + if self.node1_column_idx >= 0: + column_name = self.column_names[self.node1_column_idx] + else: + self.node1_column_idx = len(self.column_names) + + elif idx == kr.label_column_idx: + if self.label_column_idx >= 0: + column_name = self.column_names[self.labelcolumn_idx] + else: + self.label_column_idx = len(self.column_names) + + elif idx == kr.node2_column_idx: + if self.node2_column_idx >= 0: + column_name = self.column_names[self.node2_column_idx] + else: + self.node2_column_idx = len(self.column_names) + + new_column_names.append(column_name) + if column_name not in self.column_name_map: + self.column_name_map[column_name] = len(self.column_names) + self.column_names.append(column_name) + + self.new_column_name_lists.append(new_column_names) + return new_column_names + + + From c484a244b0d0f14f241f83e94fbd80caa360f278 Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Tue, 12 May 2020 17:26:49 -0700 Subject: [PATCH 171/278] Fix typo. --- kgtk/join/kgtkcat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kgtk/join/kgtkcat.py b/kgtk/join/kgtkcat.py index 5c2c6bf56..282dc6163 100644 --- a/kgtk/join/kgtkcat.py +++ b/kgtk/join/kgtkcat.py @@ -9,7 +9,7 @@ import sys import typing -1>from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions +from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions from kgtk.io.kgtkwriter import KgtkWriter from kgtk.join.kgtkmergecolumns import KgtkMergeColumns from kgtk.value.kgtkvalueoptions import KgtkValueOptions From 5c9eafb50257d23e687bb48ca925550ff0be320b Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Tue, 12 May 2020 17:33:51 -0700 Subject: [PATCH 172/278] Fixed various bugs. 
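
Most of these are attribute-name typos (KgtkCatr for KgtkCat, self.file_path
for kr.file_path, self.labelcolumn_idx for self.label_column_idx). Worth
noting: because these classes are declared with @attr.s(slots=True), a
misspelled attribute assignment raises an immediate AttributeError instead of
silently creating a new attribute. A minimal demo (the Example class below is
made up for illustration, not KGTK code):

    import attr

    @attr.s(slots=True)
    class Example:
        label_column_idx = attr.ib(default=-1)

    e = Example()
    e.label_column_idx = 3     # declared attribute: fine
    try:
        e.labelcolumn_idx = 3  # typo: slots=True rejects the unknown name
    except AttributeError as err:
        print(err)
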
--- kgtk/join/kgtkcat.py | 39 ++++++++++++++++++----------------- kgtk/join/kgtkmergecolumns.py | 2 +- 2 files changed, 21 insertions(+), 20 deletions(-) diff --git a/kgtk/join/kgtkcat.py b/kgtk/join/kgtkcat.py index 282dc6163..11e7e521b 100644 --- a/kgtk/join/kgtkcat.py +++ b/kgtk/join/kgtkcat.py @@ -73,27 +73,27 @@ def process(self): if is_node_file: raise ValueError("Cannot merge an edge file to a node file: %s" % input_file_path) if is_edge_file == False and self.verbose: - print("The output file will be an edge file.") + print("The output file will be an edge file.", file=self.error_file, flush=True) is_edge_file = True elif kr.is_node_file: if is_edge_file: raise ValueError("Cannot merge a node file to an edge file: %s" % input_file_path) if is_node_file == False and self.verbose: - print("The output file will be an node file.") + print("The output file will be an node file.", file=self.error_file, flush=True) is_node_file = True if self.verbose or self.very_verbose: print("Mapping the %d column names in %s." % (len(kr.column_names), input_file_path), file=self.error_file, flush=True) if self.very_verbose: - print(" ".join(kr.column_names)) + print(" ".join(kr.column_names), file=self.error_file, flush=True) new_column_names: typing.List[str] = kmc.merge(kr) if self.very_verbose: - print(" ".join(new_column_names)) + print(" ".join(new_column_names), file=self.error_file, flush=True) if self.verbose or self.very_verbose: - print("There are %d merged columns." % len(kmc.column_names)) + print("There are %d merged columns." % len(kmc.column_names), file=self.error_file, flush=True) if self.very_verbose: - print(" ".join(self.column_names)) + print(" ".join(self.column_names), file=self.error_file, flush=True) output_mode: KgtkWriter.Mode = KgtkWriter.Mode.NONE if is_edge_file: @@ -127,9 +127,9 @@ def process(self): # # TODO: throw a better exception. 
raise ValueError("Missing file path.") - input_file_path = self.file_path + input_file_path = kr.file_path if self.verbose: - print("Copying data from file %d: %s" % (idx + 1, input_file_path)) + print("Copying data from file %d: %s" % (idx + 1, input_file_path), file=self.error_file, flush=True) shuffle_list: typing.List[int] = ew.build_shuffle_list(kmc.new_column_name_lists[idx]) @@ -146,9 +146,10 @@ def process(self): if self.verbose: print("Read %d data lines from file %d: %s" % (input_data_lines, idx + 1, input_file_path)) - ew.close() if self.verbose: - print("Wrote %d lines total from %d files" % (output_data_lines, len(krs))) + print("Wrote %d lines total from %d files" % (output_data_lines, len(krs)), file=self.error_file, flush=True) + + ew.close() def main(): """ @@ -156,7 +157,7 @@ def main(): """ parser = ArgumentParser() parser.add_argument(dest="input_file_paths", help="The KGTK files to concatenate", type=Path, nargs='+') - parser.add_argument("-o", "--output-file", dest="output_file_path", help="The KGTK file to read", type=Path, default=None) + parser.add_argument("-o", "--output-file", dest="output_file_path", help="The KGTK file to read (default=%(default)s)", type=Path, default="-") KgtkReader.add_debug_arguments(parser, expert=True) KgtkReaderOptions.add_arguments(parser, mode_options=True, expert=True) @@ -170,15 +171,15 @@ def main(): reader_options: KgtkReaderOptions = KgtkReaderOptions.from_args(args) value_options: KgtkValueOptions = KgtkValueOptions.from_args(args) - ec: KgtkCat = KgtkCatr(input_file_paths=args.input_file_paths, - output_path=args.output_file_path, - reader_options=reader_options, - value_options=value_options, - error_file=error_file, - verbose=args.verbose, - very_verbose=args.very_verbose) + ec: KgtkCat = KgtkCat(input_file_paths=args.input_file_paths, + output_path=args.output_file_path, + reader_options=reader_options, + value_options=value_options, + error_file=error_file, + verbose=args.verbose, + very_verbose=args.very_verbose) - ej.process() + ec.process() if __name__ == "__main__": main() diff --git a/kgtk/join/kgtkmergecolumns.py b/kgtk/join/kgtkmergecolumns.py index 955e947f6..a5a75d4b5 100644 --- a/kgtk/join/kgtkmergecolumns.py +++ b/kgtk/join/kgtkmergecolumns.py @@ -64,7 +64,7 @@ def merge(self, kr: KgtkReader): elif idx == kr.label_column_idx: if self.label_column_idx >= 0: - column_name = self.column_names[self.labelcolumn_idx] + column_name = self.column_names[self.label_column_idx] else: self.label_column_idx = len(self.column_names) From 3386ff350aeee797879031a0154fdce16cee133b Mon Sep 17 00:00:00 2001 From: ckxz105 Date: Tue, 12 May 2020 18:17:11 -0700 Subject: [PATCH 173/278] code improvement --- kgtk/gt/embedding_utils.py | 114 ++++++++++++++++++------------------- 1 file changed, 54 insertions(+), 60 deletions(-) diff --git a/kgtk/gt/embedding_utils.py b/kgtk/gt/embedding_utils.py index a4bfda51a..f76c76801 100644 --- a/kgtk/gt/embedding_utils.py +++ b/kgtk/gt/embedding_utils.py @@ -597,35 +597,26 @@ def dump_vectors(self, file_name, type_=None): _ = f.write("\n") def print_vector(self, vectors, output_properties: str = "text_embedding", output_format="kgtk_format"): + self._logger.debug("START printing the vectors") if output_format == "kgtk_format": print("node\tproperty\tvalue\n", end="") - if self.input_format == "kgtk_format": - for i, each_vector in enumerate(vectors): - print(str(list(self.candidates.keys())[i]) + "\t", end="") - print(output_properties + "\t", end="") - for j, each_dimension in 
enumerate(each_vector): - if j != len(each_vector) - 1: - print(str(each_dimension) + ",", end="") - else: - print(str(each_dimension) + "\n", end="") - elif self.input_format == "test_format": - all_nodes = list(self.vectors_map.keys()) - for i, each_vector in enumerate(vectors): - print(all_nodes[i] + "\t", end="") - print(output_properties + "\t", end="") - for j, each_dimension in enumerate(each_vector): - if j != len(each_vector) - 1: - print(str(each_dimension) + ",", end="") - else: - print(str(each_dimension) + "\n", end="") + all_nodes = list(self.vectors_map.keys()) + ten_percent_len = math.ceil(len(vectors) / 10) + for i, each_vector in enumerate(vectors): + if i % ten_percent_len == 0: + percent = i / ten_percent_len * 10 + self._logger.debug("Finished {}%".format(percent)) + print("{}\t{}\t".format(all_nodes[i], output_properties), end="") + for each_dimension in each_vector[:-1]: + print(str(each_dimension) + ",", end="") + print(str(each_vector[-1])) elif output_format == "tsv_format": for each_vector in vectors: - for i, each_dimension in enumerate(each_vector): - if i != len(each_vector) - 1: - print(str(each_dimension) + "\t", end="") - else: - print(str(each_dimension) + "\n", end="") + for each_dimension in each_vector[:-1]: + print(str(each_dimension) + "\t", end="") + print(str(each_vector[-1])) + self._logger.debug("END printing the vectors") def plot_result(self, output_properties: dict, input_format="kgtk_format", output_uri: str = "", output_format="kgtk_format", @@ -655,48 +646,51 @@ def plot_result(self, output_properties: dict, input_format="kgtk_format", else: raise KGTKException("Unknown or unsupport dimensional reduction type: {}".format(dimensional_reduction)) - if input_format == "test_format": - gt_indexes = set() - vector_map_keys = list(self.vectors_map.keys()) - for each_node in self.gt_nodes: - gt_indexes.add(vector_map_keys.index(each_node)) - - self.metadata.append("Q_nodes\tType\tLabel\tDescription") - for i, each in enumerate(self.vectors_map.keys()): - label = self.node_labels[each] - description = self.candidates[each]["sentence"] - if i in gt_indexes: - self.metadata.append("{}\tground_truth_node\t{}\t{}".format(each, label, description)) + if output_uri not in {"", "none"}: + if not os.path.exists(output_uri): + raise ValueError("The given metadata output folder does not exist!") + + metadata_output_path = os.path.join(output_uri, self.vector_dump_file.split("/")[-1]) + if input_format == "test_format": + gt_indexes = set() + vector_map_keys = list(self.vectors_map.keys()) + for each_node in self.gt_nodes: + gt_indexes.add(vector_map_keys.index(each_node)) + + self.metadata.append("Q_nodes\tType\tLabel\tDescription") + for i, each in enumerate(self.vectors_map.keys()): + label = self.node_labels[each] + description = self.candidates[each]["sentence"] + if i in gt_indexes: + self.metadata.append("{}\tground_truth_node\t{}\t{}".format(each, label, description)) + else: + self.metadata.append("{}\tcandidates\t{}\t{}".format(each, label, description)) + self.gt_indexes = gt_indexes + + elif input_format == "kgtk_format": + if len(output_properties.get("metadata_properties", [])) == 0: + for k, v in self.candidates.items(): + label = v.get("label_properties", "") + if len(label) > 0 and isinstance(label, list): + label = label[0] + description = v.get("description_properties", "") + if len(description) > 0 and isinstance(description, list): + description = description[0] + self.metadata.append("{}\t\t{}\t{}".format(k, label, description)) else: - 
self.metadata.append("{}\tcandidates\t{}\t{}".format(each, label, description)) - self.gt_indexes = gt_indexes + required_properties = output_properties["metadata_properties"] + self.metadata.append("node\t" + "\t".join(required_properties)) + for k, v in self.candidates.items(): + each_metadata = k + "\t" + for each in required_properties: + each_metadata += v.get(each, " ") + "\t" + self.metadata.append(each_metadata) + self.dump_vectors(metadata_output_path, "metadata") - elif input_format == "kgtk_format": - if len(output_properties.get("metadata_properties", [])) == 0: - for k, v in self.candidates.items(): - label = v.get("label_properties", "") - if len(label) > 0 and isinstance(label, list): - label = label[0] - description = v.get("description_properties", "") - if len(description) > 0 and isinstance(description, list): - description = description[0] - self.metadata.append("{}\t\t{}\t{}".format(k, label, description)) - else: - required_properties = output_properties["metadata_properties"] - self.metadata.append("node\t" + "\t".join(required_properties)) - for k, v in self.candidates.items(): - each_metadata = k + "\t" - for each in required_properties: - each_metadata += v.get(each, " ") + "\t" - self.metadata.append(each_metadata) - - metadata_output_path = os.path.join(output_uri, self.vector_dump_file.split("/")[-1]) if self.vectors_2D is not None: self.print_vector(self.vectors_2D, output_properties.get("output_properties"), output_format) else: self.print_vector(vectors, output_properties.get("output_properties"), output_format) - if output_uri != "none": - self.dump_vectors(metadata_output_path, "metadata") def evaluate_result(self): """ From 8c1660e85ef0ac73f7fe507194f5106a5a49dd39 Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Tue, 12 May 2020 21:25:02 -0700 Subject: [PATCH 174/278] Decouple KgtkMergeColumns from KgtkReader. --- kgtk/join/kgtkcat.py | 2 +- kgtk/join/kgtkjoiner.py | 38 +++++------------------------------ kgtk/join/kgtkmergecolumns.py | 31 +++++++++++++++------------- 3 files changed, 23 insertions(+), 48 deletions(-) diff --git a/kgtk/join/kgtkcat.py b/kgtk/join/kgtkcat.py index 11e7e521b..537d54601 100644 --- a/kgtk/join/kgtkcat.py +++ b/kgtk/join/kgtkcat.py @@ -86,7 +86,7 @@ def process(self): print("Mapping the %d column names in %s." % (len(kr.column_names), input_file_path), file=self.error_file, flush=True) if self.very_verbose: print(" ".join(kr.column_names), file=self.error_file, flush=True) - new_column_names: typing.List[str] = kmc.merge(kr) + new_column_names: typing.List[str] = kmc.merge(kr.column_names) if self.very_verbose: print(" ".join(new_column_names), file=self.error_file, flush=True) diff --git a/kgtk/join/kgtkjoiner.py b/kgtk/join/kgtkjoiner.py index c8416e10e..c949fdfa2 100644 --- a/kgtk/join/kgtkjoiner.py +++ b/kgtk/join/kgtkjoiner.py @@ -15,6 +15,7 @@ from kgtk.kgtkformat import KgtkFormat from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions from kgtk.io.kgtkwriter import KgtkWriter +from kgtk.join.kgtkmergecolumns import KgtkMergeColumns from kgtk.value.kgtkvalueoptions import KgtkValueOptions @attr.s(slots=True, frozen=True) @@ -213,39 +214,10 @@ def join_key_sets(self, left_join_idx_list: typing.List[int], right_join_idx_lis return join_key_set def merge_columns(self, left_kr: KgtkReader, right_kr: KgtkReader)->typing.Tuple[typing.List[str], typing.List[str]]: - joined_column_names: typing.List[str] = [ ] - right_column_names: typing.List[str] = [ ] - - # First step: copy the left column names. 
- column_name: str - for column_name in left_kr.column_names: - joined_column_names.append(column_name) - - idx: int = 0 - for column_name in right_kr.column_names: - if idx == right_kr.id_column_idx and left_kr.id_column_idx >= 0: - # Map the id columns to the name used in the left file. - column_name = left_kr.column_names[left_kr.id_column_idx] - elif idx == right_kr.node1_column_idx and left_kr.node1_column_idx >= 0: - # Map the node1 columns to the name used in the left file, - column_name = left_kr.column_names[left_kr.node1_column_idx] - elif idx == right_kr.label_column_idx and left_kr.label_column_idx >= 0: - # Map the right file's label column to the left file's label column. - column_name = left_kr.column_names[left_kr.label_column_idx] - elif idx == right_kr.node2_column_idx and left_kr.node2_column_idx >= 0: - # Map the right file's node2 column to the left file's node2 column. - column_name = left_kr.column_names[left_kr.node2_column_idx] - else: - # Apply the prefix. - if self.prefix is not None and len(self.prefix) > 0: - column_name = self.prefix + column_name - - right_column_names.append(column_name) - if column_name not in joined_column_names: - joined_column_names.append(column_name) - idx += 1 - - return (joined_column_names, right_column_names) + kmc: KgtkMergeColumns = KgtkMergeColumns() + kmc.merge(left_kr.column_names) + right_column_names: typing.List[str] = kmc.merge(right_kr.column_names, prefix=self.prefix) + return (kmc.column_names, right_column_names) def process(self): if self.verbose: diff --git a/kgtk/join/kgtkmergecolumns.py b/kgtk/join/kgtkmergecolumns.py index a5a75d4b5..d2d4eef5d 100644 --- a/kgtk/join/kgtkmergecolumns.py +++ b/kgtk/join/kgtkmergecolumns.py @@ -2,7 +2,7 @@ import typing -from kgtk.io.kgtkreader import KgtkReader +from kgtk.kgtkformat import KgtkFormat @attr.s(slots=True, frozen=False) class KgtkMergeColumns: @@ -33,46 +33,49 @@ class KgtkMergeColumns: old_column_name_lists: typing.List[typing.List[str]] = attr.ib(factory=list) new_column_name_lists: typing.List[typing.List[str]] = attr.ib(factory=list) - def merge(self, kr: KgtkReader): - """ - Add the columns from a KgtkReader into the merged column list, - respecting predefined column names with aliases. + def merge(self, column_names: typing.List[str], prefix: typing.Optional[str]=None): + """Add column names into the merged column name list, respecting predefined + column names with aliases. + + Return a list of new column names with predefined name aliases replaced with + the name first used in each alias group in the joint list of column names. - Return a list of new column names for the KgtkReader, with - predefined names replaced with the name first used in the - joint list of column names. """ new_column_names: typing.List[str] = [ ] # Record the old column names for debugging. 
- self.old_column_name_lists.append(kr.column_names) + self.old_column_name_lists.append(column_names.copy()) column_name: str idx: int = 0 - for idx, column_name in enumerate(kr.column_names): - if idx == kr.id_column_idx: + for idx, column_name in enumerate(column_names): + if column_name in KgtkFormat.ID_COLUMN_NAMES: if self.id_column_idx >= 0: column_name = self.column_names[self.id_column_idx] else: self.idx_column_idx = len(self.column_names) - elif idx == kr.node1_column_idx: + elif column_name in KgtkFormat.NODE1_COLUMN_NAMES: if self.node1_column_idx >= 0: column_name = self.column_names[self.node1_column_idx] else: self.node1_column_idx = len(self.column_names) - elif idx == kr.label_column_idx: + elif column_name in KgtkFormat.LABEL_COLUMN_NAMES: if self.label_column_idx >= 0: column_name = self.column_names[self.label_column_idx] else: self.label_column_idx = len(self.column_names) - elif idx == kr.node2_column_idx: + elif column_name in KgtkFormat.NODE2_COLUMN_NAMES: if self.node2_column_idx >= 0: column_name = self.column_names[self.node2_column_idx] else: self.node2_column_idx = len(self.column_names) + else: + # Apply the optional prefix. + if prefix is not None and len(prefix) > 0: + column_name = prefix + column_name new_column_names.append(column_name) if column_name not in self.column_name_map: From 374ce947ec703d043097344e6b64c1d9a48077b1 Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Wed, 13 May 2020 00:23:17 -0700 Subject: [PATCH 175/278] Move merge_columns inline. --- kgtk/join/kgtkjoiner.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/kgtk/join/kgtkjoiner.py b/kgtk/join/kgtkjoiner.py index c949fdfa2..332daf235 100644 --- a/kgtk/join/kgtkjoiner.py +++ b/kgtk/join/kgtkjoiner.py @@ -213,12 +213,6 @@ def join_key_sets(self, left_join_idx_list: typing.List[int], right_join_idx_lis print("There are %d keys in the inner join key set." % len(join_key_set), file=self.error_file, flush=True) return join_key_set - def merge_columns(self, left_kr: KgtkReader, right_kr: KgtkReader)->typing.Tuple[typing.List[str], typing.List[str]]: - kmc: KgtkMergeColumns = KgtkMergeColumns() - kmc.merge(left_kr.column_names) - right_column_names: typing.List[str] = kmc.merge(right_kr.column_names, prefix=self.prefix) - return (kmc.column_names, right_column_names) - def process(self): if self.verbose: print("Opening the left edge file: %s" % str(self.left_file_path), file=self.error_file, flush=True) @@ -264,9 +258,10 @@ def process(self): if self.verbose: print("Mapping the column names for the join.", file=self.error_file, flush=True) - joined_column_names: typing.List[str] - right_column_names: typing.List[str] - (joined_column_names, right_column_names) = self.merge_columns(left_kr, right_kr) + kmc: KgtkMergeColumns = KgtkMergeColumns() + kmc.merge(left_kr.column_names) + right_column_names: typing.List[str] = kmc.merge(right_kr.column_names, prefix=self.prefix) + joined_column_names: typing.List[str] = kmc.column_names if self.verbose: print(" left columns: %s" % " ".join(left_kr.column_names), file=self.error_file, flush=True) From c632deac5a5dbebb283926d0be357cb91f657a68 Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Wed, 13 May 2020 00:51:50 -0700 Subject: [PATCH 176/278] Add a kgtk join command. Add more defaults. 
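
The new command exposes four join modes through the --left-join and
--right-join flags. Their key-set semantics, as a runnable sketch (the Q-node
keys below are made up for illustration; this is not KGTK code):

    left_keys = {"Q1", "Q2", "Q3"}
    right_keys = {"Q2", "Q3", "Q4"}

    inner = left_keys & right_keys        # neither flag
    left_outer = left_keys                # --left-join: keep every left row
    right_outer = right_keys              # --right-join: keep every right row
    full_outer = left_keys | right_keys   # both flags (equivalent to cat)

    print(sorted(inner), sorted(full_outer))
    # ['Q2', 'Q3'] ['Q1', 'Q2', 'Q3', 'Q4']
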
--- kgtk/cli/join.py | 143 ++++++++++++++++++++++++++++++++++++++++ kgtk/join/ifexists.py | 2 +- kgtk/join/kgtkcat.py | 2 +- kgtk/join/kgtkjoiner.py | 4 +- 4 files changed, 147 insertions(+), 4 deletions(-) create mode 100644 kgtk/cli/join.py diff --git a/kgtk/cli/join.py b/kgtk/cli/join.py new file mode 100644 index 000000000..20c585e24 --- /dev/null +++ b/kgtk/cli/join.py @@ -0,0 +1,143 @@ +""" +Join two KGTK edge files or two KGTK node files. + +TODO: Need KgtkWriterOptions +""" + +from argparse import Namespace, SUPPRESS +from pathlib import Path +import sys +import typing + +from kgtk.cli_argparse import KGTKArgumentParser +from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions +from kgtk.io.kgtkwriter import KgtkWriter +from kgtk.join.kgtkjoiner import KgtkJoiner +from kgtk.value.kgtkvalueoptions import KgtkValueOptions + +def parser(): + return { + 'help': 'Join two KGTK files', + 'description': 'Join two KGTK edge files or two KGTK node files.' + } + + +def add_arguments_extended(parser: KGTKArgumentParser, parsed_shared_args: Namespace): + """ + Parse arguments + Args: + parser (argparse.ArgumentParser) + """ + + _expert: bool = parsed_shared_args._expert + + # This helper function makes it easy to suppress options from + # The help message. The options are still there, and initialize + # what they need to initialize. + def h(msg: str)->str: + if _expert: + return msg + else: + return SUPPRESS + + parser.add_argument( "left_file_path", help="The left-side KGTK file to join. Use '-' for stdin (default=%(default)s).", type=Path, default="-") + + parser.add_argument( "right_file_path", help="The right-side KGTK file to join (no default).", type=Path, default="-") + + parser.add_argument( "--join-on-label", dest="join_on_label", + help="If both input files are edge files, include the label column in the join (default=%(default)s).", + action='store_true') + + parser.add_argument( "--join-on-node2", dest="join_on_node2", + help="If both input files are edge files, include the node2 column in the join (default=%(default)s).", + action='store_true') + + parser.add_argument( "--left-file-join-columns", dest="left_join_columns", help="Left file join columns (default=None).", nargs='+') + + parser.add_argument( "--left-join", dest="left_join", help="Perform a left outer join (default=%(default)s).", action='store_true') + + parser.add_argument("-o", "--output-file", dest="output_file_path", help="The KGTK file to write (default=%(default)s).", type=Path, default="-") + + parser.add_argument( "--prefix", dest="prefix", + help="An optional prefix applied to right file column names in the output file (default=None).") + + parser.add_argument( "--right-file-join-columns", dest="right_join_columns", help="Right file join columns (default=None).", nargs='+') + + parser.add_argument( "--right-join", dest="right_join", help="Perform a right outer join (default=%(default)s).", action='store_true') + + # This argument is retained for compatability with earlier versions of this command. 
+ parser.add_argument( "--error-limit", dest="error_limit", + help=h("The maximum number of errors per input fule (default=%(default)s)"), + default=KgtkReaderOptions.ERROR_LIMIT_DEFAULT) + + parser.add_argument( "--field-separator", dest="field_separator", + help=h("Separator for multifield keys (default=%(default)s)") + , default=KgtkJoiner.FIELD_SEPARATOR_DEFAULT) + + KgtkReader.add_debug_arguments(parser, expert=_expert) + KgtkReaderOptions.add_arguments(parser, mode_options=True, who="left", expert=_expert) + KgtkReaderOptions.add_arguments(parser, mode_options=True, who="right", expert=_expert) + KgtkValueOptions.add_arguments(parser, expert=_expert) + +def run(left_file_path: Path, + right_file_path: Path, + left_join: bool, + right_join: bool, + join_on_label: bool, + join_on_node2: bool, + left_join_columns: typing.Optional[typing.List[str]], + right_join_columns: typing.Optional[typing.List[str]], + output_file_path: Path, + prefix: typing.Optional[str] = None, + + field_separator: str = KgtkJoiner.FIELD_SEPARATOR_DEFAULT, + + errors_to_stdout: bool = False, + errors_to_stderr: bool = True, + verbose: bool = False, + very_verbose: bool = False, + + **kwargs # Whatever KgtkFileOptions and KgtkValueOptions want. +)->int: + # import modules locally + from kgtk.exceptions import KGTKException + + + # Select where to send error messages, defaulting to stderr. + error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr + + # Build the option structures. + left_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs, who="left", fallback=True) + right_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs, who="right", fallback=True) + value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs) + + try: + kr: KgtkJoiner = KgtkJoiner( + left_file_path=left_file_path, + right_file_path=right_file_path, + output_path=output_file_path, + left_join=left_join, + right_join=right_join, + join_on_label=join_on_label, + join_on_node2=join_on_node2, + left_join_columns=left_join_columns, + right_join_columns=right_join_columns, + prefix=prefix, + field_separator=field_separator, + left_reader_options=left_reader_options, + right_reader_options=right_reader_options, + value_options=value_options, + error_file=error_file, + verbose=verbose, + very_verbose=very_verbose, + ) + + kr.process() + + return 0 + + except SystemExit as e: + raise KGTKException("Exit requested") + except Exception as e: + raise KGTKException(str(e)) + diff --git a/kgtk/join/ifexists.py b/kgtk/join/ifexists.py index 9e5a788c0..26f1f1965 100644 --- a/kgtk/join/ifexists.py +++ b/kgtk/join/ifexists.py @@ -207,7 +207,7 @@ def main(): parser.add_argument( "--filter-on", dest="filter_file_path", help="The KGTK file with the filter data", type=Path, required=True) - parser.add_argument("-o", "--output-file", dest="output_file_path", help="The KGTK file to read", type=Path, default=None) + parser.add_argument("-o", "--output-file", dest="output_file_path", help="The KGTK file to write (default=%(default)s).", type=Path, default="-") parser.add_argument( "--field-separator", dest="field_separator", help="Separator for multifield keys", default=IfExists.FIELD_SEPARATOR_DEFAULT) diff --git a/kgtk/join/kgtkcat.py b/kgtk/join/kgtkcat.py index 537d54601..7861185d8 100644 --- a/kgtk/join/kgtkcat.py +++ b/kgtk/join/kgtkcat.py @@ -157,7 +157,7 @@ def main(): """ parser = ArgumentParser() parser.add_argument(dest="input_file_paths", help="The KGTK files to concatenate", type=Path, 
nargs='+') - parser.add_argument("-o", "--output-file", dest="output_file_path", help="The KGTK file to read (default=%(default)s)", type=Path, default="-") + parser.add_argument("-o", "--output-file", dest="output_file_path", help="The KGTK file to write (default=%(default)s)", type=Path, default="-") KgtkReader.add_debug_arguments(parser, expert=True) KgtkReaderOptions.add_arguments(parser, mode_options=True, expert=True) diff --git a/kgtk/join/kgtkjoiner.py b/kgtk/join/kgtkjoiner.py index 332daf235..5c2f70139 100644 --- a/kgtk/join/kgtkjoiner.py +++ b/kgtk/join/kgtkjoiner.py @@ -346,8 +346,8 @@ def main(): parser.add_argument( "--left-file-join-columns", dest="left_join_columns", help="Left file join columns.", nargs='+') parser.add_argument( "--left-join", dest="left_join", help="Perform a left outer join.", action='store_true') - parser.add_argument("-o", "--output-file", dest="output_file_path", help="The KGTK file to read", type=Path, default=None) - parser.add_argument( "--prefix", dest="prefix", help="The prefix applied to right file column names in the output file.") + parser.add_argument("-o", "--output-file", dest="output_file_path", help="The KGTK file to write", type=Path, default=None) + parser.add_argument( "--prefix", dest="prefix", help="An optional prefix applied to right file column names in the output file (default=None).") parser.add_argument( "--right-file-join-columns", dest="right_join_columns", help="Right file join columns.", nargs='+') parser.add_argument( "--right-join", dest="right_join", help="Perform a right outer join.", action='store_true') From 7cbbcb3c199bd12b95e0f9335e5358f3dc8120f2 Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Wed, 13 May 2020 01:04:09 -0700 Subject: [PATCH 177/278] Check for forbidden uses of stdin. --- kgtk/cli/join.py | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/kgtk/cli/join.py b/kgtk/cli/join.py index 20c585e24..1c6386e56 100644 --- a/kgtk/cli/join.py +++ b/kgtk/cli/join.py @@ -18,7 +18,7 @@ def parser(): return { 'help': 'Join two KGTK files', - 'description': 'Join two KGTK edge files or two KGTK node files.' + 'description': 'Join two KGTK edge files or two KGTK node files. ' } @@ -40,9 +40,9 @@ def h(msg: str)->str: else: return SUPPRESS - parser.add_argument( "left_file_path", help="The left-side KGTK file to join. Use '-' for stdin (default=%(default)s).", type=Path, default="-") + parser.add_argument( "left_file_path", help="The left-side KGTK file to join (no default).", type=Path, default=None) - parser.add_argument( "right_file_path", help="The right-side KGTK file to join (no default).", type=Path, default="-") + parser.add_argument( "right_file_path", help="The right-side KGTK file to join (no default).", type=Path, default=None) parser.add_argument( "--join-on-label", dest="join_on_label", help="If both input files are edge files, include the label column in the join (default=%(default)s).", @@ -79,8 +79,8 @@ def h(msg: str)->str: KgtkReaderOptions.add_arguments(parser, mode_options=True, who="right", expert=_expert) KgtkValueOptions.add_arguments(parser, expert=_expert) -def run(left_file_path: Path, - right_file_path: Path, +def run(left_file_path: typing.Optional[Path], + right_file_path: typing.Optional[Path], left_join: bool, right_join: bool, join_on_label: bool, @@ -106,6 +106,27 @@ def run(left_file_path: Path, # Select where to send error messages, defaulting to stderr. 
error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr + if not right_join: + if left_file_path is None or str(left_file_path) == "-": + print("The left file may not be stdin when an inner join or left join is requested.", file=error_file, flush=True) + return 1 + + if not left_join: + if right_file_path is None or str(right_file_path) == "-": + print("The right file may not be stdin when an inner join or right join is requested.", file=error_file, flush=True) + return 1 + + if (left_file_path is None or str(left_file_path) == "-") and (right_file_path is None or str(right_file_path) == "-"): + print("The left and right files may not both be stdin.", file=error_file, flush=True) + return 1 + + if left_file_path is None: + left_file_path = Path("-") + + if right_file_path is None: + right_file_path = Path("-") + + # Build the option structures. left_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs, who="left", fallback=True) right_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs, who="right", fallback=True) From fd39326c4dc1877d04b6b9d794aaeef17a12a75c Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Wed, 13 May 2020 01:05:24 -0700 Subject: [PATCH 178/278] Improve the description. --- kgtk/cli/join.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kgtk/cli/join.py b/kgtk/cli/join.py index 1c6386e56..10e754c58 100644 --- a/kgtk/cli/join.py +++ b/kgtk/cli/join.py @@ -18,7 +18,7 @@ def parser(): return { 'help': 'Join two KGTK files', - 'description': 'Join two KGTK edge files or two KGTK node files. ' + 'description': 'Join two KGTK edge files or two KGTK node files. Two passes may be needed, stdin may be forbidden.' } From 0613b4d3c6996fb32d4ddb79c0a6880869587120 Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Wed, 13 May 2020 01:14:26 -0700 Subject: [PATCH 179/278] Add a kgtk cat command. --- kgtk/cli/cat.py | 92 ++++++++++++++++++++++++++++++++++++++++++++ kgtk/join/kgtkcat.py | 4 +- 2 files changed, 94 insertions(+), 2 deletions(-) create mode 100644 kgtk/cli/cat.py diff --git a/kgtk/cli/cat.py b/kgtk/cli/cat.py new file mode 100644 index 000000000..e911c98c6 --- /dev/null +++ b/kgtk/cli/cat.py @@ -0,0 +1,92 @@ +""" +Concatenate KGTK files. + +TODO: Need KgtkWriterOptions +""" + +from argparse import Namespace, SUPPRESS +from pathlib import Path +import sys +import typing + +from kgtk.cli_argparse import KGTKArgumentParser +from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions +from kgtk.io.kgtkwriter import KgtkWriter +from kgtk.join.kgtkcat import KgtkCat +from kgtk.value.kgtkvalueoptions import KgtkValueOptions + +def parser(): + return { + 'help': 'Join two KGTK files', + 'description': 'Join two KGTK edge files or two KGTK node files. Two passes may be needed, stdin may be forbidden.' + } + + +def add_arguments_extended(parser: KGTKArgumentParser, parsed_shared_args: Namespace): + """ + Parse arguments + Args: + parser (argparse.ArgumentParser) + """ + + _expert: bool = parsed_shared_args._expert + + # This helper function makes it easy to suppress options from + # The help message. The options are still there, and initialize + # what they need to initialize. 
+ def h(msg: str)->str: + if _expert: + return msg + else: + return SUPPRESS + + parser.add_argument( "input_file_paths", help="The KGTK files to concatenate.", type=Path, nargs='+', default=[Path("-")]) + + parser.add_argument("-o", "--output-file", dest="output_file_path", help="The KGTK file to write (default=%(default)s).", type=Path, default="-") + + KgtkReader.add_debug_arguments(parser, expert=_expert) + KgtkReaderOptions.add_arguments(parser, mode_options=True, expert=_expert) + KgtkValueOptions.add_arguments(parser, expert=_expert) + +def run(input_file_paths: typing.List[Path], + output_file_path: Path, + + errors_to_stdout: bool = False, + errors_to_stderr: bool = True, + verbose: bool = False, + very_verbose: bool = False, + + **kwargs # Whatever KgtkFileOptions and KgtkValueOptions want. +)->int: + # import modules locally + from kgtk.exceptions import KGTKException + + + # Select where to send error messages, defaulting to stderr. + error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr + + # TODO: check that at most one input file is stdin? + + # Build the option structures. + reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs) + value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs) + + try: + kc: KgtkCat = KgtkCat(input_file_paths=input_file_paths, + output_path=output_file_path, + reader_options=reader_options, + value_options=value_options, + error_file=error_file, + verbose=verbose, + very_verbose=very_verbose + ) + + kc.process() + + return 0 + + except SystemExit as e: + raise KGTKException("Exit requested") + except Exception as e: + raise KGTKException(str(e)) + diff --git a/kgtk/join/kgtkcat.py b/kgtk/join/kgtkcat.py index 7861185d8..cb0d6ba8d 100644 --- a/kgtk/join/kgtkcat.py +++ b/kgtk/join/kgtkcat.py @@ -171,7 +171,7 @@ def main(): reader_options: KgtkReaderOptions = KgtkReaderOptions.from_args(args) value_options: KgtkValueOptions = KgtkValueOptions.from_args(args) - ec: KgtkCat = KgtkCat(input_file_paths=args.input_file_paths, + kc: KgtkCat = KgtkCat(input_file_paths=args.input_file_paths, output_path=args.output_file_path, reader_options=reader_options, value_options=value_options, @@ -179,7 +179,7 @@ def main(): verbose=args.verbose, very_verbose=args.very_verbose) - ec.process() + kc.process() if __name__ == "__main__": main() From e36f94624cbc28aeccfee67447312d6139024504 Mon Sep 17 00:00:00 2001 From: Rongpeng Date: Wed, 13 May 2020 08:27:39 -0700 Subject: [PATCH 180/278] no change of - to _ in qnode --- kgtk/triple_generator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kgtk/triple_generator.py b/kgtk/triple_generator.py index a948908ee..c79789d3b 100644 --- a/kgtk/triple_generator.py +++ b/kgtk/triple_generator.py @@ -21,7 +21,7 @@ ) from etk.knowledge_graph.node import LiteralType -BAD_CHARS = [":", "-", "&", ",", " ", +BAD_CHARS = [":", "&", ",", " ", "(", ")", "\'", '\"', "/", "\\", "[", "]", ";", "|"] From 14d4e3813913deef48aff711f637226da90d7ccb Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Wed, 13 May 2020 10:31:36 -0700 Subject: [PATCH 181/278] Refactor slightly and close input files on an error return. 
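
The early returns added here must close both input readers first, since the
method no longer reaches the common cleanup at the end. A self-contained
sketch of the same guarantee expressed with try/finally (illustrative only,
not KGTK code; ok_to_join is a stand-in for the edge/node compatibility
check):

    import io

    def ok_to_join(left, right) -> bool:
        return True   # stand-in for the real compatibility check

    def process(left, right) -> int:
        try:
            if not ok_to_join(left, right):
                return 1   # error return: finally still closes both inputs
            return 0
        finally:
            left.close()
            right.close()

    print(process(io.StringIO("left"), io.StringIO("right")))   # prints 0
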
--- kgtk/join/kgtkjoiner.py | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/kgtk/join/kgtkjoiner.py b/kgtk/join/kgtkjoiner.py index 5c2f70139..a1662ae0e 100644 --- a/kgtk/join/kgtkjoiner.py +++ b/kgtk/join/kgtkjoiner.py @@ -213,6 +213,21 @@ def join_key_sets(self, left_join_idx_list: typing.List[int], right_join_idx_lis print("There are %d keys in the inner join key set." % len(join_key_set), file=self.error_file, flush=True) return join_key_set + def ok_to_join(self, left_kr: KgtkReader, right_kr: KgtkReader)->bool: + if left_kr.is_edge_file and right_kr.is_edge_file: + if self.verbose: + print("Both input files are edge files.", file=self.error_file, flush=True) + return True + + elif left_kr.is_node_file and right_kr.is_node_file: + if self.verbose: + print("Both input files are node files.", file=self.error_file, flush=True) + return True + + else: + print("Cannot join edge and node files.", file=self.error_file, flush=True) + return False + def process(self): if self.verbose: print("Opening the left edge file: %s" % str(self.left_file_path), file=self.error_file, flush=True) @@ -235,15 +250,10 @@ def process(self): very_verbose=self.very_verbose ) - if left_kr.is_edge_file and right_kr.is_edge_file: - if self.verbose: - print("Both input files are edge files.", file=self.error_file, flush=True) - elif left_kr.is_node_file and right_kr.is_node_file: - if self.verbose: - print("Both input files are node files.", file=self.error_file, flush=True) - else: - print("Cannot join edge and node files.", file=self.error_file, flush=True) - return + if not self.ok_to_join(left_kr, right_kr): + left_kr.close() + right_kr.close() + return 1 left_join_idx_list: typing.List[int] = self.build_join_idx_list(left_kr, self.LEFT, self.left_join_columns) right_join_idx_list: typing.List[int] = self.build_join_idx_list(right_kr, self.RIGHT, self.right_join_columns) @@ -251,7 +261,7 @@ def process(self): print("the left join key has %d components, the right join key has %d columns. Exiting." % (len(left_join_idx_list), len(right_join_idx_list)), file=self.error_file, flush=True) left_kr.close() right_kr.close() - return + return 1 # This might open the input files for a second time. This won't work with stdin. joined_key_set: typing.Optional[typing.Set[str]] = self.join_key_sets(left_join_idx_list, right_join_idx_list) From 19c06ec43fe5ac329f430a5987998db884bf1234 Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Wed, 13 May 2020 10:48:20 -0700 Subject: [PATCH 182/278] Add file processing arguments that apply to both files. Fix the suppression of the --errors-to- arguments. --- kgtk/cli/join.py | 8 +++----- kgtk/io/kgtkreader.py | 38 +++++++++++++++++++++++++------------- 2 files changed, 28 insertions(+), 18 deletions(-) diff --git a/kgtk/cli/join.py b/kgtk/cli/join.py index 10e754c58..09044526a 100644 --- a/kgtk/cli/join.py +++ b/kgtk/cli/join.py @@ -65,16 +65,14 @@ def h(msg: str)->str: parser.add_argument( "--right-join", dest="right_join", help="Perform a right outer join (default=%(default)s).", action='store_true') - # This argument is retained for compatability with earlier versions of this command. 
- parser.add_argument( "--error-limit", dest="error_limit", - help=h("The maximum number of errors per input fule (default=%(default)s)"), - default=KgtkReaderOptions.ERROR_LIMIT_DEFAULT) - parser.add_argument( "--field-separator", dest="field_separator", help=h("Separator for multifield keys (default=%(default)s)") , default=KgtkJoiner.FIELD_SEPARATOR_DEFAULT) + # Build the command arguments. File arguments can be set for individual + # files, or for all files. KgtkReader.add_debug_arguments(parser, expert=_expert) + KgtkReaderOptions.add_arguments(parser, mode_options=True, expert=_expert) KgtkReaderOptions.add_arguments(parser, mode_options=True, who="left", expert=_expert) KgtkReaderOptions.add_arguments(parser, mode_options=True, who="right", expert=_expert) KgtkValueOptions.add_arguments(parser, expert=_expert) diff --git a/kgtk/io/kgtkreader.py b/kgtk/io/kgtkreader.py index abc5e76fa..23f8e6c0c 100644 --- a/kgtk/io/kgtkreader.py +++ b/kgtk/io/kgtkreader.py @@ -953,19 +953,31 @@ def h(msg: str)->str: else: return SUPPRESS - # TODO: Fix the argparse bug that prevents these two arguments from - # having their help messages suppressed. - errors_to = parser.add_mutually_exclusive_group() - errors_to.add_argument( "--errors-to-stdout", dest="errors_to_stdout", - help="Send errors to stdout instead of stderr", - action="store_true") - errors_to.add_argument( "--errors-to-stderr", dest="errors_to_stderr", - help="Send errors to stderr instead of stdout", - action="store_true") - - parser.add_argument("-v", "--verbose", dest="verbose", help="Print additional progress messages.", action='store_true') - - parser.add_argument( "--very-verbose", dest="very_verbose", + egroup: _ArgumentGroup = parser.add_argument_group(h("Error and feedback messages"), + h("Send error messages and feedback to stderr or stdout, " + + "control the amount of feedback and debugging messages.")) + + # Avoid the argparse bug that prevents these two arguments from having + # their help messages suppressed directly. + if expert: + errors_to = egroup.add_mutually_exclusive_group() + errors_to.add_argument( "--errors-to-stdout", dest="errors_to_stdout", + help="Send errors to stdout instead of stderr", + action="store_true") + errors_to.add_argument( "--errors-to-stderr", dest="errors_to_stderr", + help="Send errors to stderr instead of stdout", + action="store_true") + else: + egroup.add_argument( "--errors-to-stdout", dest="errors_to_stdout", + help=h("Send errors to stdout instead of stderr"), + action="store_true") + egroup.add_argument( "--errors-to-stderr", dest="errors_to_stderr", + help=h("Send errors to stderr instead of stdout"), + action="store_true") + + egroup.add_argument("-v", "--verbose", dest="verbose", help="Print additional progress messages.", action='store_true') + + egroup.add_argument( "--very-verbose", dest="very_verbose", help=h("Print additional progress messages."), action='store_true') From 92813786ed3b6b91613e1096be675244b866d2cb Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Wed, 13 May 2020 10:59:41 -0700 Subject: [PATCH 183/278] Better documentation. --- kgtk/cli/join.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/kgtk/cli/join.py b/kgtk/cli/join.py index 09044526a..f7bf8820a 100644 --- a/kgtk/cli/join.py +++ b/kgtk/cli/join.py @@ -18,7 +18,19 @@ def parser(): return { 'help': 'Join two KGTK files', - 'description': 'Join two KGTK edge files or two KGTK node files. Two passes may be needed, stdin may be forbidden.' 
+ 'description': """Join two KGTK edge files or two KGTK node files. +Join keys are extracted from one or both input files and stored in memory, +then the data is processed in a second pass. +stdin will not work as an input file if two passes are needed. + +The output file contains the union of the columns in the two +input files, adjusted for predefined name aliasing. + +Specify --left-join to get a left outer join. +Specify --right-join to get a right outer join. +Specify both to get a full outer join (equivalent to cat). +Specify neither to get an inner join. +""" } @@ -40,9 +52,9 @@ def h(msg: str)->str: else: return SUPPRESS - parser.add_argument( "left_file_path", help="The left-side KGTK file to join (no default).", type=Path, default=None) + parser.add_argument( "left_file_path", help="The left-side KGTK file to join (required).", type=Path, default=None) - parser.add_argument( "right_file_path", help="The right-side KGTK file to join (no default).", type=Path, default=None) + parser.add_argument( "right_file_path", help="The right-side KGTK file to join (required).", type=Path, default=None) parser.add_argument( "--join-on-label", dest="join_on_label", help="If both input files are edge files, include the label column in the join (default=%(default)s).", From 7b9783bcb3a10970ad1f0f34dc0a490e6d395ac3 Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Wed, 13 May 2020 11:02:33 -0700 Subject: [PATCH 184/278] Mention expert mode. --- kgtk/cli/join.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kgtk/cli/join.py b/kgtk/cli/join.py index f7bf8820a..7e88b5449 100644 --- a/kgtk/cli/join.py +++ b/kgtk/cli/join.py @@ -30,6 +30,8 @@ def parser(): Specify --right-join to get a right outer join. Specify both to get a full outer join (equivalent to cat). Specify neither to get an inner join. + +Expert mode provides additional command arguments. 
""" } From 0285933c708f548fb1da562d370f8fae482efd7a Mon Sep 17 00:00:00 2001 From: saggu Date: Wed, 13 May 2020 12:02:00 -0700 Subject: [PATCH 185/278] add unit testsf or triple generation --- kgtk/tests/data/P10.tsv | 22 + kgtk/tests/data/P10_not_truthy.ttl | 154 + kgtk/tests/data/P10_truthy.ttl | 169 + kgtk/tests/data/Q57160439.tsv | 40 + kgtk/tests/data/Q57160439_not_truthy.ttl | 200 + kgtk/tests/data/Q57160439_truthy.ttl | 224 + kgtk/tests/data/wikidata_properties.tsv | 7440 ++++++++++++++++++++++ kgtk/tests/test_triple_generation.py | 97 + 8 files changed, 8346 insertions(+) create mode 100644 kgtk/tests/data/P10.tsv create mode 100644 kgtk/tests/data/P10_not_truthy.ttl create mode 100644 kgtk/tests/data/P10_truthy.ttl create mode 100644 kgtk/tests/data/Q57160439.tsv create mode 100644 kgtk/tests/data/Q57160439_not_truthy.ttl create mode 100644 kgtk/tests/data/Q57160439_truthy.ttl create mode 100644 kgtk/tests/data/wikidata_properties.tsv create mode 100644 kgtk/tests/test_triple_generation.py diff --git a/kgtk/tests/data/P10.tsv b/kgtk/tests/data/P10.tsv new file mode 100644 index 000000000..412cd3a05 --- /dev/null +++ b/kgtk/tests/data/P10.tsv @@ -0,0 +1,22 @@ +node1 property node2 id +P10 P1628 http://schema.org/video P10-P1628-1 +P10 P1628 http://www.w3.org/2006/vcard/ns#Video P10-P1628-2 +P10 P1629 Q34508 P10-P1629-1 +P10 P1659 P1651 P10-P1659-1 +P10 P1659 P51 P10-P1659-2 +P10 P1659 P18 P10-P1659-3 +P10 P1659 P4238 P10-P1659-4 +P10 P1855 Q4504 P10-P1855-1 +P10 P1855 Q69063653 P10-P1855-2 +P10 P1855 Q15075950 P10-P1855-3 +P10 P1855 Q7378 P10-P1855-4 +P10 P2302 Q21510852 P10-P2302-1 +P10 P2302 Q21502404 P10-P2302-2 +P10 P2302 Q21510851 P10-P2302-3 +P10 P31 Q18610173 P10-P31-1 +P10 aliases 'media'@en P10-alias-0 +P10 aliases 'animation'@en P10-alias-1 +P10 aliases 'gif'@en P10-alias-2 +P10 aliases 'trailer (Commons)'@en P10-alias-3 +P10 descriptions "'relevant video. For images, use the property P18. For film trailers, qualify with ""object has role"" (P3831)=""trailer"" (Q622550)'@en" P10-description-0 +P10 label 'video'@en P10-label-1 diff --git a/kgtk/tests/data/P10_not_truthy.ttl b/kgtk/tests/data/P10_not_truthy.ttl new file mode 100644 index 000000000..688da992a --- /dev/null +++ b/kgtk/tests/data/P10_not_truthy.ttl @@ -0,0 +1,154 @@ +@prefix wikibase: . +@prefix wd: . +@prefix wdt: . +@prefix wdtn: . +@prefix wdno: . +@prefix wds: . +@prefix wdv: . +@prefix wdref: . +@prefix p: . +@prefix pr: . +@prefix prv: . +@prefix prn: . +@prefix ps: . +@prefix psv: . +@prefix psn: . +@prefix pq: . +@prefix pqv: . +@prefix pqn: . +@prefix prov: . +@prefix skos: . +@prefix schema: . + +wd:P10 a wikibase:Property ; + rdfs:label "video"@en ; + schema:description "relevant video. For images, use the property P18. 
For film trailers, qualify with object has role (P3831)=trailer (Q622550)"@en ; + schema:name "video"@en ; + wikibase:claim p:P10 ; + wikibase:directClaim wdt:P10 ; + wikibase:directClaimNormalized wdtn:P10 ; + wikibase:novalue wdno:P10 ; + wikibase:propertyType wikibase:String ; + wikibase:qualifier pq:P10 ; + wikibase:qualifierValue pqv:P10 ; + wikibase:qualifierValueNormalized pqn:P10 ; + wikibase:reference pr:P10 ; + wikibase:referenceValue prv:P10 ; + wikibase:referenceValueNormalized prn:P10 ; + wikibase:statementProperty ps:P10 ; + wikibase:statementValue psv:P10 ; + wikibase:statementValueNormalized psn:P10 ; + skos:altLabel "animation"@en, + "gif"@en, + "media"@en, + "trailer (Commons)"@en ; + skos:prefLabel "video"@en ; + p:P1628 wds:P10-P10-P1628-1, + wds:P10-P10-P1628-2 ; + p:P1629 wds:P10-P10-P1629-1 ; + p:P1659 wds:P10-P10-P1659-1, + wds:P10-P10-P1659-2, + wds:P10-P10-P1659-3, + wds:P10-P10-P1659-4 ; + p:P1855 wds:P10-P10-P1855-1, + wds:P10-P10-P1855-2, + wds:P10-P10-P1855-3, + wds:P10-P10-P1855-4 ; + p:P2302 wds:P10-P10-P2302-1, + wds:P10-P10-P2302-2, + wds:P10-P10-P2302-3 ; + p:P31 wds:P10-P10-P31-1 . + +wd:Q15075950 a wikibase:Item . + +wd:Q18610173 a wikibase:Item . + +wd:Q21502404 a wikibase:Item . + +wd:Q21510851 a wikibase:Item . + +wd:Q21510852 a wikibase:Item . + +wd:Q34508 a wikibase:Item . + +wd:Q4504 a wikibase:Item . + +wd:Q69063653 a wikibase:Item . + +wd:Q7378 a wikibase:Item . + +wds:P10-P10-P1628-1 a wikibase:Statement ; + wikibase:rank wikibase:NormalRank ; + ; + ps:P1628 "http://schema.org/video"^^xsd:string . + +wds:P10-P10-P1628-2 a wikibase:Statement ; + wikibase:rank wikibase:NormalRank ; + ; + ps:P1628 "http://www.w3.org/2006/vcard/ns#Video"^^xsd:string . + +wds:P10-P10-P1629-1 a wikibase:Statement ; + wikibase:rank wikibase:NormalRank ; + ; + ps:P1629 wd:Q34508 . + +wds:P10-P10-P1659-1 a wikibase:Statement ; + wikibase:rank wikibase:NormalRank ; + ; + ps:P1659 "P1651"^^xsd:string . + +wds:P10-P10-P1659-2 a wikibase:Statement ; + wikibase:rank wikibase:NormalRank ; + ; + ps:P1659 "P51"^^xsd:string . + +wds:P10-P10-P1659-3 a wikibase:Statement ; + wikibase:rank wikibase:NormalRank ; + ; + ps:P1659 "P18"^^xsd:string . + +wds:P10-P10-P1659-4 a wikibase:Statement ; + wikibase:rank wikibase:NormalRank ; + ; + ps:P1659 "P4238"^^xsd:string . + +wds:P10-P10-P1855-1 a wikibase:Statement ; + wikibase:rank wikibase:NormalRank ; + ; + ps:P1855 wd:Q4504 . + +wds:P10-P10-P1855-2 a wikibase:Statement ; + wikibase:rank wikibase:NormalRank ; + ; + ps:P1855 wd:Q69063653 . + +wds:P10-P10-P1855-3 a wikibase:Statement ; + wikibase:rank wikibase:NormalRank ; + ; + ps:P1855 wd:Q15075950 . + +wds:P10-P10-P1855-4 a wikibase:Statement ; + wikibase:rank wikibase:NormalRank ; + ; + ps:P1855 wd:Q7378 . + +wds:P10-P10-P2302-1 a wikibase:Statement ; + wikibase:rank wikibase:NormalRank ; + ; + ps:P2302 wd:Q21510852 . + +wds:P10-P10-P2302-2 a wikibase:Statement ; + wikibase:rank wikibase:NormalRank ; + ; + ps:P2302 wd:Q21502404 . + +wds:P10-P10-P2302-3 a wikibase:Statement ; + wikibase:rank wikibase:NormalRank ; + ; + ps:P2302 wd:Q21510851 . + +wds:P10-P10-P31-1 a wikibase:Statement ; + wikibase:rank wikibase:NormalRank ; + ; + ps:P31 wd:Q18610173 . + diff --git a/kgtk/tests/data/P10_truthy.ttl b/kgtk/tests/data/P10_truthy.ttl new file mode 100644 index 000000000..eb2a54df1 --- /dev/null +++ b/kgtk/tests/data/P10_truthy.ttl @@ -0,0 +1,169 @@ +@prefix wikibase: . +@prefix wd: . +@prefix wdt: . +@prefix wdtn: . +@prefix wdno: . +@prefix wds: . +@prefix wdv: . +@prefix wdref: . 
+@prefix p: . +@prefix pr: . +@prefix prv: . +@prefix prn: . +@prefix ps: . +@prefix psv: . +@prefix psn: . +@prefix pq: . +@prefix pqv: . +@prefix pqn: . +@prefix prov: . +@prefix skos: . +@prefix schema: . + +wd:P10 a wikibase:Property ; + rdfs:label "video"@en ; + schema:description "relevant video. For images, use the property P18. For film trailers, qualify with object has role (P3831)=trailer (Q622550)"@en ; + schema:name "video"@en ; + wikibase:claim p:P10 ; + wikibase:directClaim wdt:P10 ; + wikibase:directClaimNormalized wdtn:P10 ; + wikibase:novalue wdno:P10 ; + wikibase:propertyType wikibase:String ; + wikibase:qualifier pq:P10 ; + wikibase:qualifierValue pqv:P10 ; + wikibase:qualifierValueNormalized pqn:P10 ; + wikibase:reference pr:P10 ; + wikibase:referenceValue prv:P10 ; + wikibase:referenceValueNormalized prn:P10 ; + wikibase:statementProperty ps:P10 ; + wikibase:statementValue psv:P10 ; + wikibase:statementValueNormalized psn:P10 ; + skos:altLabel "animation"@en, + "gif"@en, + "media"@en, + "trailer (Commons)"@en ; + skos:prefLabel "video"@en ; + p:P1628 wds:P10-P10-P1628-1, + wds:P10-P10-P1628-2 ; + p:P1629 wds:P10-P10-P1629-1 ; + p:P1659 wds:P10-P10-P1659-1, + wds:P10-P10-P1659-2, + wds:P10-P10-P1659-3, + wds:P10-P10-P1659-4 ; + p:P1855 wds:P10-P10-P1855-1, + wds:P10-P10-P1855-2, + wds:P10-P10-P1855-3, + wds:P10-P10-P1855-4 ; + p:P2302 wds:P10-P10-P2302-1, + wds:P10-P10-P2302-2, + wds:P10-P10-P2302-3 ; + p:P31 wds:P10-P10-P31-1 ; + wdt:P1628 "http://schema.org/video"^^xsd:string, + "http://www.w3.org/2006/vcard/ns#Video"^^xsd:string ; + wdt:P1629 wd:Q34508 ; + wdt:P1659 "P1651"^^xsd:string, + "P18"^^xsd:string, + "P4238"^^xsd:string, + "P51"^^xsd:string ; + wdt:P1855 wd:Q15075950, + wd:Q4504, + wd:Q69063653, + wd:Q7378 ; + wdt:P2302 wd:Q21502404, + wd:Q21510851, + wd:Q21510852 ; + wdt:P31 wd:Q18610173 . + +wds:P10-P10-P1628-1 a wikibase:Statement ; + wikibase:rank wikibase:BestRank ; + ; + ps:P1628 "http://schema.org/video"^^xsd:string . + +wds:P10-P10-P1628-2 a wikibase:Statement ; + wikibase:rank wikibase:BestRank ; + ; + ps:P1628 "http://www.w3.org/2006/vcard/ns#Video"^^xsd:string . + +wds:P10-P10-P1629-1 a wikibase:Statement ; + wikibase:rank wikibase:BestRank ; + ; + ps:P1629 wd:Q34508 . + +wds:P10-P10-P1659-1 a wikibase:Statement ; + wikibase:rank wikibase:BestRank ; + ; + ps:P1659 "P1651"^^xsd:string . + +wds:P10-P10-P1659-2 a wikibase:Statement ; + wikibase:rank wikibase:BestRank ; + ; + ps:P1659 "P51"^^xsd:string . + +wds:P10-P10-P1659-3 a wikibase:Statement ; + wikibase:rank wikibase:BestRank ; + ; + ps:P1659 "P18"^^xsd:string . + +wds:P10-P10-P1659-4 a wikibase:Statement ; + wikibase:rank wikibase:BestRank ; + ; + ps:P1659 "P4238"^^xsd:string . + +wds:P10-P10-P1855-1 a wikibase:Statement ; + wikibase:rank wikibase:BestRank ; + ; + ps:P1855 wd:Q4504 . + +wds:P10-P10-P1855-2 a wikibase:Statement ; + wikibase:rank wikibase:BestRank ; + ; + ps:P1855 wd:Q69063653 . + +wds:P10-P10-P1855-3 a wikibase:Statement ; + wikibase:rank wikibase:BestRank ; + ; + ps:P1855 wd:Q15075950 . + +wds:P10-P10-P1855-4 a wikibase:Statement ; + wikibase:rank wikibase:BestRank ; + ; + ps:P1855 wd:Q7378 . + +wds:P10-P10-P2302-1 a wikibase:Statement ; + wikibase:rank wikibase:BestRank ; + ; + ps:P2302 wd:Q21510852 . + +wds:P10-P10-P2302-2 a wikibase:Statement ; + wikibase:rank wikibase:BestRank ; + ; + ps:P2302 wd:Q21502404 . + +wds:P10-P10-P2302-3 a wikibase:Statement ; + wikibase:rank wikibase:BestRank ; + ; + ps:P2302 wd:Q21510851 . 
+
+wds:P10-P10-P31-1 a wikibase:Statement ;
+    wikibase:rank wikibase:BestRank ;
+     ;
+    ps:P31 wd:Q18610173 .
+
+wd:Q15075950 a wikibase:Item .
+
+wd:Q18610173 a wikibase:Item .
+
+wd:Q21502404 a wikibase:Item .
+
+wd:Q21510851 a wikibase:Item .
+
+wd:Q21510852 a wikibase:Item .
+
+wd:Q34508 a wikibase:Item .
+
+wd:Q4504 a wikibase:Item .
+
+wd:Q69063653 a wikibase:Item .
+
+wd:Q7378 a wikibase:Item .
+
diff --git a/kgtk/tests/data/Q57160439.tsv b/kgtk/tests/data/Q57160439.tsv
new file mode 100644
index 000000000..2307da78a
--- /dev/null
+++ b/kgtk/tests/data/Q57160439.tsv
@@ -0,0 +1,40 @@
+node1 property node2 id
+Q57160439 P1433 Q1146531 Q57160439-P1433-1
+Q57160439 P1476 'A quantitative test to estimate neutralizing antibodies to the hepatitis C virus: cytofluorimetric assessment of envelope glycoprotein 2 binding to target cells'@en Q57160439-P1476-1
+Q57160439 P2093 D. Rosa Q57160439-P2093-1
+Q57160439-P2093-1 P1545 1 Q57160439-P2093-1-P1545-1
+Q57160439 P2093 Q. L. Choo Q57160439-P2093-10
+Q57160439-P2093-10 P1545 10 Q57160439-P2093-10-P1545-1
+Q57160439 P2093 D. Chien Q57160439-P2093-11
+Q57160439-P2093-11 P1545 11 Q57160439-P2093-11-P1545-1
+Q57160439 P2093 P. Pileri Q57160439-P2093-12
+Q57160439-P2093-12 P1545 12 Q57160439-P2093-12-P1545-1
+Q57160439 P2093 M. Houghton Q57160439-P2093-13
+Q57160439-P2093-13 P1545 13 Q57160439-P2093-13-P1545-1
+Q57160439 P2093 S. Abrignani Q57160439-P2093-14
+Q57160439-P2093-14 P1545 14 Q57160439-P2093-14-P1545-1
+Q57160439 P2093 S. Campagnoli Q57160439-P2093-2
+Q57160439-P2093-2 P1545 2 Q57160439-P2093-2-P1545-1
+Q57160439 P2093 C. Moretto Q57160439-P2093-3
+Q57160439-P2093-3 P1545 3 Q57160439-P2093-3-P1545-1
+Q57160439 P2093 E. Guenzi Q57160439-P2093-4
+Q57160439-P2093-4 P1545 4 Q57160439-P2093-4-P1545-1
+Q57160439 P2093 L. Cousens Q57160439-P2093-5
+Q57160439-P2093-5 P1545 5 Q57160439-P2093-5-P1545-1
+Q57160439 P2093 M. Chin Q57160439-P2093-6
+Q57160439-P2093-6 P1545 6 Q57160439-P2093-6-P1545-1
+Q57160439 P2093 C. Dong Q57160439-P2093-7
+Q57160439-P2093-7 P1545 7 Q57160439-P2093-7-P1545-1
+Q57160439 P2093 A. J. Weiner Q57160439-P2093-8
+Q57160439-P2093-8 P1545 8 Q57160439-P2093-8-P1545-1
+Q57160439 P2093 J. Y. Lau Q57160439-P2093-9
+Q57160439-P2093-9 P1545 9 Q57160439-P2093-9-P1545-1
+Q57160439 P304 1759-1763 Q57160439-P304-1
+Q57160439 P31 Q13442814 Q57160439-P31-1
+Q57160439 P356 10.1073/PNAS.93.5.1759 Q57160439-P356-1
+Q57160439 P407 Q1860 Q57160439-P407-1
+Q57160439 P433 5 Q57160439-P433-1
+Q57160439 P478 93 Q57160439-P478-1
+Q57160439 P577 ^1996-03-05T00:00:00Z/11 Q57160439-P577-1
+Q57160439 P921 Q79460 Q57160439-P921-1
+Q57160439 label 'A quantitative test to estimate neutralizing antibodies to the hepatitis C virus: cytofluorimetric assessment of envelope glycoprotein 2 binding to target cells'@en Q57160439-label-1
diff --git a/kgtk/tests/data/Q57160439_not_truthy.ttl b/kgtk/tests/data/Q57160439_not_truthy.ttl
new file mode 100644
index 000000000..58ebdbf77
--- /dev/null
+++ b/kgtk/tests/data/Q57160439_not_truthy.ttl
@@ -0,0 +1,200 @@
+@prefix wikibase: <http://wikiba.se/ontology#> .
+@prefix wd: <http://www.wikidata.org/entity/> .
+@prefix wdt: <http://www.wikidata.org/prop/direct/> .
+@prefix wdtn: <http://www.wikidata.org/prop/direct-normalized/> .
+@prefix wdno: <http://www.wikidata.org/prop/novalue/> .
+@prefix wds: <http://www.wikidata.org/entity/statement/> .
+@prefix wdv: <http://www.wikidata.org/value/> .
+@prefix wdref: <http://www.wikidata.org/reference/> .
+@prefix p: <http://www.wikidata.org/prop/> .
+@prefix pr: <http://www.wikidata.org/prop/reference/> .
+@prefix prv: <http://www.wikidata.org/prop/reference/value/> .
+@prefix prn: <http://www.wikidata.org/prop/reference/value-normalized/> .
+@prefix ps: <http://www.wikidata.org/prop/statement/> .
+@prefix psv: <http://www.wikidata.org/prop/statement/value/> .
+@prefix psn: <http://www.wikidata.org/prop/statement/value-normalized/> .
+@prefix pq: <http://www.wikidata.org/prop/qualifier/> .
+@prefix pqv: <http://www.wikidata.org/prop/qualifier/value/> .
+@prefix pqn: <http://www.wikidata.org/prop/qualifier/value-normalized/> .
+@prefix prov: <http://www.w3.org/ns/prov#> .
+@prefix skos: <http://www.w3.org/2004/02/skos/core#> .
+@prefix schema: <http://schema.org/> .
+ +wd:Q57160439 a wikibase:Item ; + rdfs:label "A quantitative test to estimate neutralizing antibodies to the hepatitis C virus: cytofluorimetric assessment of envelope glycoprotein 2 binding to target cells"@en ; + schema:name "A quantitative test to estimate neutralizing antibodies to the hepatitis C virus: cytofluorimetric assessment of envelope glycoprotein 2 binding to target cells"@en ; + skos:prefLabel "A quantitative test to estimate neutralizing antibodies to the hepatitis C virus: cytofluorimetric assessment of envelope glycoprotein 2 binding to target cells"@en ; + p:P1433 wds:Q57160439-Q57160439-P1433-1 ; + p:P1476 wds:Q57160439-Q57160439-P1476-1 ; + p:P2093 wds:Q57160439-Q57160439-P2093-1, + wds:Q57160439-Q57160439-P2093-10, + wds:Q57160439-Q57160439-P2093-11, + wds:Q57160439-Q57160439-P2093-12, + wds:Q57160439-Q57160439-P2093-13, + wds:Q57160439-Q57160439-P2093-14, + wds:Q57160439-Q57160439-P2093-2, + wds:Q57160439-Q57160439-P2093-3, + wds:Q57160439-Q57160439-P2093-4, + wds:Q57160439-Q57160439-P2093-5, + wds:Q57160439-Q57160439-P2093-6, + wds:Q57160439-Q57160439-P2093-7, + wds:Q57160439-Q57160439-P2093-8, + wds:Q57160439-Q57160439-P2093-9 ; + p:P304 wds:Q57160439-Q57160439-P304-1 ; + p:P31 wds:Q57160439-Q57160439-P31-1 ; + p:P356 wds:Q57160439-Q57160439-P356-1 ; + p:P407 wds:Q57160439-Q57160439-P407-1 ; + p:P433 wds:Q57160439-Q57160439-P433-1 ; + p:P478 wds:Q57160439-Q57160439-P478-1 ; + p:P577 wds:Q57160439-Q57160439-P577-1 ; + p:P921 wds:Q57160439-Q57160439-P921-1 . + +wd:Q1146531 a wikibase:Item . + +wd:Q13442814 a wikibase:Item . + +wd:Q1860 a wikibase:Item . + +wd:Q79460 a wikibase:Item . + +wds:Q57160439-Q57160439-P1433-1 a wikibase:Statement ; + wikibase:rank wikibase:NormalRank ; + ; + ps:P1433 wd:Q1146531 . + +wds:Q57160439-Q57160439-P1476-1 a wikibase:Statement ; + wikibase:rank wikibase:NormalRank ; + ; + ps:P1476 "A quantitative test to estimate neutralizing antibodies to the hepatitis C virus: cytofluorimetric assessment of envelope glycoprotein 2 binding to target cells"@en . + +wds:Q57160439-Q57160439-P2093-1 a wikibase:Statement ; + wikibase:rank wikibase:NormalRank ; + ; + pq:P1545 "1"^^xsd:string ; + ps:P2093 "D. Rosa"^^xsd:string . + +wds:Q57160439-Q57160439-P2093-10 a wikibase:Statement ; + wikibase:rank wikibase:NormalRank ; + ; + pq:P1545 "10"^^xsd:string ; + ps:P2093 "Q. L. Choo"^^xsd:string . + +wds:Q57160439-Q57160439-P2093-11 a wikibase:Statement ; + wikibase:rank wikibase:NormalRank ; + ; + pq:P1545 "11"^^xsd:string ; + ps:P2093 "D. Chien"^^xsd:string . + +wds:Q57160439-Q57160439-P2093-12 a wikibase:Statement ; + wikibase:rank wikibase:NormalRank ; + ; + pq:P1545 "12"^^xsd:string ; + ps:P2093 "P. Pileri"^^xsd:string . + +wds:Q57160439-Q57160439-P2093-13 a wikibase:Statement ; + wikibase:rank wikibase:NormalRank ; + ; + pq:P1545 "13"^^xsd:string ; + ps:P2093 "M. Houghton"^^xsd:string . + +wds:Q57160439-Q57160439-P2093-14 a wikibase:Statement ; + wikibase:rank wikibase:NormalRank ; + ; + pq:P1545 "14"^^xsd:string ; + ps:P2093 "S. Abrignani"^^xsd:string . + +wds:Q57160439-Q57160439-P2093-2 a wikibase:Statement ; + wikibase:rank wikibase:NormalRank ; + ; + pq:P1545 "2"^^xsd:string ; + ps:P2093 "S. Campagnoli"^^xsd:string . + +wds:Q57160439-Q57160439-P2093-3 a wikibase:Statement ; + wikibase:rank wikibase:NormalRank ; + ; + pq:P1545 "3"^^xsd:string ; + ps:P2093 "C. Moretto"^^xsd:string . + +wds:Q57160439-Q57160439-P2093-4 a wikibase:Statement ; + wikibase:rank wikibase:NormalRank ; + ; + pq:P1545 "4"^^xsd:string ; + ps:P2093 "E. Guenzi"^^xsd:string . 
+
+wds:Q57160439-Q57160439-P2093-5 a wikibase:Statement ;
+    wikibase:rank wikibase:NormalRank ;
+     ;
+    pq:P1545 "5"^^xsd:string ;
+    ps:P2093 "L. Cousens"^^xsd:string .
+
+wds:Q57160439-Q57160439-P2093-6 a wikibase:Statement ;
+    wikibase:rank wikibase:NormalRank ;
+     ;
+    pq:P1545 "6"^^xsd:string ;
+    ps:P2093 "M. Chin"^^xsd:string .
+
+wds:Q57160439-Q57160439-P2093-7 a wikibase:Statement ;
+    wikibase:rank wikibase:NormalRank ;
+     ;
+    pq:P1545 "7"^^xsd:string ;
+    ps:P2093 "C. Dong"^^xsd:string .
+
+wds:Q57160439-Q57160439-P2093-8 a wikibase:Statement ;
+    wikibase:rank wikibase:NormalRank ;
+     ;
+    pq:P1545 "8"^^xsd:string ;
+    ps:P2093 "A. J. Weiner"^^xsd:string .
+
+wds:Q57160439-Q57160439-P2093-9 a wikibase:Statement ;
+    wikibase:rank wikibase:NormalRank ;
+     ;
+    pq:P1545 "9"^^xsd:string ;
+    ps:P2093 "J. Y. Lau"^^xsd:string .
+
+wds:Q57160439-Q57160439-P304-1 a wikibase:Statement ;
+    wikibase:rank wikibase:NormalRank ;
+     ;
+    ps:P304 "1759-1763"^^xsd:string .
+
+wds:Q57160439-Q57160439-P31-1 a wikibase:Statement ;
+    wikibase:rank wikibase:NormalRank ;
+     ;
+    ps:P31 wd:Q13442814 .
+
+wds:Q57160439-Q57160439-P356-1 a wikibase:Statement ;
+    wikibase:rank wikibase:NormalRank ;
+     ;
+    ps:P356 "10.1073/PNAS.93.5.1759"^^xsd:string .
+
+wds:Q57160439-Q57160439-P407-1 a wikibase:Statement ;
+    wikibase:rank wikibase:NormalRank ;
+     ;
+    ps:P407 wd:Q1860 .
+
+wds:Q57160439-Q57160439-P433-1 a wikibase:Statement ;
+    wikibase:rank wikibase:NormalRank ;
+     ;
+    ps:P433 "5"^^xsd:string .
+
+wds:Q57160439-Q57160439-P478-1 a wikibase:Statement ;
+    wikibase:rank wikibase:NormalRank ;
+     ;
+    ps:P478 "93"^^xsd:string .
+
+wds:Q57160439-Q57160439-P577-1 a wikibase:Statement ;
+    wikibase:rank wikibase:NormalRank ;
+     ;
+    ps:P577 "1996-03-05T00:00:00"^^xsd:dateTime ;
+    psv:P577 wdv:Timec1996-03-05T000000cQc11c0 .
+
+wds:Q57160439-Q57160439-P921-1 a wikibase:Statement ;
+    wikibase:rank wikibase:NormalRank ;
+     ;
+    ps:P921 wd:Q79460 .
+
+wdv:Timec1996-03-05T000000cQc11c0 a wikibase:Time ;
+    wikibase:timeCalendarModel wd:Q1985727 ;
+    wikibase:timePrecision 11 ;
+    wikibase:timeTimezone 0 ;
+    wikibase:timeValue "1996-03-05T00:00:00"^^xsd:dateTime .
+
diff --git a/kgtk/tests/data/Q57160439_truthy.ttl b/kgtk/tests/data/Q57160439_truthy.ttl
new file mode 100644
index 000000000..79d181af1
--- /dev/null
+++ b/kgtk/tests/data/Q57160439_truthy.ttl
@@ -0,0 +1,224 @@
+@prefix wikibase: <http://wikiba.se/ontology#> .
+@prefix wd: <http://www.wikidata.org/entity/> .
+@prefix wdt: <http://www.wikidata.org/prop/direct/> .
+@prefix wdtn: <http://www.wikidata.org/prop/direct-normalized/> .
+@prefix wdno: <http://www.wikidata.org/prop/novalue/> .
+@prefix wds: <http://www.wikidata.org/entity/statement/> .
+@prefix wdv: <http://www.wikidata.org/value/> .
+@prefix wdref: <http://www.wikidata.org/reference/> .
+@prefix p: <http://www.wikidata.org/prop/> .
+@prefix pr: <http://www.wikidata.org/prop/reference/> .
+@prefix prv: <http://www.wikidata.org/prop/reference/value/> .
+@prefix prn: <http://www.wikidata.org/prop/reference/value-normalized/> .
+@prefix ps: <http://www.wikidata.org/prop/statement/> .
+@prefix psv: <http://www.wikidata.org/prop/statement/value/> .
+@prefix psn: <http://www.wikidata.org/prop/statement/value-normalized/> .
+@prefix pq: <http://www.wikidata.org/prop/qualifier/> .
+@prefix pqv: <http://www.wikidata.org/prop/qualifier/value/> .
+@prefix pqn: <http://www.wikidata.org/prop/qualifier/value-normalized/> .
+@prefix prov: <http://www.w3.org/ns/prov#> .
+@prefix skos: <http://www.w3.org/2004/02/skos/core#> .
+@prefix schema: <http://schema.org/> .
+ +wd:Q57160439 a wikibase:Item ; + rdfs:label "A quantitative test to estimate neutralizing antibodies to the hepatitis C virus: cytofluorimetric assessment of envelope glycoprotein 2 binding to target cells"@en ; + schema:name "A quantitative test to estimate neutralizing antibodies to the hepatitis C virus: cytofluorimetric assessment of envelope glycoprotein 2 binding to target cells"@en ; + skos:prefLabel "A quantitative test to estimate neutralizing antibodies to the hepatitis C virus: cytofluorimetric assessment of envelope glycoprotein 2 binding to target cells"@en ; + p:P1433 wds:Q57160439-Q57160439-P1433-1 ; + p:P1476 wds:Q57160439-Q57160439-P1476-1 ; + p:P2093 wds:Q57160439-Q57160439-P2093-1, + wds:Q57160439-Q57160439-P2093-10, + wds:Q57160439-Q57160439-P2093-11, + wds:Q57160439-Q57160439-P2093-12, + wds:Q57160439-Q57160439-P2093-13, + wds:Q57160439-Q57160439-P2093-14, + wds:Q57160439-Q57160439-P2093-2, + wds:Q57160439-Q57160439-P2093-3, + wds:Q57160439-Q57160439-P2093-4, + wds:Q57160439-Q57160439-P2093-5, + wds:Q57160439-Q57160439-P2093-6, + wds:Q57160439-Q57160439-P2093-7, + wds:Q57160439-Q57160439-P2093-8, + wds:Q57160439-Q57160439-P2093-9 ; + p:P304 wds:Q57160439-Q57160439-P304-1 ; + p:P31 wds:Q57160439-Q57160439-P31-1 ; + p:P356 wds:Q57160439-Q57160439-P356-1 ; + p:P407 wds:Q57160439-Q57160439-P407-1 ; + p:P433 wds:Q57160439-Q57160439-P433-1 ; + p:P478 wds:Q57160439-Q57160439-P478-1 ; + p:P577 wds:Q57160439-Q57160439-P577-1 ; + p:P921 wds:Q57160439-Q57160439-P921-1 ; + wdt:P1433 wd:Q1146531 ; + wdt:P1476 "A quantitative test to estimate neutralizing antibodies to the hepatitis C virus: cytofluorimetric assessment of envelope glycoprotein 2 binding to target cells"@en ; + wdt:P2093 "A. J. Weiner"^^xsd:string, + "C. Dong"^^xsd:string, + "C. Moretto"^^xsd:string, + "D. Chien"^^xsd:string, + "D. Rosa"^^xsd:string, + "E. Guenzi"^^xsd:string, + "J. Y. Lau"^^xsd:string, + "L. Cousens"^^xsd:string, + "M. Chin"^^xsd:string, + "M. Houghton"^^xsd:string, + "P. Pileri"^^xsd:string, + "Q. L. Choo"^^xsd:string, + "S. Abrignani"^^xsd:string, + "S. Campagnoli"^^xsd:string ; + wdt:P304 "1759-1763"^^xsd:string ; + wdt:P31 wd:Q13442814 ; + wdt:P356 "10.1073/PNAS.93.5.1759"^^xsd:string ; + wdt:P407 wd:Q1860 ; + wdt:P433 "5"^^xsd:string ; + wdt:P478 "93"^^xsd:string ; + wdt:P577 "1996-03-05T00:00:00"^^xsd:dateTime ; + wdt:P921 wd:Q79460 . + +wds:Q57160439-Q57160439-P1433-1 a wikibase:Statement ; + wikibase:rank wikibase:BestRank ; + ; + ps:P1433 wd:Q1146531 . + +wds:Q57160439-Q57160439-P1476-1 a wikibase:Statement ; + wikibase:rank wikibase:BestRank ; + ; + ps:P1476 "A quantitative test to estimate neutralizing antibodies to the hepatitis C virus: cytofluorimetric assessment of envelope glycoprotein 2 binding to target cells"@en . + +wds:Q57160439-Q57160439-P2093-1 a wikibase:Statement ; + wikibase:rank wikibase:BestRank ; + ; + pq:P1545 "1"^^xsd:string ; + ps:P2093 "D. Rosa"^^xsd:string . + +wds:Q57160439-Q57160439-P2093-10 a wikibase:Statement ; + wikibase:rank wikibase:BestRank ; + ; + pq:P1545 "10"^^xsd:string ; + ps:P2093 "Q. L. Choo"^^xsd:string . + +wds:Q57160439-Q57160439-P2093-11 a wikibase:Statement ; + wikibase:rank wikibase:BestRank ; + ; + pq:P1545 "11"^^xsd:string ; + ps:P2093 "D. Chien"^^xsd:string . + +wds:Q57160439-Q57160439-P2093-12 a wikibase:Statement ; + wikibase:rank wikibase:BestRank ; + ; + pq:P1545 "12"^^xsd:string ; + ps:P2093 "P. Pileri"^^xsd:string . 
+ +wds:Q57160439-Q57160439-P2093-13 a wikibase:Statement ; + wikibase:rank wikibase:BestRank ; + ; + pq:P1545 "13"^^xsd:string ; + ps:P2093 "M. Houghton"^^xsd:string . + +wds:Q57160439-Q57160439-P2093-14 a wikibase:Statement ; + wikibase:rank wikibase:BestRank ; + ; + pq:P1545 "14"^^xsd:string ; + ps:P2093 "S. Abrignani"^^xsd:string . + +wds:Q57160439-Q57160439-P2093-2 a wikibase:Statement ; + wikibase:rank wikibase:BestRank ; + ; + pq:P1545 "2"^^xsd:string ; + ps:P2093 "S. Campagnoli"^^xsd:string . + +wds:Q57160439-Q57160439-P2093-3 a wikibase:Statement ; + wikibase:rank wikibase:BestRank ; + ; + pq:P1545 "3"^^xsd:string ; + ps:P2093 "C. Moretto"^^xsd:string . + +wds:Q57160439-Q57160439-P2093-4 a wikibase:Statement ; + wikibase:rank wikibase:BestRank ; + ; + pq:P1545 "4"^^xsd:string ; + ps:P2093 "E. Guenzi"^^xsd:string . + +wds:Q57160439-Q57160439-P2093-5 a wikibase:Statement ; + wikibase:rank wikibase:BestRank ; + ; + pq:P1545 "5"^^xsd:string ; + ps:P2093 "L. Cousens"^^xsd:string . + +wds:Q57160439-Q57160439-P2093-6 a wikibase:Statement ; + wikibase:rank wikibase:BestRank ; + ; + pq:P1545 "6"^^xsd:string ; + ps:P2093 "M. Chin"^^xsd:string . + +wds:Q57160439-Q57160439-P2093-7 a wikibase:Statement ; + wikibase:rank wikibase:BestRank ; + ; + pq:P1545 "7"^^xsd:string ; + ps:P2093 "C. Dong"^^xsd:string . + +wds:Q57160439-Q57160439-P2093-8 a wikibase:Statement ; + wikibase:rank wikibase:BestRank ; + ; + pq:P1545 "8"^^xsd:string ; + ps:P2093 "A. J. Weiner"^^xsd:string . + +wds:Q57160439-Q57160439-P2093-9 a wikibase:Statement ; + wikibase:rank wikibase:BestRank ; + ; + pq:P1545 "9"^^xsd:string ; + ps:P2093 "J. Y. Lau"^^xsd:string . + +wds:Q57160439-Q57160439-P304-1 a wikibase:Statement ; + wikibase:rank wikibase:BestRank ; + ; + ps:P304 "1759-1763"^^xsd:string . + +wds:Q57160439-Q57160439-P31-1 a wikibase:Statement ; + wikibase:rank wikibase:BestRank ; + ; + ps:P31 wd:Q13442814 . + +wds:Q57160439-Q57160439-P356-1 a wikibase:Statement ; + wikibase:rank wikibase:BestRank ; + ; + ps:P356 "10.1073/PNAS.93.5.1759"^^xsd:string . + +wds:Q57160439-Q57160439-P407-1 a wikibase:Statement ; + wikibase:rank wikibase:BestRank ; + ; + ps:P407 wd:Q1860 . + +wds:Q57160439-Q57160439-P433-1 a wikibase:Statement ; + wikibase:rank wikibase:BestRank ; + ; + ps:P433 "5"^^xsd:string . + +wds:Q57160439-Q57160439-P478-1 a wikibase:Statement ; + wikibase:rank wikibase:BestRank ; + ; + ps:P478 "93"^^xsd:string . + +wds:Q57160439-Q57160439-P577-1 a wikibase:Statement ; + wikibase:rank wikibase:BestRank ; + ; + ps:P577 "1996-03-05T00:00:00"^^xsd:dateTime ; + psv:P577 wdv:Timec1996-03-05T000000cQc11c0 . + +wds:Q57160439-Q57160439-P921-1 a wikibase:Statement ; + wikibase:rank wikibase:BestRank ; + ; + ps:P921 wd:Q79460 . + +wdv:Timec1996-03-05T000000cQc11c0 a wikibase:Time ; + wikibase:timeCalendarModel wd:Q1985727 ; + wikibase:timePrecision 11 ; + wikibase:timeTimezone 0 ; + wikibase:timeValue "1996-03-05T00:00:00"^^xsd:dateTime . + +wd:Q1146531 a wikibase:Item . + +wd:Q13442814 a wikibase:Item . + +wd:Q1860 a wikibase:Item . + +wd:Q79460 a wikibase:Item . 
+ diff --git a/kgtk/tests/data/wikidata_properties.tsv b/kgtk/tests/data/wikidata_properties.tsv new file mode 100644 index 000000000..687d3175e --- /dev/null +++ b/kgtk/tests/data/wikidata_properties.tsv @@ -0,0 +1,7440 @@ +node1 label node2 +P493 property_type external-identifier +P494 property_type external-identifier +P495 property_type item +P496 property_type external-identifier +P497 property_type external-identifier +P498 property_type external-identifier +P500 property_type item +P501 property_type item +P502 property_type string +P503 property_type external-identifier +P504 property_type item +P505 property_type item +P506 property_type external-identifier +P507 property_type external-identifier +P508 property_type external-identifier +P509 property_type item +P511 property_type item +P512 property_type item +P514 property_type item +P515 property_type item +P516 property_type item +P517 property_type item +P518 property_type item +P520 property_type item +P521 property_type item +P522 property_type item +P523 property_type item +P524 property_type item +P525 property_type external-identifier +P527 property_type item +P528 property_type string +P529 property_type string +P530 property_type item +P531 property_type item +P532 property_type item +P533 property_type item +P534 property_type item +P535 property_type external-identifier +P536 property_type external-identifier +P537 property_type item +P538 property_type item +P539 property_type external-identifier +P541 property_type item +P542 property_type item +P543 property_type item +P545 property_type item +P546 property_type item +P547 property_type item +P548 property_type item +P549 property_type external-identifier +P550 property_type item +P551 property_type item +P552 property_type item +P553 property_type item +P554 property_type string +P555 property_type string +P556 property_type item +P557 property_type external-identifier +P559 property_type item +P560 property_type item +P561 property_type string +P562 property_type item +P563 property_type external-identifier +P564 property_type string +P565 property_type item +P566 property_type item +P567 property_type item +P568 property_type item +P569 property_type time +P570 property_type time +P571 property_type time +P574 property_type time +P575 property_type time +P576 property_type time +P577 property_type time +P578 property_type time +P579 property_type item +P580 property_type time +P582 property_type time +P585 property_type time +P586 property_type external-identifier +P587 property_type external-identifier +P588 property_type item +P589 property_type item +P590 property_type external-identifier +P591 property_type string +P592 property_type external-identifier +P593 property_type string +P594 property_type external-identifier +P595 property_type external-identifier +P597 property_type external-identifier +P598 property_type item +P599 property_type external-identifier +P600 property_type external-identifier +P604 property_type external-identifier +P605 property_type external-identifier +P606 property_type time +P607 property_type item +P608 property_type item +P609 property_type item +P610 property_type item +P611 property_type item +P612 property_type item +P613 property_type string +P617 property_type string +P618 property_type item +P619 property_type time +P620 property_type time +P621 property_type time +P622 property_type time +P624 property_type item +P625 property_type globe-coordinate +P626 property_type globe-coordinate +P627 property_type string +P628 
property_type external-identifier +P629 property_type item +P630 property_type external-identifier +P631 property_type item +P632 property_type external-identifier +P633 property_type external-identifier +P634 property_type item +P635 property_type external-identifier +P636 property_type item +P637 property_type external-identifier +P638 property_type external-identifier +P639 property_type external-identifier +P640 property_type external-identifier +P641 property_type item +P642 property_type item +P644 property_type string +P645 property_type string +P646 property_type external-identifier +P647 property_type item +P648 property_type external-identifier +P649 property_type external-identifier +P650 property_type external-identifier +P651 property_type external-identifier +P652 property_type external-identifier +P653 property_type external-identifier +P654 property_type item +P655 property_type item +P656 property_type string +P657 property_type external-identifier +P658 property_type item +P659 property_type item +P660 property_type item +P661 property_type external-identifier +P662 property_type external-identifier +P663 property_type external-identifier +P664 property_type item +P665 property_type external-identifier +P667 property_type string +P668 property_type external-identifier +P669 property_type item +P670 property_type string +P671 property_type external-identifier +P672 property_type external-identifier +P673 property_type external-identifier +P674 property_type item +P675 property_type external-identifier +P676 property_type item +P677 property_type external-identifier +P678 property_type item +P679 property_type external-identifier +P680 property_type item +P681 property_type item +P682 property_type item +P683 property_type external-identifier +P684 property_type item +P685 property_type external-identifier +P686 property_type external-identifier +P687 property_type external-identifier +P688 property_type item +P689 property_type item +P690 property_type item +P691 property_type external-identifier +P692 property_type string +P693 property_type item +P694 property_type item +P695 property_type external-identifier +P696 property_type external-identifier +P697 property_type item +P698 property_type external-identifier +P699 property_type external-identifier +P700 property_type external-identifier +P701 property_type external-identifier +P702 property_type item +P703 property_type item +P704 property_type external-identifier +P705 property_type external-identifier +P706 property_type item +P707 property_type item +P708 property_type item +P709 property_type external-identifier +P710 property_type item +P711 property_type string +P712 property_type string +P713 property_type string +P714 property_type external-identifier +P715 property_type external-identifier +P360 property_type item +P361 property_type item +P364 property_type item +P366 property_type item +P367 property_type string +P368 property_type string +P369 property_type item +P370 property_type string +P371 property_type item +P373 property_type string +P374 property_type external-identifier +P375 property_type item +P376 property_type item +P377 property_type external-identifier +P380 property_type external-identifier +P381 property_type external-identifier +P382 property_type external-identifier +P393 property_type string +P395 property_type string +P396 property_type external-identifier +P397 property_type item +P398 property_type item +P399 property_type item +P400 property_type item +P402 property_type 
external-identifier +P403 property_type item +P404 property_type item +P405 property_type item +P406 property_type item +P407 property_type item +P408 property_type item +P409 property_type external-identifier +P410 property_type item +P411 property_type item +P412 property_type item +P413 property_type item +P414 property_type item +P415 property_type item +P416 property_type string +P417 property_type item +P418 property_type item +P421 property_type item +P423 property_type item +P424 property_type string +P425 property_type item +P426 property_type string +P427 property_type item +P428 property_type external-identifier +P429 property_type external-identifier +P432 property_type external-identifier +P433 property_type string +P434 property_type external-identifier +P435 property_type external-identifier +P436 property_type external-identifier +P437 property_type item +P439 property_type external-identifier +P440 property_type external-identifier +P442 property_type external-identifier +P443 property_type string +P444 property_type string +P447 property_type item +P449 property_type item +P450 property_type item +P451 property_type item +P452 property_type item +P453 property_type item +P454 property_type external-identifier +P455 property_type external-identifier +P457 property_type item +P458 property_type external-identifier +P459 property_type item +P460 property_type item +P461 property_type item +P462 property_type item +P463 property_type item +P464 property_type external-identifier +P465 property_type string +P466 property_type item +P467 property_type item +P468 property_type item +P469 property_type item +P470 property_type item +P473 property_type string +P474 property_type string +P476 property_type external-identifier +P477 property_type external-identifier +P478 property_type string +P479 property_type item +P480 property_type external-identifier +P481 property_type external-identifier +P483 property_type item +P484 property_type external-identifier +P485 property_type item +P486 property_type external-identifier +P487 property_type string +P488 property_type item +P489 property_type item +P490 property_type string +P491 property_type string +P492 property_type external-identifier +P179 property_type item +P180 property_type item +P181 property_type string +P183 property_type item +P184 property_type item +P185 property_type item +P186 property_type item +P189 property_type item +P190 property_type item +P193 property_type item +P194 property_type item +P195 property_type item +P196 property_type item +P197 property_type item +P199 property_type item +P200 property_type item +P201 property_type item +P205 property_type item +P206 property_type item +P207 property_type string +P208 property_type item +P209 property_type item +P210 property_type item +P212 property_type external-identifier +P213 property_type external-identifier +P214 property_type external-identifier +P215 property_type string +P217 property_type string +P218 property_type external-identifier +P219 property_type external-identifier +P220 property_type external-identifier +P221 property_type external-identifier +P223 property_type string +P225 property_type string +P227 property_type external-identifier +P229 property_type string +P230 property_type string +P231 property_type external-identifier +P232 property_type external-identifier +P233 property_type string +P234 property_type external-identifier +P235 property_type external-identifier +P236 property_type external-identifier +P237 property_type item +P238 
property_type string +P239 property_type string +P240 property_type string +P241 property_type item +P242 property_type string +P243 property_type external-identifier +P244 property_type external-identifier +P245 property_type external-identifier +P246 property_type string +P247 property_type external-identifier +P248 property_type item +P249 property_type string +P263 property_type item +P264 property_type item +P267 property_type external-identifier +P268 property_type external-identifier +P269 property_type external-identifier +P270 property_type external-identifier +P271 property_type external-identifier +P272 property_type item +P274 property_type string +P275 property_type item +P276 property_type item +P277 property_type item +P278 property_type external-identifier +P279 property_type item +P281 property_type string +P282 property_type item +P286 property_type item +P287 property_type item +P289 property_type item +P291 property_type item +P296 property_type string +P297 property_type external-identifier +P298 property_type external-identifier +P299 property_type external-identifier +P300 property_type external-identifier +P301 property_type item +P303 property_type external-identifier +P304 property_type string +P305 property_type external-identifier +P306 property_type item +P344 property_type item +P345 property_type external-identifier +P347 property_type external-identifier +P348 property_type string +P349 property_type external-identifier +P350 property_type external-identifier +P351 property_type external-identifier +P352 property_type external-identifier +P353 property_type external-identifier +P354 property_type external-identifier +P355 property_type item +P356 property_type external-identifier +P358 property_type item +P359 property_type external-identifier +P6 property_type item +P10 property_type string +P14 property_type string +P15 property_type string +P16 property_type item +P17 property_type item +P18 property_type string +P19 property_type item +P20 property_type item +P21 property_type item +P22 property_type item +P25 property_type item +P26 property_type item +P27 property_type item +P30 property_type item +P31 property_type item +P35 property_type item +P36 property_type item +P37 property_type item +P38 property_type item +P39 property_type item +P40 property_type item +P41 property_type string +P47 property_type item +P50 property_type item +P51 property_type string +P53 property_type item +P54 property_type item +P57 property_type item +P58 property_type item +P59 property_type item +P61 property_type item +P65 property_type item +P66 property_type item +P69 property_type item +P78 property_type item +P81 property_type item +P84 property_type item +P85 property_type item +P86 property_type item +P87 property_type item +P88 property_type item +P91 property_type item +P92 property_type item +P94 property_type string +P97 property_type item +P98 property_type item +P101 property_type item +P102 property_type item +P103 property_type item +P105 property_type item +P106 property_type item +P108 property_type item +P109 property_type string +P110 property_type item +P111 property_type item +P112 property_type item +P113 property_type item +P114 property_type item +P115 property_type item +P117 property_type string +P118 property_type item +P119 property_type item +P121 property_type item +P122 property_type item +P123 property_type item +P126 property_type item +P127 property_type item +P128 property_type item +P129 property_type item +P131 property_type item 
+P135 property_type item +P136 property_type item +P137 property_type item +P138 property_type item +P140 property_type item +P141 property_type item +P143 property_type item +P144 property_type item +P149 property_type item +P150 property_type item +P154 property_type string +P155 property_type item +P156 property_type item +P157 property_type item +P158 property_type string +P159 property_type item +P161 property_type item +P162 property_type item +P163 property_type item +P166 property_type item +P167 property_type item +P169 property_type item +P170 property_type item +P171 property_type item +P172 property_type item +P175 property_type item +P176 property_type item +P177 property_type item +P178 property_type item +P716 property_type external-identifier +P717 property_type external-identifier +P718 property_type external-identifier +P720 property_type item +P721 property_type external-identifier +P722 property_type external-identifier +P723 property_type external-identifier +P724 property_type external-identifier +P725 property_type item +P726 property_type item +P729 property_type time +P730 property_type time +P731 property_type external-identifier +P732 property_type external-identifier +P733 property_type external-identifier +P734 property_type item +P735 property_type item +P736 property_type item +P737 property_type item +P739 property_type item +P740 property_type item +P741 property_type item +P742 property_type string +P744 property_type item +P745 property_type external-identifier +P746 property_type time +P747 property_type item +P748 property_type item +P749 property_type item +P750 property_type item +P751 property_type item +P756 property_type item +P757 property_type external-identifier +P758 property_type external-identifier +P759 property_type external-identifier +P760 property_type external-identifier +P761 property_type external-identifier +P762 property_type external-identifier +P763 property_type external-identifier +P764 property_type external-identifier +P765 property_type item +P767 property_type item +P768 property_type item +P769 property_type item +P770 property_type item +P771 property_type external-identifier +P772 property_type external-identifier +P773 property_type external-identifier +P774 property_type external-identifier +P775 property_type external-identifier +P776 property_type external-identifier +P777 property_type external-identifier +P778 property_type external-identifier +P779 property_type external-identifier +P780 property_type item +P781 property_type external-identifier +P782 property_type external-identifier +P783 property_type item +P784 property_type item +P785 property_type item +P786 property_type item +P787 property_type item +P788 property_type item +P789 property_type item +P790 property_type item +P791 property_type string +P792 property_type string +P793 property_type item +P795 property_type item +P797 property_type item +P798 property_type string +P799 property_type string +P800 property_type item +P802 property_type item +P803 property_type item +P804 property_type external-identifier +P805 property_type item +P806 property_type external-identifier +P807 property_type item +P808 property_type external-identifier +P809 property_type external-identifier +P811 property_type item +P812 property_type item +P813 property_type time +P814 property_type item +P815 property_type external-identifier +P816 property_type item +P817 property_type item +P818 property_type external-identifier +P819 property_type external-identifier +P820 
property_type string +P821 property_type external-identifier +P822 property_type item +P823 property_type item +P824 property_type external-identifier +P825 property_type item +P826 property_type item +P827 property_type external-identifier +P828 property_type item +P829 property_type external-identifier +P2131 property_type quantity +P2132 property_type quantity +P2133 property_type quantity +P2134 property_type quantity +P2135 property_type quantity +P2136 property_type quantity +P2137 property_type quantity +P2138 property_type quantity +P2139 property_type quantity +P2140 property_type quantity +P2141 property_type quantity +P2142 property_type quantity +P2143 property_type quantity +P2144 property_type quantity +P2145 property_type quantity +P2146 property_type quantity +P2147 property_type quantity +P2148 property_type quantity +P2149 property_type quantity +P2150 property_type quantity +P2151 property_type quantity +P2152 property_type item +P2153 property_type external-identifier +P2154 property_type quantity +P2155 property_type item +P2156 property_type item +P2158 property_type external-identifier +P2159 property_type item +P2160 property_type quantity +P2161 property_type external-identifier +P2162 property_type external-identifier +P2163 property_type external-identifier +P2164 property_type external-identifier +P2165 property_type external-identifier +P2166 property_type external-identifier +P2167 property_type external-identifier +P2168 property_type external-identifier +P2169 property_type external-identifier +P2170 property_type external-identifier +P2171 property_type external-identifier +P2172 property_type external-identifier +P2173 property_type external-identifier +P2174 property_type external-identifier +P2175 property_type item +P2176 property_type item +P2177 property_type quantity +P2178 property_type item +P2179 property_type string +P2180 property_type external-identifier +P2181 property_type external-identifier +P2182 property_type external-identifier +P2183 property_type string +P2184 property_type item +P2185 property_type external-identifier +P2186 property_type external-identifier +P2187 property_type external-identifier +P2188 property_type external-identifier +P2189 property_type external-identifier +P2190 property_type external-identifier +P2191 property_type external-identifier +P2192 property_type external-identifier +P2193 property_type external-identifier +P2194 property_type external-identifier +P2195 property_type external-identifier +P2196 property_type quantity +P2197 property_type quantity +P2198 property_type quantity +P2199 property_type quantity +P2200 property_type quantity +P2201 property_type quantity +P2202 property_type quantity +P2203 property_type quantity +P2204 property_type quantity +P2205 property_type external-identifier +P2206 property_type external-identifier +P2207 property_type external-identifier +P2208 property_type quantity +P2209 property_type external-identifier +P2210 property_type item +P2211 property_type quantity +P2212 property_type quantity +P2213 property_type quantity +P2214 property_type quantity +P2215 property_type quantity +P2216 property_type quantity +P2217 property_type quantity +P2218 property_type quantity +P2219 property_type quantity +P2220 property_type quantity +P2221 property_type quantity +P2222 property_type quantity +P2223 property_type quantity +P2225 property_type quantity +P2226 property_type quantity +P2227 property_type quantity +P2228 property_type quantity +P2229 property_type quantity 
+P2230 property_type quantity +P2231 property_type quantity +P2232 property_type quantity +P1913 property_type item +P1914 property_type item +P1915 property_type item +P1916 property_type item +P1917 property_type item +P1918 property_type item +P1919 property_type external-identifier +P1920 property_type external-identifier +P1921 property_type string +P1922 property_type monolingualtext +P1923 property_type item +P1924 property_type item +P1925 property_type external-identifier +P1928 property_type external-identifier +P1929 property_type external-identifier +P1930 property_type external-identifier +P1931 property_type string +P1932 property_type string +P1933 property_type external-identifier +P1934 property_type external-identifier +P1935 property_type external-identifier +P1936 property_type external-identifier +P1937 property_type external-identifier +P1938 property_type external-identifier +P1939 property_type external-identifier +P1940 property_type external-identifier +P1942 property_type string +P1943 property_type string +P1944 property_type string +P1945 property_type string +P1947 property_type external-identifier +P1948 property_type external-identifier +P1949 property_type external-identifier +P1950 property_type item +P1951 property_type item +P1952 property_type external-identifier +P1953 property_type external-identifier +P1954 property_type external-identifier +P1955 property_type external-identifier +P1956 property_type item +P1957 property_type url +P1958 property_type external-identifier +P1959 property_type external-identifier +P1960 property_type external-identifier +P1961 property_type external-identifier +P1963 property_type string +P1966 property_type external-identifier +P1967 property_type external-identifier +P1968 property_type external-identifier +P1969 property_type external-identifier +P1970 property_type external-identifier +P1971 property_type quantity +P1972 property_type external-identifier +P1973 property_type external-identifier +P1976 property_type external-identifier +P1977 property_type external-identifier +P1978 property_type external-identifier +P1979 property_type external-identifier +P1980 property_type external-identifier +P1981 property_type item +P1982 property_type external-identifier +P1983 property_type external-identifier +P1984 property_type external-identifier +P1985 property_type external-identifier +P1986 property_type external-identifier +P1987 property_type string +P1988 property_type external-identifier +P1989 property_type external-identifier +P1990 property_type item +P1991 property_type url +P1992 property_type external-identifier +P1993 property_type string +P1994 property_type external-identifier +P1995 property_type item +P1996 property_type external-identifier +P1997 property_type external-identifier +P1998 property_type string +P1999 property_type item +P2000 property_type external-identifier +P2001 property_type string +P2002 property_type external-identifier +P2003 property_type external-identifier +P2004 property_type external-identifier +P2005 property_type external-identifier +P2006 property_type external-identifier +P2007 property_type external-identifier +P2008 property_type external-identifier +P2009 property_type string +P2010 property_type string +P2011 property_type external-identifier +P2012 property_type item +P2013 property_type external-identifier +P2014 property_type external-identifier +P2015 property_type external-identifier +P2016 property_type external-identifier +P2017 property_type string +P2018 
property_type external-identifier +P2019 property_type external-identifier +P2020 property_type external-identifier +P2021 property_type quantity +P2777 property_type external-identifier +P2778 property_type external-identifier +P2779 property_type external-identifier +P2780 property_type external-identifier +P2781 property_type quantity +P2782 property_type external-identifier +P2783 property_type external-identifier +P2784 property_type item +P2786 property_type globe-coordinate +P2787 property_type quantity +P2788 property_type external-identifier +P2789 property_type item +P2790 property_type quantity +P2791 property_type quantity +P2792 property_type external-identifier +P2793 property_type quantity +P2794 property_type external-identifier +P2795 property_type monolingualtext +P2796 property_type external-identifier +P2797 property_type quantity +P2798 property_type external-identifier +P2799 property_type external-identifier +P2800 property_type external-identifier +P2801 property_type external-identifier +P2802 property_type string +P2803 property_type quantity +P2804 property_type external-identifier +P2805 property_type external-identifier +P2806 property_type quantity +P2807 property_type quantity +P2808 property_type quantity +P2809 property_type external-identifier +P2810 property_type external-identifier +P2811 property_type external-identifier +P2812 property_type external-identifier +P2813 property_type item +P2814 property_type external-identifier +P2815 property_type external-identifier +P2816 property_type external-identifier +P2817 property_type item +P2818 property_type external-identifier +P2819 property_type external-identifier +P2820 property_type item +P2821 property_type item +P2822 property_type item +P2823 property_type external-identifier +P2824 property_type external-identifier +P2825 property_type item +P2826 property_type external-identifier +P2827 property_type item +P2828 property_type item +P2829 property_type external-identifier +P2830 property_type external-identifier +P2831 property_type item +P2832 property_type external-identifier +P2833 property_type external-identifier +P2834 property_type quantity +P2835 property_type quantity +P2836 property_type quantity +P2838 property_type item +P2839 property_type item +P2840 property_type external-identifier +P2841 property_type item +P2842 property_type item +P2843 property_type external-identifier +P2844 property_type quantity +P2845 property_type external-identifier +P2846 property_type item +P2847 property_type external-identifier +P2848 property_type item +P2849 property_type item +P2850 property_type external-identifier +P2851 property_type item +P2852 property_type item +P2853 property_type item +P2854 property_type quantity +P2855 property_type quantity +P2856 property_type external-identifier +P2857 property_type external-identifier +P2858 property_type external-identifier +P2859 property_type string +P2860 property_type item +P2861 property_type external-identifier +P2862 property_type external-identifier +P2863 property_type external-identifier +P2864 property_type external-identifier +P2865 property_type external-identifier +P2866 property_type external-identifier +P2867 property_type external-identifier +P2868 property_type item +P2869 property_type item +P2870 property_type external-identifier +P2871 property_type external-identifier +P2872 property_type item +P2873 property_type quantity +P2874 property_type external-identifier +P2875 property_type item +P2876 property_type item +P2877 
property_type external-identifier +P2878 property_type external-identifier +P2879 property_type external-identifier +P2880 property_type external-identifier +P2881 property_type item +P2882 property_type item +P2883 property_type external-identifier +P2884 property_type quantity +P2886 property_type external-identifier +P2887 property_type external-identifier +P2888 property_type url +P2889 property_type external-identifier +P2892 property_type external-identifier +P2893 property_type string +P2894 property_type item +P2895 property_type quantity +P2896 property_type quantity +P2897 property_type external-identifier +P2898 property_type external-identifier +P2899 property_type quantity +P2900 property_type string +P2903 property_type external-identifier +P2904 property_type external-identifier +P2905 property_type external-identifier +P2907 property_type quantity +P2908 property_type external-identifier +P2909 property_type external-identifier +P2910 property_type string +P2911 property_type quantity +P2912 property_type item +P2913 property_type time +P2914 property_type external-identifier +P2915 property_type external-identifier +P2916 property_type monolingualtext +P2917 property_type external-identifier +P2918 property_type string +P2919 property_type string +P2922 property_type item +P2923 property_type quantity +P2924 property_type external-identifier +P2925 property_type item +P2926 property_type external-identifier +P2927 property_type quantity +P2928 property_type quantity +P2929 property_type quantity +P2930 property_type external-identifier +P2931 property_type external-identifier +P2935 property_type item +P2936 property_type item +P2937 property_type item +P2938 property_type external-identifier +P2939 property_type external-identifier +P2940 property_type external-identifier +P2941 property_type external-identifier +P2942 property_type external-identifier +P2943 property_type external-identifier +P2944 property_type external-identifier +P2945 property_type external-identifier +P2946 property_type external-identifier +P2948 property_type external-identifier +P2949 property_type external-identifier +P2950 property_type external-identifier +P2951 property_type external-identifier +P2952 property_type external-identifier +P2953 property_type external-identifier +P2954 property_type external-identifier +P2955 property_type quantity +P2956 property_type external-identifier +P2957 property_type quantity +P2959 property_type item +P2960 property_type time +P2961 property_type external-identifier +P2962 property_type item +P2963 property_type external-identifier +P2964 property_type item +P2965 property_type external-identifier +P2966 property_type external-identifier +P2967 property_type external-identifier +P2968 property_type external-identifier +P2969 property_type external-identifier +P2970 property_type external-identifier +P2971 property_type external-identifier +P2972 property_type external-identifier +P2973 property_type external-identifier +P2974 property_type item +P2975 property_type item +P2976 property_type item +P2977 property_type external-identifier +P2978 property_type item +P2979 property_type string +P2980 property_type external-identifier +P2981 property_type external-identifier +P2982 property_type external-identifier +P2983 property_type external-identifier +P2984 property_type external-identifier +P2985 property_type external-identifier +P2986 property_type external-identifier +P2987 property_type external-identifier +P2988 property_type external-identifier 
+P2989 property_type item +P2990 property_type external-identifier +P2991 property_type external-identifier +P2233 property_type quantity +P2234 property_type quantity +P2235 property_type url +P2236 property_type url +P2238 property_type item +P2239 property_type item +P2240 property_type quantity +P2241 property_type item +P2242 property_type external-identifier +P2243 property_type quantity +P2244 property_type quantity +P2248 property_type quantity +P2249 property_type external-identifier +P2250 property_type quantity +P2252 property_type external-identifier +P2253 property_type external-identifier +P2254 property_type quantity +P2255 property_type external-identifier +P2257 property_type quantity +P2258 property_type string +P2259 property_type string +P2260 property_type quantity +P2261 property_type quantity +P2262 property_type quantity +P2263 property_type string +P2264 property_type external-identifier +P2266 property_type external-identifier +P2267 property_type external-identifier +P2268 property_type external-identifier +P2270 property_type external-identifier +P2271 property_type string +P2272 property_type external-identifier +P2273 property_type external-identifier +P2275 property_type monolingualtext +P2276 property_type external-identifier +P2277 property_type external-identifier +P2278 property_type external-identifier +P2279 property_type item +P2280 property_type external-identifier +P2281 property_type external-identifier +P2282 property_type external-identifier +P2283 property_type item +P2284 property_type quantity +P2285 property_type time +P2286 property_type item +P2287 property_type external-identifier +P2288 property_type item +P2289 property_type item +P2290 property_type external-identifier +P2291 property_type item +P2292 property_type quantity +P2293 property_type item +P2294 property_type quantity +P2295 property_type quantity +P2296 property_type quantity +P2297 property_type quantity +P2298 property_type external-identifier +P2299 property_type quantity +P2300 property_type quantity +P2302 property_type item +P2303 property_type item +P2304 property_type string +P2305 property_type item +P2306 property_type string +P2307 property_type string +P2308 property_type item +P2309 property_type item +P2310 property_type time +P2311 property_type time +P2312 property_type quantity +P2313 property_type quantity +P2315 property_type monolingualtext +P2316 property_type item +P2317 property_type string +P2318 property_type item +P2319 property_type item +P2320 property_type quantity +P2321 property_type item +P2322 property_type string +P2323 property_type external-identifier +P2324 property_type quantity +P2325 property_type quantity +P2326 property_type external-identifier +P2327 property_type external-identifier +P2328 property_type external-identifier +P2329 property_type item +P2330 property_type external-identifier +P2331 property_type external-identifier +P2332 property_type external-identifier +P2333 property_type external-identifier +P2334 property_type external-identifier +P2335 property_type external-identifier +P2336 property_type external-identifier +P2337 property_type external-identifier +P2338 property_type external-identifier +P2339 property_type external-identifier +P2340 property_type external-identifier +P2341 property_type item +P2342 property_type external-identifier +P2343 property_type string +P2024 property_type external-identifier +P2025 property_type external-identifier +P2026 property_type external-identifier +P2027 property_type 
external-identifier +P2028 property_type external-identifier +P2029 property_type external-identifier +P2030 property_type external-identifier +P2031 property_type time +P2032 property_type time +P2033 property_type item +P2034 property_type external-identifier +P2036 property_type external-identifier +P2037 property_type external-identifier +P2038 property_type external-identifier +P2040 property_type external-identifier +P2041 property_type external-identifier +P2042 property_type external-identifier +P2043 property_type quantity +P2044 property_type quantity +P2045 property_type quantity +P2046 property_type quantity +P2047 property_type quantity +P2048 property_type quantity +P2049 property_type quantity +P2050 property_type quantity +P2051 property_type quantity +P2052 property_type quantity +P2053 property_type quantity +P2054 property_type quantity +P2055 property_type quantity +P2056 property_type quantity +P2057 property_type external-identifier +P2058 property_type item +P2060 property_type quantity +P2061 property_type item +P2062 property_type external-identifier +P2063 property_type external-identifier +P2064 property_type external-identifier +P2065 property_type external-identifier +P2066 property_type quantity +P2067 property_type quantity +P2068 property_type quantity +P2069 property_type quantity +P2070 property_type external-identifier +P2071 property_type external-identifier +P2072 property_type external-identifier +P2073 property_type quantity +P2074 property_type external-identifier +P2075 property_type quantity +P2076 property_type quantity +P2077 property_type quantity +P2078 property_type url +P2079 property_type item +P2080 property_type external-identifier +P2081 property_type external-identifier +P2082 property_type external-identifier +P2083 property_type external-identifier +P2084 property_type external-identifier +P2085 property_type external-identifier +P2086 property_type external-identifier +P2087 property_type external-identifier +P2088 property_type external-identifier +P2089 property_type external-identifier +P2090 property_type external-identifier +P2091 property_type external-identifier +P2092 property_type external-identifier +P2093 property_type string +P2094 property_type item +P2095 property_type item +P2096 property_type monolingualtext +P2097 property_type quantity +P2098 property_type item +P2099 property_type external-identifier +P2100 property_type external-identifier +P2101 property_type quantity +P2102 property_type quantity +P2103 property_type quantity +P2105 property_type quantity +P2106 property_type external-identifier +P2107 property_type quantity +P2108 property_type external-identifier +P2109 property_type quantity +P2112 property_type quantity +P2113 property_type quantity +P2114 property_type quantity +P2115 property_type external-identifier +P2116 property_type quantity +P2117 property_type quantity +P2118 property_type quantity +P2119 property_type quantity +P2120 property_type quantity +P2121 property_type quantity +P2123 property_type external-identifier +P2124 property_type quantity +P2125 property_type string +P2126 property_type string +P2127 property_type item +P2128 property_type quantity +P2129 property_type quantity +P2130 property_type quantity +P2344 property_type external-identifier +P2345 property_type external-identifier +P2346 property_type external-identifier +P2347 property_type external-identifier +P2348 property_type item +P2349 property_type external-identifier +P2350 property_type external-identifier +P2351 
+P2352 property_type item
+P2353 property_type item
+P2354 property_type item
+P2355 property_type external-identifier
+P2357 property_type string
+P2358 property_type item
+P2359 property_type item
+P2360 property_type item
+P2361 property_type item
+P2362 property_type quantity
+P2363 property_type item
+P2364 property_type string
+P2365 property_type item
+P2366 property_type item
+P2367 property_type external-identifier
+P2368 property_type string
+P2369 property_type external-identifier
+P2370 property_type quantity
+P2371 property_type item
+P2372 property_type external-identifier
+P2373 property_type external-identifier
+P2374 property_type quantity
+P2375 property_type item
+P2376 property_type item
+P2377 property_type item
+P2378 property_type item
+P2379 property_type item
+P2380 property_type external-identifier
+P2381 property_type external-identifier
+P2382 property_type external-identifier
+P2383 property_type external-identifier
+P2384 property_type item
+P2385 property_type external-identifier
+P2386 property_type quantity
+P2387 property_type external-identifier
+P2388 property_type item
+P2389 property_type item
+P2390 property_type external-identifier
+P2391 property_type external-identifier
+P2392 property_type item
+P2393 property_type external-identifier
+P2394 property_type external-identifier
+P2396 property_type item
+P2397 property_type external-identifier
+P2398 property_type external-identifier
+P2399 property_type external-identifier
+P2400 property_type external-identifier
+P2401 property_type external-identifier
+P2402 property_type quantity
+P2403 property_type quantity
+P2404 property_type quantity
+P2405 property_type quantity
+P2406 property_type quantity
+P2407 property_type quantity
+P2408 property_type item
+P2409 property_type external-identifier
+P2410 property_type string
+P2411 property_type string
+P2412 property_type external-identifier
+P2413 property_type external-identifier
+P2414 property_type item
+P2415 property_type quantity
+P2416 property_type item
+P2417 property_type item
+P2418 property_type external-identifier
+P2421 property_type external-identifier
+P2423 property_type external-identifier
+P2424 property_type external-identifier
+P2425 property_type string
+P2426 property_type external-identifier
+P2427 property_type external-identifier
+P2428 property_type external-identifier
+P2429 property_type item
+P2430 property_type quantity
+P2431 property_type external-identifier
+P2432 property_type external-identifier
+P2433 property_type item
+P2434 property_type external-identifier
+P2435 property_type external-identifier
+P2436 property_type quantity
+P2437 property_type quantity
+P2438 property_type item
+P2440 property_type string
+P2441 property_type monolingualtext
+P2442 property_type quantity
+P2443 property_type item
+P2444 property_type item
+P2445 property_type item
+P2446 property_type external-identifier
+P2447 property_type external-identifier
+P2448 property_type external-identifier
+P2449 property_type external-identifier
+P2992 property_type item
+P2993 property_type quantity
+P2997 property_type quantity
+P2998 property_type quantity
+P2999 property_type quantity
+P3000 property_type quantity
+P3001 property_type quantity
+P3002 property_type external-identifier
+P3003 property_type external-identifier
+P3004 property_type external-identifier
+P3005 property_type item
+P3006 property_type external-identifier
+P3007 property_type external-identifier
+P3008 property_type external-identifier
+P3009 property_type external-identifier
+P3010 property_type external-identifier
+P3012 property_type external-identifier
+P3013 property_type quantity
+P3014 property_type item
+P3015 property_type item
+P3016 property_type external-identifier
+P3017 property_type external-identifier
+P3018 property_type item
+P3019 property_type item
+P3020 property_type quantity
+P3021 property_type external-identifier
+P3022 property_type item
+P3023 property_type external-identifier
+P3024 property_type external-identifier
+P3025 property_type item
+P3026 property_type item
+P3027 property_type item
+P3028 property_type item
+P3029 property_type external-identifier
+P3030 property_type string
+P3031 property_type external-identifier
+P3032 property_type item
+P3033 property_type item
+P3034 property_type external-identifier
+P3035 property_type external-identifier
+P3036 property_type quantity
+P3037 property_type item
+P3038 property_type external-identifier
+P3039 property_type quantity
+P3040 property_type external-identifier
+P3041 property_type quantity
+P3042 property_type external-identifier
+P3043 property_type external-identifier
+P3044 property_type external-identifier
+P3045 property_type external-identifier
+P3046 property_type external-identifier
+P3047 property_type external-identifier
+P3048 property_type external-identifier
+P3049 property_type external-identifier
+P3050 property_type external-identifier
+P3051 property_type external-identifier
+P3052 property_type external-identifier
+P3053 property_type external-identifier
+P3054 property_type external-identifier
+P3055 property_type external-identifier
+P3056 property_type external-identifier
+P3057 property_type external-identifier
+P3058 property_type external-identifier
+P3059 property_type external-identifier
+P3060 property_type external-identifier
+P3061 property_type external-identifier
+P3063 property_type quantity
+P3064 property_type external-identifier
+P3065 property_type external-identifier
+P3066 property_type external-identifier
+P3067 property_type string
+P3068 property_type external-identifier
+P3069 property_type external-identifier
+P3070 property_type quantity
+P3071 property_type quantity
+P3072 property_type external-identifier
+P3073 property_type external-identifier
+P3074 property_type external-identifier
+P3075 property_type item
+P3076 property_type external-identifier
+P3077 property_type external-identifier
+P3078 property_type quantity
+P3080 property_type item
+P3081 property_type item
+P3082 property_type item
+P3083 property_type external-identifier
+P3085 property_type item
+P3086 property_type quantity
+P3087 property_type quantity
+P3088 property_type external-identifier
+P3089 property_type external-identifier
+P3090 property_type string
+P3091 property_type item
+P3092 property_type item
+P3093 property_type item
+P3094 property_type item
+P3095 property_type item
+P3096 property_type item
+P3097 property_type external-identifier
+P3098 property_type external-identifier
+P830 property_type external-identifier
+P831 property_type item
+P832 property_type item
+P833 property_type item
+P834 property_type item
+P835 property_type string
+P836 property_type external-identifier
+P837 property_type item
+P838 property_type external-identifier
+P839 property_type external-identifier
+P840 property_type item
+P841 property_type item
+P842 property_type external-identifier
+P843 property_type external-identifier
+P844 property_type external-identifier
+P845 property_type external-identifier
+P846 property_type external-identifier
+P847 property_type string
+P849 property_type external-identifier
+P850 property_type external-identifier
+P852 property_type item
+P853 property_type item
+P854 property_type url
+P855 property_type url
+P856 property_type url
+P858 property_type external-identifier
+P859 property_type item
+P860 property_type external-identifier
+P861 property_type external-identifier
+P862 property_type external-identifier
+P863 property_type external-identifier
+P864 property_type external-identifier
+P865 property_type external-identifier
+P866 property_type external-identifier
+P867 property_type external-identifier
+P868 property_type item
+P870 property_type item
+P872 property_type item
+P873 property_type item
+P874 property_type string
+P875 property_type string
+P876 property_type string
+P877 property_type string
+P878 property_type item
+P879 property_type string
+P880 property_type item
+P881 property_type item
+P882 property_type external-identifier
+P884 property_type external-identifier
+P885 property_type item
+P886 property_type external-identifier
+P887 property_type item
+P888 property_type external-identifier
+P889 property_type external-identifier
+P892 property_type external-identifier
+P893 property_type external-identifier
+P894 property_type external-identifier
+P897 property_type string
+P898 property_type string
+P901 property_type external-identifier
+P902 property_type external-identifier
+P905 property_type external-identifier
+P906 property_type external-identifier
+P908 property_type item
+P909 property_type external-identifier
+P910 property_type item
+P911 property_type external-identifier
+P912 property_type item
+P913 property_type item
+P914 property_type item
+P915 property_type item
+P916 property_type item
+P917 property_type external-identifier
+P918 property_type external-identifier
+P919 property_type external-identifier
+P920 property_type string
+P921 property_type item
+P922 property_type item
+P923 property_type item
+P924 property_type item
+P925 property_type item
+P926 property_type item
+P927 property_type item
+P928 property_type item
+P929 property_type item
+P930 property_type item
+P931 property_type item
+P932 property_type external-identifier
+P933 property_type external-identifier
+P935 property_type string
+P937 property_type item
+P938 property_type external-identifier
+P939 property_type external-identifier
+P941 property_type item
+P942 property_type item
+P943 property_type item
+P944 property_type item
+P945 property_type item
+P946 property_type string
+P947 property_type external-identifier
+P4031 property_type external-identifier
+P4032 property_type item
+P4033 property_type external-identifier
+P4034 property_type external-identifier
+P4035 property_type external-identifier
+P4036 property_type quantity
+P4037 property_type external-identifier
+P4038 property_type external-identifier
+P4040 property_type external-identifier
+P4041 property_type external-identifier
+P4042 property_type external-identifier
+P4043 property_type item
+P4044 property_type item
+P4045 property_type string
+P4046 property_type external-identifier
+P4047 property_type string
+P4048 property_type external-identifier
+P4050 property_type external-identifier
+P4051 property_type external-identifier
+P4052 property_type external-identifier
+P4053 property_type external-identifier
+P4054 property_type external-identifier
+P4055 property_type external-identifier
+P4056 property_type external-identifier
+P4057 property_type external-identifier
+P4058 property_type external-identifier
+P4059 property_type external-identifier
+P4060 property_type external-identifier
+P4061 property_type external-identifier
+P4062 property_type external-identifier
+P4063 property_type external-identifier
+P4065 property_type external-identifier
+P4066 property_type external-identifier
+P4067 property_type external-identifier
+P4068 property_type external-identifier
+P4069 property_type external-identifier
+P4070 property_type item
+P4071 property_type external-identifier
+P4072 property_type external-identifier
+P4073 property_type external-identifier
+P4074 property_type external-identifier
+P4075 property_type external-identifier
+P4076 property_type external-identifier
+P4077 property_type external-identifier
+P4078 property_type string
+P4079 property_type external-identifier
+P4080 property_type quantity
+P4081 property_type external-identifier
+P4082 property_type item
+P4083 property_type external-identifier
+P4084 property_type external-identifier
+P4085 property_type external-identifier
+P4086 property_type external-identifier
+P4087 property_type external-identifier
+P4088 property_type external-identifier
+P4089 property_type external-identifier
+P4090 property_type external-identifier
+P4091 property_type string
+P4092 property_type string
+P4093 property_type external-identifier
+P4094 property_type external-identifier
+P4095 property_type external-identifier
+P4096 property_type external-identifier
+P4097 property_type external-identifier
+P4098 property_type external-identifier
+P4099 property_type item
+P4100 property_type item
+P4101 property_type item
+P4102 property_type external-identifier
+P4103 property_type quantity
+P4104 property_type external-identifier
+P4105 property_type quantity
+P4106 property_type external-identifier
+P4107 property_type external-identifier
+P4108 property_type external-identifier
+P4109 property_type external-identifier
+P4110 property_type external-identifier
+P4111 property_type external-identifier
+P4112 property_type external-identifier
+P4113 property_type external-identifier
+P4114 property_type external-identifier
+P4115 property_type external-identifier
+P4116 property_type external-identifier
+P4117 property_type external-identifier
+P4118 property_type external-identifier
+P4119 property_type external-identifier
+P4120 property_type external-identifier
+P4121 property_type external-identifier
+P4122 property_type external-identifier
+P4123 property_type external-identifier
+P4124 property_type external-identifier
+P4125 property_type external-identifier
+P4126 property_type external-identifier
+P4127 property_type external-identifier
+P4128 property_type external-identifier
+P4129 property_type external-identifier
+P4130 property_type external-identifier
+P4131 property_type quantity
+P4132 property_type item
+P4133 property_type external-identifier
+P2450 property_type external-identifier
+P2451 property_type external-identifier
+P2452 property_type external-identifier
+P2453 property_type item
+P2454 property_type external-identifier
+P2455 property_type external-identifier
+P2456 property_type external-identifier
+P2457 property_type external-identifier
+P2458 property_type external-identifier
+P2459 property_type external-identifier
+P2460 property_type external-identifier
+P2461 property_type external-identifier
+P2462 property_type item
+P2463 property_type external-identifier
+P2464 property_type external-identifier
+P2465 property_type external-identifier
+P2467 property_type external-identifier
+P2468 property_type external-identifier
+P2469 property_type external-identifier
+P2470 property_type external-identifier
+P2471 property_type external-identifier
+P2472 property_type external-identifier
+P2473 property_type external-identifier
+P2474 property_type external-identifier
+P2475 property_type external-identifier
+P2476 property_type external-identifier
+P2477 property_type external-identifier
+P2478 property_type external-identifier
+P2479 property_type external-identifier
+P2480 property_type external-identifier
+P2481 property_type external-identifier
+P2482 property_type external-identifier
+P2483 property_type external-identifier
+P2484 property_type external-identifier
+P2485 property_type external-identifier
+P2486 property_type external-identifier
+P2487 property_type external-identifier
+P2488 property_type url
+P2489 property_type external-identifier
+P2490 property_type string
+P2491 property_type external-identifier
+P2492 property_type external-identifier
+P2493 property_type external-identifier
+P2494 property_type external-identifier
+P2496 property_type external-identifier
+P2497 property_type external-identifier
+P2498 property_type external-identifier
+P2499 property_type item
+P2500 property_type item
+P2501 property_type item
+P2502 property_type item
+P2503 property_type external-identifier
+P2504 property_type external-identifier
+P2505 property_type item
+P2506 property_type external-identifier
+P2507 property_type item
+P2508 property_type external-identifier
+P2509 property_type external-identifier
+P2510 property_type external-identifier
+P2511 property_type external-identifier
+P2512 property_type item
+P2513 property_type external-identifier
+P2514 property_type external-identifier
+P2515 property_type item
+P2516 property_type external-identifier
+P2517 property_type item
+P2518 property_type external-identifier
+P2519 property_type external-identifier
+P2520 property_type url
+P2521 property_type monolingualtext
+P2522 property_type item
+P2524 property_type external-identifier
+P2525 property_type external-identifier
+P2526 property_type external-identifier
+P2527 property_type quantity
+P2528 property_type quantity
+P2529 property_type external-identifier
+P2530 property_type external-identifier
+P2531 property_type external-identifier
+P2532 property_type quantity
+P2533 property_type external-identifier
+P2534 property_type string
+P2535 property_type string
+P2536 property_type external-identifier
+P2537 property_type external-identifier
+P2538 property_type external-identifier
+P2539 property_type external-identifier
+P2540 property_type string
+P2541 property_type item
+P2542 property_type quantity
+P2545 property_type item
+P2546 property_type item
+P2547 property_type quantity
+P2548 property_type item
+P2549 property_type external-identifier
+P2550 property_type item
+P2551 property_type item
+P2552 property_type string
+P2553 property_type item
+P2554 property_type item
+P4135 property_type quantity
+P4136 property_type external-identifier
+P4137 property_type quantity
+P4138 property_type external-identifier
+P4139 property_type external-identifier
+P4140 property_type quantity
+P4141 property_type external-identifier
+P4142 property_type external-identifier
+P4143 property_type external-identifier
+P4144 property_type external-identifier
+P4145 property_type external-identifier
+P4146 property_type external-identifier
+P4147 property_type item
+P4149 property_type item
+P4150 property_type string
+P4151 property_type item
+P4152 property_type string
+P4153 property_type quantity
+P4154 property_type external-identifier
+P4155 property_type string
+P4156 property_type external-identifier
+P4157 property_type external-identifier
+P4158 property_type external-identifier
+P4159 property_type external-identifier
+P4160 property_type external-identifier
+P4161 property_type external-identifier
+P4162 property_type external-identifier
+P4163 property_type quantity
+P4164 property_type external-identifier
+P4165 property_type external-identifier
+P4166 property_type external-identifier
+P4167 property_type external-identifier
+P4168 property_type external-identifier
+P4169 property_type external-identifier
+P4170 property_type external-identifier
+P4171 property_type external-identifier
+P4172 property_type external-identifier
+P4173 property_type external-identifier
+P4174 property_type external-identifier
+P4175 property_type external-identifier
+P4176 property_type quantity
+P4177 property_type external-identifier
+P4178 property_type external-identifier
+P4179 property_type string
+P4180 property_type external-identifier
+P4181 property_type external-identifier
+P4182 property_type external-identifier
+P4183 property_type quantity
+P4184 property_type quantity
+P4185 property_type item
+P4186 property_type external-identifier
+P4187 property_type string
+P4188 property_type string
+P4189 property_type string
+P4190 property_type external-identifier
+P4191 property_type external-identifier
+P4192 property_type external-identifier
+P4193 property_type external-identifier
+P4194 property_type external-identifier
+P4195 property_type item
+P4196 property_type string
+P4197 property_type external-identifier
+P4198 property_type external-identifier
+P4199 property_type external-identifier
+P4200 property_type external-identifier
+P4201 property_type external-identifier
+P4202 property_type item
+P4203 property_type external-identifier
+P4204 property_type external-identifier
+P4206 property_type external-identifier
+P4207 property_type external-identifier
+P4208 property_type external-identifier
+P4209 property_type external-identifier
+P4210 property_type external-identifier
+P4211 property_type external-identifier
+P4212 property_type external-identifier
+P4213 property_type string
+P4214 property_type quantity
+P4215 property_type external-identifier
+P4216 property_type external-identifier
+P4217 property_type external-identifier
+P4218 property_type quantity
+P4219 property_type external-identifier
+P4220 property_type item
+P4221 property_type external-identifier
+P4222 property_type external-identifier
+P4223 property_type external-identifier
+P4224 property_type item
+P4225 property_type string
+P4226 property_type external-identifier
+P4227 property_type external-identifier
+P4228 property_type external-identifier
+P4229 property_type external-identifier
+P4230 property_type external-identifier
+P4231 property_type external-identifier
+P4232 property_type external-identifier
+P4233 property_type external-identifier
+P4235 property_type external-identifier
+P4236 property_type external-identifier
+P4238 property_type url
+P2555 property_type quantity
+P2556 property_type quantity
+P2557 property_type quantity
+P2558 property_type external-identifier
+P2559 property_type monolingualtext
+P2560 property_type item
+P2561 property_type monolingualtext
+P2562 property_type monolingualtext
+P2563 property_type item
+P2564 property_type item
+P2565 property_type quantity
+P2566 property_type external-identifier
+P2567 property_type item
+P2568 property_type item
+P2571 property_type item
+P2572 property_type string
+P2573 property_type quantity
+P2574 property_type external-identifier
+P2575 property_type item
+P2576 property_type external-identifier
+P2577 property_type item
+P2578 property_type item
+P2579 property_type item
+P2580 property_type external-identifier
+P2581 property_type external-identifier
+P2582 property_type external-identifier
+P2583 property_type quantity
+P2584 property_type external-identifier
+P2585 property_type external-identifier
+P2586 property_type external-identifier
+P2587 property_type item
+P2588 property_type external-identifier
+P2589 property_type external-identifier
+P2590 property_type external-identifier
+P2591 property_type item
+P2592 property_type external-identifier
+P2593 property_type external-identifier
+P2595 property_type quantity
+P2596 property_type item
+P2597 property_type item
+P2598 property_type string
+P2599 property_type quantity
+P2600 property_type external-identifier
+P2601 property_type external-identifier
+P2602 property_type external-identifier
+P2603 property_type external-identifier
+P2604 property_type external-identifier
+P2605 property_type external-identifier
+P2606 property_type external-identifier
+P2607 property_type external-identifier
+P2610 property_type quantity
+P2611 property_type external-identifier
+P2612 property_type external-identifier
+P2613 property_type external-identifier
+P2614 property_type item
+P2618 property_type external-identifier
+P2619 property_type external-identifier
+P2620 property_type external-identifier
+P2621 property_type external-identifier
+P2622 property_type external-identifier
+P2623 property_type external-identifier
+P2624 property_type external-identifier
+P2625 property_type external-identifier
+P2626 property_type external-identifier
+P2627 property_type external-identifier
+P2628 property_type external-identifier
+P2629 property_type item
+P2630 property_type quantity
+P2631 property_type external-identifier
+P2632 property_type item
+P2633 property_type item
+P2634 property_type item
+P2635 property_type quantity
+P2636 property_type external-identifier
+P2637 property_type item
+P2638 property_type external-identifier
+P2639 property_type external-identifier
+P2640 property_type external-identifier
+P2641 property_type external-identifier
+P2642 property_type external-identifier
+P2643 property_type item
+P2645 property_type quantity
+P2646 property_type external-identifier
+P2647 property_type item
+P2648 property_type external-identifier
+P2649 property_type url
+P2650 property_type item
+P2651 property_type external-identifier
+P2652 property_type item
+P2655 property_type external-identifier
+P2657 property_type external-identifier
+P2658 property_type quantity
+P2659 property_type quantity
+P2660 property_type quantity
+P2661 property_type quantity
+P2662 property_type quantity
+P2663 property_type quantity
+P2664 property_type quantity
+P2665 property_type quantity
+P2666 property_type external-identifier
+P3099 property_type external-identifier
+P3100 property_type external-identifier
+P3101 property_type external-identifier
+P3102 property_type external-identifier
+P3103 property_type item
+P3104 property_type external-identifier
+P3105 property_type external-identifier
+P3106 property_type external-identifier
+P3107 property_type external-identifier
+P3108 property_type external-identifier
+P3109 property_type external-identifier
+P3110 property_type external-identifier
+P3111 property_type external-identifier
+P3112 property_type external-identifier
+P3113 property_type item
+P3114 property_type external-identifier
+P3115 property_type external-identifier
+P3116 property_type external-identifier
+P3117 property_type external-identifier
+P3118 property_type external-identifier
+P3119 property_type external-identifier
+P3120 property_type external-identifier
+P3121 property_type external-identifier
+P3122 property_type external-identifier
+P3123 property_type external-identifier
+P3124 property_type external-identifier
+P3125 property_type external-identifier
+P3126 property_type external-identifier
+P3127 property_type external-identifier
+P3128 property_type external-identifier
+P3129 property_type external-identifier
+P3130 property_type external-identifier
+P3131 property_type external-identifier
+P3132 property_type monolingualtext
+P3133 property_type external-identifier
+P3134 property_type external-identifier
+P3135 property_type external-identifier
+P3136 property_type external-identifier
+P3137 property_type item
+P3138 property_type external-identifier
+P3139 property_type external-identifier
+P3140 property_type external-identifier
+P3141 property_type external-identifier
+P3142 property_type external-identifier
+P3143 property_type external-identifier
+P3144 property_type external-identifier
+P3145 property_type external-identifier
+P3146 property_type external-identifier
+P3147 property_type external-identifier
+P3148 property_type item
+P3149 property_type item
+P3150 property_type item
+P3151 property_type external-identifier
+P3152 property_type external-identifier
+P3153 property_type external-identifier
+P3154 property_type external-identifier
+P3155 property_type external-identifier
+P3156 property_type item
+P3157 property_type quantity
+P3158 property_type item
+P3159 property_type external-identifier
+P3160 property_type external-identifier
+P3161 property_type item
+P3162 property_type external-identifier
+P3163 property_type external-identifier
+P3165 property_type external-identifier
+P3166 property_type external-identifier
+P3167 property_type external-identifier
+P3168 property_type external-identifier
+P3169 property_type external-identifier
+P3170 property_type external-identifier
+P3171 property_type external-identifier
+P3172 property_type external-identifier
+P3173 property_type item
+P3174 property_type item
+P3175 property_type external-identifier
+P3176 property_type string
+P3177 property_type external-identifier
+P3178 property_type external-identifier
+P3179 property_type item
+P3180 property_type external-identifier
+P3181 property_type external-identifier
+P3182 property_type external-identifier
+P3183 property_type external-identifier
+P3184 property_type external-identifier
+P3185 property_type external-identifier
+P3186 property_type external-identifier
+P3187 property_type external-identifier
+P3188 property_type external-identifier
+P3189 property_type item
+P3190 property_type item
+P3191 property_type external-identifier
+P3192 property_type external-identifier
+P3193 property_type external-identifier
+P3194 property_type external-identifier
+P3195 property_type item
+P3196 property_type external-identifier
+P3197 property_type external-identifier
+P3198 property_type external-identifier
+P3199 property_type external-identifier
+P948 property_type string
+P949 property_type external-identifier
+P950 property_type external-identifier
+P951 property_type external-identifier
+P952 property_type string
+P953 property_type url
+P954 property_type external-identifier
+P957 property_type external-identifier
+P958 property_type string
+P959 property_type external-identifier
+P960 property_type external-identifier
+P961 property_type external-identifier
+P962 property_type external-identifier
+P963 property_type url
+P964 property_type external-identifier
+P965 property_type string
+P966 property_type external-identifier
+P967 property_type item
+P968 property_type url
+P969 property_type string
+P970 property_type item
+P971 property_type item
+P972 property_type item
+P973 property_type url
+P974 property_type item
+P980 property_type external-identifier
+P981 property_type external-identifier
+P982 property_type external-identifier
+P984 property_type external-identifier
+P988 property_type external-identifier
+P989 property_type string
+P990 property_type string
+P991 property_type item
+P993 property_type string
+P994 property_type string
+P995 property_type string
+P996 property_type string
+P998 property_type external-identifier
+P999 property_type external-identifier
+P1000 property_type item
+P1001 property_type item
+P1002 property_type item
+P1003 property_type external-identifier
+P1004 property_type external-identifier
+P1005 property_type external-identifier
+P1006 property_type external-identifier
+P1007 property_type external-identifier
+P1010 property_type external-identifier
+P1011 property_type item
+P1012 property_type item
+P1013 property_type item
+P1014 property_type external-identifier
+P1015 property_type external-identifier
+P1016 property_type item
+P1017 property_type external-identifier
+P1018 property_type item
+P1019 property_type url
+P1021 property_type external-identifier
+P1022 property_type external-identifier
+P1023 property_type external-identifier
+P1024 property_type external-identifier
+P1025 property_type external-identifier
+P1026 property_type item
+P1027 property_type item
+P1028 property_type item
+P1029 property_type item
+P1030 property_type string
+P1031 property_type string
+P1032 property_type item
+P1033 property_type item
+P1034 property_type item
+P1035 property_type item
+P1036 property_type string
+P1037 property_type item
+P1038 property_type item
+P1039 property_type item
+P1040 property_type item
+P1041 property_type item
+P1042 property_type external-identifier
+P1043 property_type external-identifier
+P1044 property_type external-identifier
+P1045 property_type external-identifier
+P1046 property_type item
+P1047 property_type external-identifier
+P1048 property_type external-identifier
+P1049 property_type item
+P1050 property_type item
+P1051 property_type external-identifier
+P1052 property_type external-identifier
+P1053 property_type external-identifier
+P1054 property_type external-identifier
+P1055 property_type external-identifier
+P1056 property_type item
+P1057 property_type item
+P1058 property_type external-identifier
+P1059 property_type external-identifier
+P1060 property_type item
+P1064 property_type item
+P1065 property_type url
+P1066 property_type item
+P2667 property_type item
+P2668 property_type item
+P2669 property_type time
+P2670 property_type item
+P2671 property_type external-identifier
+P2672 property_type external-identifier
+P2673 property_type item
+P2674 property_type item
+P2675 property_type item
+P2676 property_type string
+P2677 property_type string
+P2678 property_type external-identifier
+P2679 property_type item
+P2680 property_type item
+P2681 property_type item
+P2682 property_type item
+P2683 property_type external-identifier
+P2684 property_type item
+P2685 property_type external-identifier
+P2686 property_type external-identifier
+P2687 property_type external-identifier
+P2688 property_type external-identifier
+P2689 property_type external-identifier
+P2694 property_type external-identifier
+P2695 property_type item
+P2696 property_type external-identifier
+P2697 property_type external-identifier
+P2698 property_type external-identifier
+P2699 property_type url
+P2700 property_type item
+P2701 property_type item
+P2702 property_type item
+P2703 property_type external-identifier
+P2704 property_type external-identifier
+P2705 property_type external-identifier
+P2708 property_type external-identifier
+P2709 property_type external-identifier
+P2710 property_type quantity
+P2712 property_type quantity
+P2713 property_type string
+P2715 property_type item
+P2716 property_type string
+P2717 property_type quantity
+P2718 property_type quantity
+P2719 property_type string
+P2720 property_type string
+P2721 property_type external-identifier
+P2722 property_type external-identifier
+P2723 property_type external-identifier
+P2724 property_type external-identifier
+P2725 property_type external-identifier
+P2726 property_type external-identifier
+P2727 property_type external-identifier
+P2728 property_type external-identifier
+P2729 property_type external-identifier
+P2730 property_type external-identifier
+P2732 property_type external-identifier
+P2733 property_type external-identifier
+P2734 property_type external-identifier
+P2735 property_type external-identifier
+P2736 property_type external-identifier
+P2737 property_type item
+P2738 property_type item
+P2739 property_type item
+P2740 property_type external-identifier
+P2741 property_type external-identifier
+P2742 property_type external-identifier
+P2743 property_type item
+P2744 property_type string
+P2745 property_type external-identifier
+P2746 property_type item
+P2747 property_type item
+P2748 property_type external-identifier
+P2749 property_type external-identifier
+P2750 property_type external-identifier
+P2751 property_type external-identifier
+P2752 property_type external-identifier
+P2753 property_type external-identifier
+P2754 property_type time
+P2755 property_type external-identifier
+P2756 property_type item
+P2758 property_type item
+P2759 property_type external-identifier
+P2760 property_type external-identifier
+P2761 property_type external-identifier
+P2762 property_type external-identifier
+P2763 property_type external-identifier
+P2764 property_type external-identifier
+P2765 property_type external-identifier
+P2766 property_type external-identifier
+P2767 property_type external-identifier
+P2768 property_type external-identifier
+P2769 property_type quantity
+P2770 property_type item
+P2771 property_type external-identifier
+P2772 property_type external-identifier
+P2773 property_type external-identifier
+P2774 property_type external-identifier
+P2775 property_type external-identifier
+P2776 property_type external-identifier
+P4851 property_type quantity
+P4852 property_type external-identifier
+P4853 property_type external-identifier
+P4854 property_type external-identifier
+P4855 property_type external-identifier
+P4856 property_type string
+P4857 property_type external-identifier
+P4858 property_type external-identifier
+P4859 property_type external-identifier
+P4860 property_type external-identifier
+P4861 property_type external-identifier
+P4862 property_type external-identifier
+P4863 property_type string
+P4864 property_type string
+P4866 property_type external-identifier
+P4867 property_type string
+P4868 property_type external-identifier
+P4869 property_type external-identifier
+P4870 property_type external-identifier
+P4871 property_type external-identifier
+P4872 property_type external-identifier
+P4873 property_type item
+P4875 property_type item
+P4876 property_type quantity
+P4878 property_type item
+P4879 property_type external-identifier
+P4880 property_type external-identifier
+P4881 property_type external-identifier
+P4882 property_type item
+P4883 property_type external-identifier
+P4884 property_type item
+P4885 property_type external-identifier
+P4886 property_type external-identifier
+P4887 property_type external-identifier
+P4888 property_type external-identifier
+P4889 property_type external-identifier
+P4890 property_type external-identifier
+P4891 property_type external-identifier
+P4892 property_type external-identifier
+P4893 property_type external-identifier
+P4894 property_type external-identifier
+P4895 property_type quantity
+P4896 property_type string
+P4897 property_type external-identifier
+P4898 property_type external-identifier
+P4899 property_type external-identifier
+P4900 property_type item
+P4901 property_type external-identifier
+P4902 property_type external-identifier
+P4903 property_type external-identifier
+P4904 property_type external-identifier
+P4905 property_type external-identifier
+P4906 property_type external-identifier
+P4907 property_type external-identifier
+P4908 property_type item
+P4909 property_type quantity
+P4910 property_type external-identifier
+P4911 property_type external-identifier
+P4912 property_type quantity
+P4913 property_type item
+P4914 property_type string
+P4915 property_type item
+P4916 property_type external-identifier
+P4917 property_type external-identifier
+P4919 property_type external-identifier
+P4920 property_type external-identifier
+P4921 property_type external-identifier
+P4922 property_type external-identifier
+P4923 property_type external-identifier
+P4924 property_type external-identifier
+P4925 property_type external-identifier
+P4926 property_type external-identifier
+P4927 property_type external-identifier
+P4928 property_type external-identifier
+P4929 property_type external-identifier
+P4930 property_type external-identifier
+P4931 property_type external-identifier
+P4932 property_type external-identifier
+P4933 property_type external-identifier
+P4934 property_type item
+P4935 property_type external-identifier
+P4936 property_type external-identifier
+P4937 property_type external-identifier
+P4938 property_type external-identifier
+P4939 property_type external-identifier
+P4940 property_type external-identifier
+P4941 property_type external-identifier
+P4942 property_type external-identifier
+P4943 property_type external-identifier
+P4944 property_type external-identifier
+P4945 property_type url
+P4946 property_type external-identifier
+P4947 property_type external-identifier
+P4948 property_type external-identifier
+P4949 property_type external-identifier
+P4950 property_type external-identifier
+P4951 property_type string
+P4952 property_type item
+P4953 property_type external-identifier
+P4954 property_type item
+P4239 property_type monolingualtext
+P4240 property_type item
+P4241 property_type item
+P4242 property_type quantity
+P4243 property_type string
+P4244 property_type external-identifier
+P4245 property_type external-identifier
+P4246 property_type external-identifier
+P4247 property_type external-identifier
+P4248 property_type external-identifier
+P4249 property_type external-identifier
+P4250 property_type quantity
+P4251 property_type external-identifier
+P4252 property_type external-identifier
+P4253 property_type quantity
+P4254 property_type external-identifier
+P4255 property_type external-identifier
+P4256 property_type external-identifier
+P4257 property_type external-identifier
+P4258 property_type external-identifier
+P4259 property_type external-identifier
+P4260 property_type external-identifier
+P4261 property_type external-identifier
+P4262 property_type external-identifier
+P4263 property_type external-identifier
+P4264 property_type external-identifier
+P4265 property_type external-identifier
+P4266 property_type external-identifier
+P4267 property_type external-identifier
+P4268 property_type quantity
+P4269 property_type quantity
+P4270 property_type external-identifier
+P4271 property_type item
+P4272 property_type external-identifier
+P4273 property_type external-identifier
+P4274 property_type external-identifier
+P4275 property_type external-identifier
+P4276 property_type external-identifier
+P4277 property_type external-identifier
+P4278 property_type external-identifier
+P4279 property_type external-identifier
+P4280 property_type external-identifier
+P4281 property_type external-identifier
+P4282 property_type external-identifier
+P4283 property_type external-identifier
+P4284 property_type external-identifier
+P4285 property_type external-identifier
+P4286 property_type external-identifier
+P4287 property_type external-identifier
+P4288 property_type external-identifier
+P4289 property_type external-identifier
+P4290 property_type item
+P4291 property_type string
+P4292 property_type item
+P4293 property_type external-identifier
+P4294 property_type external-identifier
+P4295 property_type quantity
+P4296 property_type quantity
+P4297 property_type external-identifier
+P4298 property_type external-identifier
+P4299 property_type external-identifier
+P4300 property_type external-identifier
+P4301 property_type external-identifier
+P4302 property_type external-identifier
+P4303 property_type external-identifier
+P4304 property_type external-identifier
+P4305 property_type external-identifier
+P4306 property_type external-identifier
+P4307 property_type external-identifier
+P4308 property_type external-identifier
+P4309 property_type external-identifier
+P4310 property_type external-identifier
+P4311 property_type external-identifier
+P4312 property_type item
+P4313 property_type external-identifier
+P4314 property_type external-identifier
+P4315 property_type external-identifier
+P4316 property_type string
+P4317 property_type external-identifier
+P4318 property_type external-identifier
+P4319 property_type external-identifier
+P4320 property_type item
+P4321 property_type item
+P4322 property_type item
+P4323 property_type item
+P4324 property_type item
+P4325 property_type string
+P4326 property_type external-identifier
+P4327 property_type external-identifier
+P4328 property_type external-identifier
+P4329 property_type item
+P4330 property_type item
+P4331 property_type external-identifier
+P4332 property_type external-identifier
+P4333 property_type external-identifier
+P4334 property_type external-identifier
+P4335 property_type external-identifier
+P4336 property_type external-identifier
+P4337 property_type external-identifier
+P4338 property_type external-identifier
+P3200 property_type external-identifier
+P3201 property_type external-identifier
+P3202 property_type external-identifier
+P3203 property_type external-identifier
+P3204 property_type external-identifier
+P3205 property_type item
+P3206 property_type external-identifier
+P3207 property_type external-identifier
+P3208 property_type external-identifier
+P3209 property_type external-identifier
+P3211 property_type external-identifier
+P3212 property_type external-identifier
+P3213 property_type external-identifier
+P3215 property_type external-identifier
+P3216 property_type item
+P3217 property_type external-identifier
+P3218 property_type external-identifier
+P3219 property_type external-identifier
+P3220 property_type external-identifier
+P3221 property_type external-identifier
+P3222 property_type external-identifier
+P3223 property_type external-identifier
+P3224 property_type external-identifier
+P3225 property_type external-identifier
+P3226 property_type external-identifier
+P3227 property_type external-identifier
+P3228 property_type string
+P3229 property_type external-identifier
+P3230 property_type external-identifier
+P3231 property_type external-identifier
+P3232 property_type external-identifier
+P3233 property_type external-identifier
+P3234 property_type external-identifier
+P3235 property_type external-identifier
+P3236 property_type external-identifier
+P3237 property_type external-identifier
+P3238 property_type string
+P3240 property_type external-identifier
+P3241 property_type external-identifier
+P3242 property_type external-identifier
+P3243 property_type external-identifier
+P3245 property_type external-identifier
+P3246 property_type external-identifier
+P3248 property_type external-identifier
+P3250 property_type external-identifier
+P3251 property_type quantity
+P3252 property_type quantity
+P3253 property_type quantity
+P3254 property_type url
+P3256 property_type external-identifier
+P3257 property_type external-identifier
+P3258 property_type external-identifier
+P3259 property_type item
+P3260 property_type quantity
+P3261 property_type item
+P3262 property_type item
+P3263 property_type item
+P3264 property_type item
+P3265 property_type external-identifier
+P3266 property_type external-identifier
+P3267 property_type external-identifier
+P3268 property_type url
+P3269 property_type external-identifier
+P3270 property_type quantity
+P3271 property_type quantity
+P3272 property_type external-identifier
+P3273 property_type external-identifier
+P3274 property_type item
+P3275 property_type item
+P3276 property_type external-identifier
+P3277 property_type external-identifier
+P3279 property_type item
+P3280 property_type external-identifier
+P3281 property_type external-identifier
+P3283 property_type external-identifier
+P3284 property_type external-identifier
+P3285 property_type external-identifier
+P3286 property_type external-identifier
+P3288 property_type external-identifier
+P3289 property_type external-identifier
+P3290 property_type external-identifier
+P3291 property_type external-identifier
+P3292 property_type external-identifier
+P3293 property_type external-identifier
+P3294 property_type item
+P3295 property_type string
+P3296 property_type external-identifier
+P3297 property_type external-identifier
+P3298 property_type external-identifier
+P3299 property_type external-identifier
+P3300 property_type item
+P3301 property_type item
+P3302 property_type external-identifier
+P3303 property_type string
+P3304 property_type external-identifier
+P3305 property_type external-identifier
+P3306 property_type item
+P3307 property_type external-identifier
+P3308 property_type external-identifier
+P3309 property_type external-identifier
+P1067 property_type external-identifier
+P1068 property_type item
+P1069 property_type external-identifier
+P1070 property_type external-identifier
+P1071 property_type item
+P1072 property_type item
+P1073 property_type item
+P1074 property_type item
+P1075 property_type item
+P1076 property_type external-identifier
+P1077 property_type string
+P1078 property_type item
+P1079 property_type item
+P1080 property_type item
+P1081 property_type quantity
+P1082 property_type quantity
+P1083 property_type quantity
+P1084 property_type external-identifier
+P1085 property_type external-identifier
+P1086 property_type quantity
+P1087 property_type quantity
+P1088 property_type quantity
+P1090 property_type quantity
+P1092 property_type quantity
+P1093 property_type quantity
+P1096 property_type quantity
+P1097 property_type quantity
+P1098 property_type quantity
+P1099 property_type quantity
+P1100 property_type quantity
+P1101 property_type quantity
+P1102 property_type quantity
+P1103 property_type quantity
+P1104 property_type quantity
+P1106 property_type quantity
+P1107 property_type quantity
+P1108 property_type quantity
+P1109 property_type quantity
+P1110 property_type quantity
+P1111 property_type quantity
+P1113 property_type quantity
+P1114 property_type quantity
+P1115 property_type external-identifier
+P1116 property_type external-identifier
+P1117 property_type quantity
+P1120 property_type quantity
+P1121 property_type quantity
+P1122 property_type quantity
+P1123 property_type quantity
+P1125 property_type quantity
+P1126 property_type quantity
+P1127 property_type quantity
+P1128 property_type quantity
+P1129 property_type quantity
+P1132 property_type quantity
+P1133 property_type external-identifier
+P1135 property_type item
+P1136 property_type item
+P1137 property_type item
+P1138 property_type external-identifier
+P1139 property_type quantity
+P1140 property_type external-identifier
+P1141 property_type quantity
+P1142 property_type item
+P1143 property_type external-identifier
+P1144 property_type external-identifier
+P1145 property_type item
+P1146 property_type external-identifier
+P1148 property_type quantity
+P1149 property_type string
+P1150 property_type string
+P1151 property_type item
+P1153 property_type external-identifier
+P1154 property_type external-identifier
+P1155 property_type external-identifier
+P1156 property_type external-identifier
+P1157 property_type external-identifier
+P1158 property_type item
+P1159 property_type external-identifier
+P1160 property_type external-identifier
+P1161 property_type string
+P1162 property_type string
+P1163 property_type string
+P1164 property_type quantity
+P1165 property_type item
+P1167 property_type external-identifier
+P1168 property_type external-identifier
+P1170 property_type item
+P1171 property_type item
+P1172 property_type external-identifier
+P1174 property_type quantity
+P1181 property_type quantity
+P1182 property_type external-identifier
+P1183 property_type string
+P1184 property_type external-identifier
+P1185 property_type external-identifier
+P1186 property_type external-identifier
+P1187 property_type external-identifier
+P1188 property_type external-identifier
+P1189 property_type external-identifier
+P5374 property_type external-identifier
+P5375 property_type external-identifier
+P5376 property_type external-identifier
+P5377 property_type external-identifier
+P5378 property_type external-identifier
+P5379 property_type external-identifier
+P5380 property_type external-identifier
+P5381 property_type external-identifier
+P5382 property_type external-identifier
+P5383 property_type external-identifier
+P5384 property_type external-identifier
+P5385 property_type external-identifier
+P5386 property_type item
+P5387 property_type external-identifier
+P5388 property_type external-identifier
+P5389 property_type item
+P5390 property_type external-identifier
+P5391 property_type external-identifier
+P5392 property_type external-identifier
+P5393 property_type external-identifier
+P5394 property_type external-identifier
+P5395 property_type external-identifier
+P5396 property_type external-identifier
+P5397 property_type external-identifier
+P5398 property_type external-identifier
+P5400 property_type external-identifier
+P5401 property_type string
+P5402 property_type string
+P5403 property_type external-identifier
+P5404 property_type external-identifier
+P5406 property_type external-identifier
+P5407 property_type external-identifier
+P5408 property_type external-identifier
+P5409 property_type external-identifier
+P5410 property_type external-identifier
+P5411 property_type external-identifier
+P5412 property_type string
+P5413 property_type external-identifier
+P5414 property_type external-identifier
+P5415 property_type external-identifier
+P5417 property_type external-identifier
+P5418 property_type external-identifier
+P5419 property_type external-identifier
+P5420 property_type external-identifier
+P5421 property_type external-identifier
+P5422 property_type item
+P5423 property_type string
+P5424 property_type external-identifier
+P5425 property_type item
+P5426 property_type item
+P5427 property_type quantity
+P5428 property_type quantity
+P5429 property_type external-identifier
+P5430 property_type external-identifier
+P5431 property_type external-identifier
+P5432 property_type external-identifier
+P5434 property_type external-identifier
+P5435 property_type external-identifier
+P5436 property_type quantity
+P5437 property_type external-identifier
+P5438 property_type item
+P5439 property_type item
+P5440 property_type external-identifier
+P5441 property_type external-identifier
+P5442 property_type external-identifier
+P5443 property_type external-identifier
+P5444 property_type item
+P5445 property_type external-identifier
+P5446 property_type item
+P5447 property_type quantity
+P5448 property_type quantity
+P5449 property_type external-identifier
+P5450 property_type external-identifier
+P5451 property_type external-identifier
+P5452 property_type external-identifier
+P5453 property_type external-identifier
+P5454 property_type external-identifier
+P5455 property_type external-identifier
+P5456 property_type external-identifier
+P5457 property_type external-identifier
+P5458 property_type external-identifier
+P5459 property_type external-identifier
+P5460 property_type item
+P5461 property_type string
+P5462 property_type external-identifier
+P5463 property_type external-identifier
+P5464 property_type external-identifier
+P5465 property_type external-identifier
+P5466 property_type external-identifier
+P5467 property_type external-identifier
+P5468 property_type external-identifier
+P5469 property_type external-identifier
+P5470 property_type external-identifier
+P5471 property_type string
+P5473 property_type external-identifier
+P5474 property_type quantity
+P5475 property_type item
+P5476 property_type external-identifier
+P5477 property_type external-identifier
+P5478 property_type external-identifier
+P4339 property_type external-identifier
+P4340 property_type external-identifier
+P4341 property_type quantity
+P4342 property_type external-identifier
+P4343 property_type external-identifier
+P4344 property_type external-identifier
+P4345 property_type item
+P4346 property_type external-identifier
+P4347 property_type external-identifier
+P4348 property_type external-identifier
+P4349 property_type external-identifier
+P4350 property_type quantity
+P4351 property_type external-identifier
+P4352 property_type external-identifier
+P4353 property_type item
+P4354 property_type string
+P4355 property_type external-identifier
+P4356 property_type external-identifier
+P4357 property_type external-identifier
+P4358 property_type external-identifier
+P4359 property_type external-identifier
+P4360 property_type external-identifier
+P4361 property_type external-identifier
+P4362 property_type external-identifier
+P4363 property_type external-identifier
+P4364 property_type external-identifier
+P4365 property_type external-identifier
+P4366 property_type external-identifier
+P4367 property_type external-identifier
+P4368 property_type external-identifier
+P4369 property_type external-identifier
+P4370 property_type external-identifier
+P4371 property_type external-identifier
+P4372 property_type external-identifier
+P4373 property_type external-identifier
+P4374 property_type external-identifier
+P4375 property_type external-identifier
+P4376 property_type external-identifier
+P4377 property_type external-identifier
+P4379 property_type item
+P4380 property_type external-identifier
+P4381 property_type external-identifier
+P4382 property_type external-identifier
+P4383 property_type external-identifier
+P4384 property_type external-identifier
+P4385 property_type external-identifier
+P4386 property_type external-identifier
+P4387 property_type item
+P4388 property_type external-identifier
+P4389 property_type external-identifier
+P4390 property_type item
+P4391 property_type external-identifier
+P4392 property_type external-identifier
+P4393 property_type external-identifier
+P4394 property_type external-identifier
+P4395 property_type external-identifier
+P4396 property_type external-identifier
+P4397 property_type external-identifier
+P4398 property_type external-identifier
+P4399 property_type external-identifier
+P4400 property_type external-identifier
+P4401 property_type external-identifier
+P4402 property_type external-identifier
+P4403 property_type quantity
+P4404 property_type external-identifier
+P4405 property_type external-identifier
+P4406 property_type external-identifier
+P4407 property_type external-identifier
+P4408 property_type external-identifier
+P4409 property_type external-identifier
+P4410 property_type external-identifier
+P4411 property_type external-identifier
+P4412 property_type external-identifier
+P4413 property_type external-identifier
+P4414 property_type external-identifier
+P4415 property_type external-identifier
+P4416 property_type external-identifier
+P4417 property_type external-identifier
+P4418 property_type external-identifier
+P4419 property_type external-identifier
+P4421 property_type external-identifier
+P4422 property_type external-identifier
+P4423 property_type external-identifier
+P4424 property_type item
+P4425 property_type item
+P4426 property_type item
+P4427 property_type external-identifier
+P4428 property_type item
+P4429 property_type external-identifier
+P4430 property_type external-identifier
+P4431 property_type external-identifier
+P4432 property_type external-identifier
+P4433 property_type external-identifier
+P4434 property_type external-identifier
+P4435 property_type external-identifier
+P4436 property_type external-identifier
+P4437 property_type item
+P4438 property_type external-identifier
+P4439 property_type external-identifier
+P4440 property_type external-identifier
+P4955 property_type external-identifier
+P4956 property_type external-identifier
+P4957 property_type external-identifier
+P4958 property_type item
+P4959 property_type external-identifier
+P4960 property_type external-identifier
+P4961 property_type external-identifier
+P4962 property_type external-identifier
+P4963 property_type external-identifier
+P4964 property_type external-identifier
+P4965 property_type external-identifier
+P4966 property_type external-identifier
+P4967 property_type item
+P4968 property_type item
+P4969 property_type item
+P4970 property_type string
+P4971 property_type external-identifier
+P4972 property_type external-identifier
+P4973 property_type external-identifier
+P4974 property_type external-identifier
+P4975 property_type external-identifier
+P4976 property_type external-identifier
+P4977 property_type external-identifier
+P4978 property_type external-identifier
+P4979 property_type external-identifier
+P4980 property_type external-identifier
+P4981 property_type external-identifier
+P4982 property_type external-identifier
+P4983 property_type external-identifier
+P4985 property_type external-identifier
+P4986 property_type external-identifier
+P4987 property_type external-identifier
+P4988 property_type item
+P4989 property_type external-identifier
+P4991 property_type external-identifier
+P4992 property_type external-identifier
+P4993 property_type external-identifier
+P4994 property_type external-identifier
+P4996 property_type external-identifier
+P4997 property_type url
+P4998 property_type string
+P4999 property_type quantity
+P5001 property_type external-identifier
+P5002 property_type external-identifier
+P5003 property_type external-identifier
+P5004 property_type item
+P5005 property_type external-identifier
+P5006 property_type external-identifier
+P5007 property_type external-identifier
+P5008 property_type item
+P5009 property_type item
+P5010 property_type external-identifier
+P5011 property_type external-identifier
+P5012 property_type item
+P5013 property_type external-identifier
+P5014 property_type external-identifier
+P5015 property_type external-identifier
+P5016 property_type external-identifier
+P5017 property_type time
+P5018 property_type external-identifier
+P5019 property_type external-identifier
+P5020 property_type external-identifier
+P5021 property_type item
+P5022 property_type quantity
+P5023 property_type item
+P5024 property_type item
+P5025 property_type item
+P5026 property_type item
+P5027 property_type quantity
+P5028 property_type item
+P5029 property_type external-identifier
+P5030 property_type item
+P5031 property_type external-identifier
+P5032 property_type external-identifier
+P5033 property_type external-identifier
+P5034 property_type external-identifier
+P5035 property_type external-identifier
+P5036 property_type external-identifier
+P5037 property_type external-identifier
+P5038 property_type external-identifier
+P5039 property_type external-identifier
+P5040 property_type item
+P5041 property_type item
+P5042 property_type item
property_type item +P5043 property_type quantity +P5044 property_type quantity +P5045 property_type quantity +P5046 property_type string +P5047 property_type external-identifier +P5048 property_type external-identifier +P5049 property_type external-identifier +P5050 property_type external-identifier +P5051 property_type item +P5052 property_type item +P5053 property_type item +P5054 property_type item +P5055 property_type external-identifier +P5056 property_type item +P5057 property_type external-identifier +P5058 property_type external-identifier +P3310 property_type item +P3311 property_type string +P3314 property_type external-identifier +P3315 property_type external-identifier +P3316 property_type external-identifier +P3318 property_type external-identifier +P3320 property_type item +P3321 property_type monolingualtext +P3322 property_type external-identifier +P3323 property_type item +P3324 property_type external-identifier +P3325 property_type external-identifier +P3326 property_type external-identifier +P3327 property_type external-identifier +P3328 property_type external-identifier +P3329 property_type external-identifier +P3330 property_type external-identifier +P3331 property_type external-identifier +P3332 property_type external-identifier +P3333 property_type external-identifier +P3335 property_type item +P3337 property_type quantity +P3338 property_type external-identifier +P3339 property_type external-identifier +P3340 property_type external-identifier +P3341 property_type external-identifier +P3342 property_type item +P3343 property_type external-identifier +P3344 property_type external-identifier +P3345 property_type external-identifier +P3346 property_type external-identifier +P3347 property_type external-identifier +P3348 property_type external-identifier +P3349 property_type item +P3350 property_type external-identifier +P3351 property_type external-identifier +P3352 property_type external-identifier +P3353 property_type external-identifier +P3354 property_type item +P3355 property_type item +P3356 property_type item +P3357 property_type item +P3358 property_type item +P3359 property_type item +P3360 property_type external-identifier +P3361 property_type external-identifier +P3362 property_type quantity +P3363 property_type external-identifier +P3364 property_type item +P3365 property_type external-identifier +P3366 property_type external-identifier +P3367 property_type external-identifier +P3368 property_type external-identifier +P3370 property_type external-identifier +P3371 property_type external-identifier +P3372 property_type external-identifier +P3373 property_type item +P3374 property_type item +P3375 property_type external-identifier +P3376 property_type external-identifier +P3377 property_type external-identifier +P3378 property_type external-identifier +P3379 property_type external-identifier +P3380 property_type external-identifier +P3381 property_type external-identifier +P3382 property_type external-identifier +P3383 property_type string +P3385 property_type external-identifier +P3386 property_type external-identifier +P3387 property_type quantity +P3388 property_type external-identifier +P3389 property_type external-identifier +P3390 property_type external-identifier +P3391 property_type external-identifier +P3392 property_type external-identifier +P3393 property_type external-identifier +P3394 property_type external-identifier +P3395 property_type quantity +P3396 property_type external-identifier +P3397 property_type external-identifier +P3398 
property_type external-identifier +P3399 property_type external-identifier +P3400 property_type external-identifier +P3401 property_type external-identifier +P3402 property_type item +P3403 property_type item +P3404 property_type external-identifier +P3405 property_type external-identifier +P3406 property_type external-identifier +P3407 property_type external-identifier +P3408 property_type external-identifier +P3409 property_type external-identifier +P3410 property_type external-identifier +P3411 property_type external-identifier +P3412 property_type external-identifier +P3413 property_type external-identifier +P3414 property_type external-identifier +P3415 property_type item +P3416 property_type item +P3417 property_type external-identifier +P1190 property_type string +P1191 property_type time +P1192 property_type item +P1193 property_type quantity +P1194 property_type item +P1195 property_type string +P1196 property_type item +P1198 property_type quantity +P1199 property_type item +P1200 property_type item +P1201 property_type item +P1202 property_type item +P1203 property_type external-identifier +P1204 property_type item +P1207 property_type external-identifier +P1208 property_type external-identifier +P1209 property_type external-identifier +P1210 property_type item +P1211 property_type item +P1212 property_type external-identifier +P1213 property_type external-identifier +P1214 property_type external-identifier +P1215 property_type quantity +P1216 property_type external-identifier +P1217 property_type external-identifier +P1218 property_type external-identifier +P1219 property_type external-identifier +P1220 property_type external-identifier +P1221 property_type item +P1225 property_type external-identifier +P1227 property_type item +P1229 property_type external-identifier +P1230 property_type external-identifier +P1232 property_type external-identifier +P1233 property_type external-identifier +P1234 property_type external-identifier +P1235 property_type external-identifier +P1236 property_type external-identifier +P1237 property_type external-identifier +P1238 property_type external-identifier +P1239 property_type external-identifier +P1240 property_type string +P1241 property_type external-identifier +P1242 property_type external-identifier +P1243 property_type external-identifier +P1245 property_type external-identifier +P1246 property_type external-identifier +P1247 property_type quantity +P1248 property_type external-identifier +P1249 property_type time +P1250 property_type external-identifier +P1251 property_type external-identifier +P1252 property_type external-identifier +P1253 property_type external-identifier +P1254 property_type external-identifier +P1255 property_type external-identifier +P1256 property_type external-identifier +P1257 property_type string +P1258 property_type external-identifier +P1259 property_type globe-coordinate +P1260 property_type external-identifier +P1261 property_type external-identifier +P1262 property_type external-identifier +P1263 property_type external-identifier +P1264 property_type item +P1265 property_type external-identifier +P1266 property_type external-identifier +P1267 property_type external-identifier +P1268 property_type item +P1269 property_type item +P1270 property_type external-identifier +P1271 property_type external-identifier +P1272 property_type external-identifier +P1273 property_type external-identifier +P1274 property_type external-identifier +P1275 property_type external-identifier +P1276 property_type 
external-identifier +P1277 property_type external-identifier +P1278 property_type external-identifier +P1279 property_type quantity +P1280 property_type external-identifier +P1281 property_type external-identifier +P1282 property_type string +P1283 property_type item +P1284 property_type external-identifier +P1285 property_type external-identifier +P1286 property_type external-identifier +P1287 property_type external-identifier +P1288 property_type external-identifier +P1289 property_type external-identifier +P1290 property_type item +P1291 property_type external-identifier +P1292 property_type external-identifier +P1293 property_type external-identifier +P1294 property_type external-identifier +P1295 property_type quantity +P1296 property_type external-identifier +P1297 property_type external-identifier +P1299 property_type item +P1300 property_type external-identifier +P4441 property_type quantity +P4442 property_type quantity +P4443 property_type item +P4444 property_type item +P4445 property_type quantity +P4446 property_type item +P4447 property_type quantity +P4448 property_type quantity +P4449 property_type external-identifier +P4450 property_type external-identifier +P4451 property_type external-identifier +P4452 property_type item +P4453 property_type external-identifier +P4454 property_type external-identifier +P4455 property_type external-identifier +P4456 property_type external-identifier +P4457 property_type external-identifier +P4458 property_type external-identifier +P4459 property_type external-identifier +P4460 property_type external-identifier +P4461 property_type external-identifier +P4462 property_type external-identifier +P4463 property_type external-identifier +P4464 property_type external-identifier +P4465 property_type external-identifier +P4466 property_type external-identifier +P4467 property_type external-identifier +P4468 property_type external-identifier +P4469 property_type external-identifier +P4470 property_type external-identifier +P4471 property_type external-identifier +P4472 property_type external-identifier +P4473 property_type external-identifier +P4474 property_type external-identifier +P4475 property_type external-identifier +P4476 property_type external-identifier +P4477 property_type external-identifier +P4478 property_type external-identifier +P4479 property_type external-identifier +P4480 property_type external-identifier +P4481 property_type external-identifier +P4482 property_type external-identifier +P4483 property_type external-identifier +P4484 property_type external-identifier +P4485 property_type external-identifier +P4486 property_type external-identifier +P4487 property_type external-identifier +P4488 property_type external-identifier +P4489 property_type external-identifier +P4490 property_type external-identifier +P4491 property_type external-identifier +P4493 property_type external-identifier +P4494 property_type external-identifier +P4495 property_type external-identifier +P4496 property_type string +P4498 property_type external-identifier +P4500 property_type quantity +P4501 property_type quantity +P4502 property_type external-identifier +P4503 property_type external-identifier +P4504 property_type external-identifier +P4505 property_type external-identifier +P4506 property_type string +P4507 property_type external-identifier +P4508 property_type external-identifier +P4509 property_type external-identifier +P4510 property_type item +P4511 property_type quantity +P4512 property_type external-identifier +P4513 property_type 
external-identifier +P4514 property_type external-identifier +P4515 property_type external-identifier +P4516 property_type external-identifier +P4517 property_type external-identifier +P4518 property_type external-identifier +P4519 property_type quantity +P4520 property_type external-identifier +P4521 property_type external-identifier +P4522 property_type external-identifier +P4523 property_type external-identifier +P4524 property_type external-identifier +P4525 property_type external-identifier +P4526 property_type external-identifier +P4527 property_type external-identifier +P4528 property_type external-identifier +P4529 property_type external-identifier +P4530 property_type external-identifier +P4531 property_type external-identifier +P4532 property_type external-identifier +P4533 property_type external-identifier +P4534 property_type external-identifier +P4535 property_type external-identifier +P4536 property_type external-identifier +P4537 property_type external-identifier +P4538 property_type external-identifier +P4539 property_type external-identifier +P4540 property_type external-identifier +P4541 property_type external-identifier +P4542 property_type external-identifier +P4543 property_type item +P5479 property_type quantity +P5480 property_type quantity +P5481 property_type quantity +P5483 property_type quantity +P5485 property_type external-identifier +P5488 property_type external-identifier +P5489 property_type external-identifier +P5490 property_type external-identifier +P5491 property_type external-identifier +P5492 property_type external-identifier +P5493 property_type external-identifier +P5494 property_type external-identifier +P5495 property_type external-identifier +P5496 property_type external-identifier +P5497 property_type external-identifier +P5498 property_type external-identifier +P5499 property_type external-identifier +P5500 property_type external-identifier +P5501 property_type external-identifier +P5502 property_type external-identifier +P5503 property_type external-identifier +P5504 property_type external-identifier +P5505 property_type external-identifier +P5506 property_type external-identifier +P5507 property_type external-identifier +P5508 property_type external-identifier +P5509 property_type external-identifier +P5510 property_type external-identifier +P5513 property_type external-identifier +P5514 property_type item +P5515 property_type external-identifier +P5516 property_type external-identifier +P5517 property_type external-identifier +P5518 property_type string +P5519 property_type string +P5520 property_type quantity +P5522 property_type item +P5523 property_type item +P5524 property_type quantity +P5525 property_type external-identifier +P5526 property_type quantity +P5527 property_type external-identifier +P5528 property_type external-identifier +P5529 property_type quantity +P5530 property_type external-identifier +P5531 property_type external-identifier +P5532 property_type external-identifier +P5533 property_type external-identifier +P5534 property_type external-identifier +P5535 property_type external-identifier +P5536 property_type external-identifier +P5537 property_type item +P5538 property_type external-identifier +P5539 property_type external-identifier +P5540 property_type external-identifier +P5541 property_type external-identifier +P5542 property_type external-identifier +P5543 property_type external-identifier +P5544 property_type external-identifier +P5545 property_type external-identifier +P5546 property_type external-identifier 
+P5547 property_type external-identifier +P5548 property_type string +P5549 property_type external-identifier +P5550 property_type external-identifier +P5551 property_type external-identifier +P5552 property_type external-identifier +P5553 property_type external-identifier +P5554 property_type external-identifier +P5555 property_type string +P5556 property_type external-identifier +P5557 property_type external-identifier +P5558 property_type external-identifier +P5559 property_type external-identifier +P5560 property_type item +P5561 property_type external-identifier +P5562 property_type external-identifier +P5563 property_type external-identifier +P5564 property_type item +P5565 property_type external-identifier +P5566 property_type external-identifier +P5567 property_type external-identifier +P5568 property_type external-identifier +P5569 property_type external-identifier +P5570 property_type external-identifier +P5571 property_type external-identifier +P5572 property_type item +P5573 property_type external-identifier +P5574 property_type external-identifier +P5575 property_type quantity +P5576 property_type external-identifier +P5578 property_type external-identifier +P5579 property_type external-identifier +P5580 property_type external-identifier +P5581 property_type external-identifier +P5582 property_type quantity +P5584 property_type external-identifier +P5585 property_type external-identifier +P5586 property_type external-identifier +P5587 property_type external-identifier +P1301 property_type quantity +P1302 property_type item +P1303 property_type item +P1304 property_type item +P1305 property_type external-identifier +P1307 property_type external-identifier +P1308 property_type item +P1309 property_type external-identifier +P1310 property_type item +P1311 property_type external-identifier +P1312 property_type item +P1313 property_type item +P1314 property_type quantity +P1315 property_type external-identifier +P1316 property_type external-identifier +P1317 property_type time +P1318 property_type item +P1319 property_type time +P1320 property_type external-identifier +P1321 property_type item +P1322 property_type item +P1323 property_type external-identifier +P1324 property_type url +P1325 property_type url +P1326 property_type time +P1327 property_type item +P1329 property_type string +P1330 property_type external-identifier +P1331 property_type external-identifier +P1332 property_type globe-coordinate +P1333 property_type globe-coordinate +P1334 property_type globe-coordinate +P1335 property_type globe-coordinate +P1336 property_type item +P1338 property_type external-identifier +P1339 property_type quantity +P1340 property_type item +P1341 property_type external-identifier +P1342 property_type quantity +P1343 property_type item +P1344 property_type item +P1345 property_type quantity +P1346 property_type item +P1347 property_type item +P1348 property_type url +P1349 property_type item +P1350 property_type quantity +P1351 property_type quantity +P1352 property_type quantity +P1353 property_type string +P1354 property_type item +P1355 property_type quantity +P1356 property_type quantity +P1357 property_type quantity +P1358 property_type quantity +P1359 property_type quantity +P1360 property_type string +P1362 property_type external-identifier +P1363 property_type item +P1364 property_type external-identifier +P1365 property_type item +P1366 property_type item +P1367 property_type external-identifier +P1368 property_type external-identifier +P1369 property_type external-identifier 
+P1370 property_type external-identifier +P1371 property_type external-identifier +P1372 property_type item +P1373 property_type quantity +P1375 property_type external-identifier +P1376 property_type item +P1377 property_type external-identifier +P1378 property_type external-identifier +P1380 property_type external-identifier +P1381 property_type external-identifier +P1382 property_type item +P1383 property_type item +P1385 property_type external-identifier +P1386 property_type external-identifier +P1387 property_type item +P1388 property_type external-identifier +P1389 property_type item +P1390 property_type quantity +P1391 property_type external-identifier +P1392 property_type external-identifier +P1393 property_type item +P1394 property_type external-identifier +P1395 property_type external-identifier +P1396 property_type string +P1397 property_type external-identifier +P1398 property_type item +P1399 property_type item +P1400 property_type external-identifier +P1401 property_type url +P1402 property_type string +P1403 property_type item +P1404 property_type external-identifier +P1406 property_type item +P1407 property_type external-identifier +P1408 property_type item +P4544 property_type external-identifier +P4545 property_type item +P4546 property_type external-identifier +P4547 property_type external-identifier +P4548 property_type external-identifier +P4549 property_type external-identifier +P4550 property_type external-identifier +P4551 property_type external-identifier +P4552 property_type item +P4553 property_type external-identifier +P4554 property_type external-identifier +P4555 property_type external-identifier +P4556 property_type external-identifier +P4557 property_type external-identifier +P4558 property_type external-identifier +P4559 property_type external-identifier +P4560 property_type external-identifier +P4561 property_type external-identifier +P4562 property_type external-identifier +P4563 property_type external-identifier +P4564 property_type external-identifier +P4565 property_type string +P4566 property_type time +P4567 property_type external-identifier +P4568 property_type external-identifier +P4569 property_type external-identifier +P4571 property_type external-identifier +P4572 property_type external-identifier +P4573 property_type string +P4574 property_type external-identifier +P4575 property_type string +P4576 property_type external-identifier +P4577 property_type external-identifier +P4578 property_type external-identifier +P4579 property_type external-identifier +P4580 property_type external-identifier +P4581 property_type external-identifier +P4582 property_type external-identifier +P4583 property_type external-identifier +P4584 property_type item +P4585 property_type external-identifier +P4586 property_type item +P4587 property_type external-identifier +P4588 property_type external-identifier +P4589 property_type external-identifier +P4590 property_type external-identifier +P4591 property_type external-identifier +P4592 property_type external-identifier +P4593 property_type external-identifier +P4594 property_type external-identifier +P4595 property_type string +P4596 property_type external-identifier +P4597 property_type external-identifier +P4598 property_type external-identifier +P4599 property_type item +P4600 property_type item +P4601 property_type external-identifier +P4602 property_type time +P4604 property_type external-identifier +P4605 property_type external-identifier +P4606 property_type external-identifier +P4607 property_type 
external-identifier +P4608 property_type item +P4609 property_type external-identifier +P4610 property_type external-identifier +P4611 property_type external-identifier +P4612 property_type external-identifier +P4613 property_type external-identifier +P4614 property_type item +P4615 property_type external-identifier +P4616 property_type external-identifier +P4617 property_type external-identifier +P4618 property_type external-identifier +P4619 property_type external-identifier +P4620 property_type external-identifier +P4621 property_type external-identifier +P4622 property_type item +P4623 property_type external-identifier +P4624 property_type item +P4625 property_type external-identifier +P4626 property_type item +P4627 property_type external-identifier +P4628 property_type item +P4629 property_type external-identifier +P4630 property_type external-identifier +P4631 property_type external-identifier +P4632 property_type external-identifier +P4633 property_type string +P4634 property_type item +P4635 property_type external-identifier +P4636 property_type external-identifier +P4637 property_type external-identifier +P4638 property_type external-identifier +P4639 property_type external-identifier +P4640 property_type string +P4641 property_type external-identifier +P4642 property_type external-identifier +P4643 property_type external-identifier +P4644 property_type external-identifier +P4645 property_type external-identifier +P3418 property_type external-identifier +P3419 property_type external-identifier +P3420 property_type external-identifier +P3421 property_type external-identifier +P3422 property_type external-identifier +P3423 property_type external-identifier +P3424 property_type external-identifier +P3425 property_type external-identifier +P3426 property_type external-identifier +P3427 property_type external-identifier +P3428 property_type item +P3429 property_type external-identifier +P3430 property_type external-identifier +P3431 property_type external-identifier +P3432 property_type item +P3433 property_type item +P3434 property_type external-identifier +P3435 property_type external-identifier +P3436 property_type external-identifier +P3437 property_type item +P3438 property_type item +P3439 property_type quantity +P3440 property_type item +P3441 property_type string +P3442 property_type external-identifier +P3443 property_type external-identifier +P3444 property_type external-identifier +P3445 property_type external-identifier +P3446 property_type external-identifier +P3447 property_type item +P3448 property_type item +P3449 property_type external-identifier +P3450 property_type item +P3451 property_type string +P3452 property_type item +P3453 property_type external-identifier +P3454 property_type external-identifier +P3455 property_type external-identifier +P3456 property_type external-identifier +P3457 property_type quantity +P3458 property_type external-identifier +P3459 property_type external-identifier +P3460 property_type item +P3461 property_type item +P3462 property_type external-identifier +P3463 property_type external-identifier +P3464 property_type item +P3465 property_type quantity +P3466 property_type external-identifier +P3467 property_type external-identifier +P3468 property_type external-identifier +P3469 property_type external-identifier +P3470 property_type external-identifier +P3471 property_type external-identifier +P3472 property_type external-identifier +P3473 property_type external-identifier +P3475 property_type external-identifier +P3476 property_type 
external-identifier +P3477 property_type external-identifier +P3478 property_type external-identifier +P3479 property_type external-identifier +P3480 property_type external-identifier +P3481 property_type external-identifier +P3482 property_type external-identifier +P3483 property_type external-identifier +P3485 property_type quantity +P3486 property_type quantity +P3487 property_type quantity +P3488 property_type quantity +P3489 property_type item +P3490 property_type item +P3491 property_type item +P3492 property_type quantity +P3493 property_type item +P3494 property_type item +P3495 property_type external-identifier +P3496 property_type item +P3497 property_type item +P3498 property_type external-identifier +P3499 property_type external-identifier +P3500 property_type external-identifier +P3501 property_type item +P3502 property_type external-identifier +P3503 property_type external-identifier +P3504 property_type external-identifier +P3505 property_type external-identifier +P3506 property_type external-identifier +P3507 property_type external-identifier +P3509 property_type external-identifier +P3511 property_type external-identifier +P3512 property_type item +P3513 property_type external-identifier +P3514 property_type external-identifier +P3515 property_type external-identifier +P3516 property_type external-identifier +P3517 property_type external-identifier +P3518 property_type external-identifier +P3519 property_type external-identifier +P3520 property_type external-identifier +P3521 property_type external-identifier +P5059 property_type item +P5061 property_type monolingualtext +P5062 property_type external-identifier +P5063 property_type external-identifier +P5064 property_type external-identifier +P5065 property_type quantity +P5066 property_type quantity +P5067 property_type quantity +P5068 property_type external-identifier +P5069 property_type quantity +P5070 property_type item +P5071 property_type quantity +P5072 property_type item +P5073 property_type external-identifier +P5075 property_type external-identifier +P5076 property_type external-identifier +P5077 property_type external-identifier +P5078 property_type external-identifier +P5079 property_type external-identifier +P5080 property_type external-identifier +P5081 property_type external-identifier +P5082 property_type external-identifier +P5083 property_type external-identifier +P5084 property_type external-identifier +P5085 property_type external-identifier +P5086 property_type external-identifier +P5087 property_type external-identifier +P5088 property_type external-identifier +P5090 property_type external-identifier +P5091 property_type external-identifier +P5092 property_type external-identifier +P5093 property_type external-identifier +P5094 property_type external-identifier +P5095 property_type item +P5096 property_type item +P5097 property_type external-identifier +P5098 property_type external-identifier +P5099 property_type external-identifier +P5101 property_type external-identifier +P5102 property_type item +P5103 property_type external-identifier +P5104 property_type external-identifier +P5105 property_type item +P5106 property_type external-identifier +P5107 property_type external-identifier +P5108 property_type external-identifier +P5109 property_type item +P5110 property_type item +P5114 property_type external-identifier +P5115 property_type external-identifier +P5116 property_type external-identifier +P5117 property_type external-identifier +P5118 property_type external-identifier +P5119 property_type 
external-identifier +P5120 property_type external-identifier +P5121 property_type external-identifier +P5122 property_type external-identifier +P5123 property_type external-identifier +P5124 property_type external-identifier +P5125 property_type item +P5126 property_type item +P5127 property_type external-identifier +P5128 property_type external-identifier +P5129 property_type external-identifier +P5131 property_type item +P5132 property_type item +P5133 property_type item +P5134 property_type item +P5135 property_type item +P5136 property_type item +P5137 property_type item +P5138 property_type item +P5139 property_type string +P5140 property_type globe-coordinate +P5141 property_type quantity +P5142 property_type external-identifier +P5143 property_type external-identifier +P5144 property_type external-identifier +P5145 property_type external-identifier +P5146 property_type external-identifier +P5147 property_type external-identifier +P5148 property_type external-identifier +P5149 property_type external-identifier +P5150 property_type item +P5151 property_type external-identifier +P5152 property_type item +P5153 property_type external-identifier +P5154 property_type external-identifier +P5156 property_type external-identifier +P5157 property_type external-identifier +P5158 property_type external-identifier +P5159 property_type external-identifier +P5160 property_type external-identifier +P5161 property_type external-identifier +P5162 property_type external-identifier +P5163 property_type external-identifier +P5164 property_type external-identifier +P5165 property_type external-identifier +P5166 property_type item +P5167 property_type quantity +P5588 property_type item +P5589 property_type item +P5590 property_type external-identifier +P5591 property_type item +P5592 property_type quantity +P5593 property_type quantity +P5594 property_type quantity +P5595 property_type quantity +P5596 property_type quantity +P5597 property_type external-identifier +P5598 property_type external-identifier +P5599 property_type external-identifier +P5600 property_type external-identifier +P5601 property_type external-identifier +P5602 property_type external-identifier +P5603 property_type external-identifier +P5604 property_type external-identifier +P5605 property_type external-identifier +P5606 property_type item +P5607 property_type item +P5608 property_type quantity +P5609 property_type external-identifier +P5610 property_type external-identifier +P5611 property_type external-identifier +P5612 property_type external-identifier +P5613 property_type external-identifier +P5614 property_type external-identifier +P5615 property_type external-identifier +P5616 property_type external-identifier +P5617 property_type external-identifier +P5618 property_type external-identifier +P5619 property_type external-identifier +P5620 property_type external-identifier +P5621 property_type external-identifier +P5622 property_type external-identifier +P5623 property_type item +P5624 property_type quantity +P5625 property_type string +P5626 property_type external-identifier +P5627 property_type external-identifier +P5628 property_type external-identifier +P5629 property_type external-identifier +P5630 property_type quantity +P5631 property_type external-identifier +P5632 property_type external-identifier +P5633 property_type external-identifier +P5634 property_type external-identifier +P5635 property_type external-identifier +P5636 property_type external-identifier +P5637 property_type external-identifier +P5638 property_type 
external-identifier +P5639 property_type external-identifier +P5640 property_type external-identifier +P5641 property_type external-identifier +P5642 property_type item +P5643 property_type external-identifier +P5644 property_type external-identifier +P5645 property_type external-identifier +P5646 property_type external-identifier +P5647 property_type external-identifier +P5648 property_type external-identifier +P5649 property_type external-identifier +P5650 property_type external-identifier +P5651 property_type external-identifier +P5652 property_type external-identifier +P5653 property_type external-identifier +P5654 property_type external-identifier +P5655 property_type external-identifier +P5656 property_type external-identifier +P5657 property_type external-identifier +P5658 property_type item +P5659 property_type external-identifier +P5661 property_type external-identifier +P5662 property_type external-identifier +P5663 property_type external-identifier +P5664 property_type external-identifier +P5665 property_type external-identifier +P5666 property_type external-identifier +P5667 property_type external-identifier +P5668 property_type quantity +P5669 property_type quantity +P5670 property_type quantity +P5672 property_type quantity +P5673 property_type quantity +P5674 property_type quantity +P5675 property_type quantity +P5676 property_type quantity +P5677 property_type quantity +P5678 property_type quantity +P5679 property_type quantity +P5680 property_type external-identifier +P5681 property_type quantity +P5682 property_type quantity +P5683 property_type external-identifier +P5685 property_type quantity +P5686 property_type external-identifier +P5687 property_type external-identifier +P5688 property_type external-identifier +P5690 property_type external-identifier +P5691 property_type external-identifier +P4646 property_type item +P4647 property_type item +P4649 property_type item +P4650 property_type external-identifier +P4651 property_type external-identifier +P4652 property_type external-identifier +P4653 property_type item +P4654 property_type string +P4655 property_type external-identifier +P4656 property_type url +P4657 property_type external-identifier +P4658 property_type external-identifier +P4659 property_type external-identifier +P4660 property_type external-identifier +P4661 property_type item +P4662 property_type external-identifier +P4663 property_type external-identifier +P4664 property_type external-identifier +P4665 property_type external-identifier +P4666 property_type external-identifier +P4667 property_type external-identifier +P4668 property_type external-identifier +P4669 property_type string +P4670 property_type external-identifier +P4671 property_type external-identifier +P4672 property_type external-identifier +P4673 property_type external-identifier +P4674 property_type external-identifier +P4675 property_type item +P4676 property_type external-identifier +P4677 property_type external-identifier +P4678 property_type external-identifier +P4679 property_type external-identifier +P4680 property_type item +P4681 property_type external-identifier +P4682 property_type external-identifier +P4683 property_type external-identifier +P4684 property_type external-identifier +P4685 property_type external-identifier +P4686 property_type external-identifier +P4687 property_type external-identifier +P4688 property_type item +P4689 property_type external-identifier +P4690 property_type external-identifier +P4691 property_type external-identifier +P4692 property_type 
external-identifier +P4693 property_type external-identifier +P4694 property_type external-identifier +P4695 property_type external-identifier +P4696 property_type external-identifier +P4697 property_type external-identifier +P4698 property_type external-identifier +P4699 property_type external-identifier +P4700 property_type external-identifier +P4701 property_type external-identifier +P4702 property_type external-identifier +P4703 property_type external-identifier +P4704 property_type external-identifier +P4705 property_type external-identifier +P4706 property_type external-identifier +P4707 property_type external-identifier +P4708 property_type external-identifier +P4709 property_type external-identifier +P4710 property_type external-identifier +P4711 property_type external-identifier +P4712 property_type external-identifier +P4713 property_type external-identifier +P4714 property_type quantity +P4715 property_type external-identifier +P4716 property_type external-identifier +P4717 property_type external-identifier +P4718 property_type external-identifier +P4720 property_type external-identifier +P4721 property_type external-identifier +P4722 property_type external-identifier +P4723 property_type external-identifier +P4724 property_type external-identifier +P4725 property_type external-identifier +P4726 property_type external-identifier +P4727 property_type external-identifier +P4728 property_type external-identifier +P4729 property_type external-identifier +P4730 property_type external-identifier +P4731 property_type external-identifier +P4732 property_type external-identifier +P4733 property_type item +P4734 property_type external-identifier +P4735 property_type external-identifier +P4736 property_type external-identifier +P4737 property_type external-identifier +P4738 property_type external-identifier +P4739 property_type external-identifier +P4740 property_type external-identifier +P4741 property_type external-identifier +P4742 property_type external-identifier +P4743 property_type item +P4744 property_type external-identifier +P4745 property_type item +P4746 property_type external-identifier +P4747 property_type external-identifier +P3522 property_type external-identifier +P3523 property_type external-identifier +P3524 property_type external-identifier +P3525 property_type external-identifier +P3526 property_type external-identifier +P3527 property_type external-identifier +P3528 property_type external-identifier +P3529 property_type quantity +P3530 property_type quantity +P3531 property_type external-identifier +P3532 property_type external-identifier +P3533 property_type external-identifier +P3534 property_type external-identifier +P3535 property_type external-identifier +P3536 property_type external-identifier +P3537 property_type external-identifier +P3538 property_type external-identifier +P3539 property_type external-identifier +P3541 property_type external-identifier +P3542 property_type external-identifier +P3544 property_type external-identifier +P3545 property_type external-identifier +P3546 property_type external-identifier +P3547 property_type external-identifier +P3548 property_type external-identifier +P3549 property_type external-identifier +P3550 property_type external-identifier +P3551 property_type external-identifier +P3552 property_type external-identifier +P3553 property_type external-identifier +P3554 property_type external-identifier +P3555 property_type external-identifier +P3556 property_type external-identifier +P3557 property_type external-identifier 
+P3558 property_type external-identifier +P3559 property_type quantity +P3560 property_type external-identifier +P3561 property_type external-identifier +P3562 property_type external-identifier +P3563 property_type external-identifier +P3564 property_type external-identifier +P3565 property_type external-identifier +P3566 property_type external-identifier +P3567 property_type external-identifier +P3568 property_type external-identifier +P3569 property_type external-identifier +P3570 property_type external-identifier +P3571 property_type external-identifier +P3572 property_type external-identifier +P3573 property_type external-identifier +P3574 property_type external-identifier +P3575 property_type quantity +P3576 property_type external-identifier +P3577 property_type external-identifier +P3578 property_type item +P3579 property_type external-identifier +P3580 property_type external-identifier +P3581 property_type external-identifier +P3582 property_type external-identifier +P3583 property_type external-identifier +P3584 property_type external-identifier +P3585 property_type external-identifier +P3586 property_type external-identifier +P3587 property_type external-identifier +P3588 property_type external-identifier +P3589 property_type external-identifier +P3590 property_type external-identifier +P3591 property_type external-identifier +P3592 property_type item +P3593 property_type external-identifier +P3594 property_type external-identifier +P3595 property_type external-identifier +P3596 property_type external-identifier +P3597 property_type external-identifier +P3598 property_type external-identifier +P3599 property_type external-identifier +P3600 property_type external-identifier +P3601 property_type external-identifier +P3602 property_type item +P3603 property_type external-identifier +P3604 property_type external-identifier +P3605 property_type external-identifier +P3606 property_type external-identifier +P3607 property_type external-identifier +P3608 property_type external-identifier +P3609 property_type external-identifier +P3610 property_type item +P3611 property_type external-identifier +P3612 property_type external-identifier +P3613 property_type external-identifier +P3614 property_type external-identifier +P3615 property_type external-identifier +P3616 property_type external-identifier +P3618 property_type quantity +P3619 property_type external-identifier +P3620 property_type external-identifier +P3621 property_type external-identifier +P3622 property_type external-identifier +P3623 property_type external-identifier +P3624 property_type external-identifier +P1409 property_type external-identifier +P1410 property_type quantity +P1411 property_type item +P1412 property_type item +P1414 property_type item +P1415 property_type external-identifier +P1416 property_type item +P1417 property_type external-identifier +P1418 property_type quantity +P1419 property_type item +P1420 property_type item +P1421 property_type url +P1422 property_type external-identifier +P1423 property_type item +P1424 property_type item +P1425 property_type item +P1427 property_type item +P1428 property_type external-identifier +P1429 property_type item +P1430 property_type external-identifier +P1431 property_type item +P1433 property_type item +P1434 property_type item +P1435 property_type item +P1436 property_type quantity +P1437 property_type item +P1438 property_type string +P1439 property_type external-identifier +P1440 property_type external-identifier +P1441 property_type item +P1442 property_type string 
+P1443 property_type item +P1444 property_type item +P1445 property_type item +P1446 property_type quantity +P1447 property_type external-identifier +P1448 property_type monolingualtext +P1449 property_type monolingualtext +P1450 property_type monolingualtext +P1451 property_type monolingualtext +P1453 property_type external-identifier +P1454 property_type item +P1455 property_type item +P1456 property_type item +P1457 property_type quantity +P1458 property_type quantity +P1459 property_type external-identifier +P1460 property_type external-identifier +P1461 property_type string +P1462 property_type item +P1463 property_type external-identifier +P1464 property_type item +P1465 property_type item +P1466 property_type external-identifier +P1467 property_type external-identifier +P1468 property_type external-identifier +P1469 property_type external-identifier +P1470 property_type quantity +P1471 property_type string +P1472 property_type string +P1473 property_type external-identifier +P1474 property_type external-identifier +P1476 property_type monolingualtext +P1477 property_type monolingualtext +P1478 property_type item +P1479 property_type item +P1480 property_type item +P1481 property_type external-identifier +P1482 property_type url +P1483 property_type external-identifier +P1529 property_type external-identifier +P1531 property_type item +P1532 property_type item +P1533 property_type item +P1534 property_type item +P1535 property_type item +P1536 property_type item +P1537 property_type item +P1538 property_type quantity +P1539 property_type quantity +P1540 property_type quantity +P1541 property_type external-identifier +P1542 property_type item +P1543 property_type string +P1544 property_type external-identifier +P1545 property_type string +P1546 property_type item +P1547 property_type item +P1548 property_type quantity +P1549 property_type monolingualtext +P1550 property_type external-identifier +P1551 property_type external-identifier +P1552 property_type item +P1553 property_type external-identifier +P1554 property_type external-identifier +P1555 property_type external-identifier +P1556 property_type external-identifier +P1557 property_type item +P1558 property_type item +P1559 property_type monolingualtext +P5168 property_type monolingualtext +P5169 property_type external-identifier +P5170 property_type external-identifier +P5171 property_type external-identifier +P5172 property_type external-identifier +P5173 property_type external-identifier +P5174 property_type external-identifier +P5175 property_type external-identifier +P5176 property_type external-identifier +P5177 property_type external-identifier +P5178 property_type url +P5179 property_type external-identifier +P5180 property_type external-identifier +P5181 property_type external-identifier +P5182 property_type external-identifier +P5183 property_type external-identifier +P5184 property_type external-identifier +P5185 property_type item +P5186 property_type item +P5187 property_type monolingualtext +P5188 property_type string +P5189 property_type string +P5191 property_type string +P5192 property_type string +P5193 property_type string +P5194 property_type item +P5195 property_type url +P5196 property_type item +P5197 property_type external-identifier +P5198 property_type external-identifier +P5199 property_type external-identifier +P5200 property_type external-identifier +P5201 property_type item +P5202 property_type item +P5203 property_type item +P5204 property_type time +P5205 property_type quantity +P5206 property_type 
item +P5207 property_type external-identifier +P5208 property_type external-identifier +P5209 property_type external-identifier +P5210 property_type external-identifier +P5211 property_type external-identifier +P5212 property_type external-identifier +P5213 property_type external-identifier +P5214 property_type external-identifier +P5215 property_type external-identifier +P5216 property_type external-identifier +P5217 property_type external-identifier +P5218 property_type external-identifier +P5219 property_type external-identifier +P5220 property_type external-identifier +P5221 property_type external-identifier +P5222 property_type external-identifier +P5223 property_type external-identifier +P5224 property_type external-identifier +P5225 property_type external-identifier +P5226 property_type external-identifier +P5227 property_type external-identifier +P5229 property_type external-identifier +P5230 property_type quantity +P5231 property_type external-identifier +P5232 property_type external-identifier +P5233 property_type external-identifier +P5234 property_type external-identifier +P5235 property_type external-identifier +P5236 property_type item +P5237 property_type item +P5238 property_type string +P5239 property_type external-identifier +P5240 property_type external-identifier +P5241 property_type external-identifier +P5242 property_type external-identifier +P5243 property_type external-identifier +P5244 property_type item +P5245 property_type external-identifier +P5246 property_type external-identifier +P5247 property_type external-identifier +P5248 property_type item +P5249 property_type item +P5250 property_type external-identifier +P5251 property_type external-identifier +P5252 property_type string +P5253 property_type external-identifier +P5254 property_type external-identifier +P5255 property_type external-identifier +P5256 property_type external-identifier +P5257 property_type external-identifier +P5258 property_type external-identifier +P5259 property_type external-identifier +P5260 property_type external-identifier +P5261 property_type external-identifier +P5262 property_type external-identifier +P5263 property_type external-identifier +P5264 property_type external-identifier +P5265 property_type external-identifier +P5266 property_type external-identifier +P5267 property_type external-identifier +P5268 property_type external-identifier +P5269 property_type external-identifier +P5692 property_type string +P5693 property_type external-identifier +P5694 property_type external-identifier +P5695 property_type external-identifier +P5696 property_type external-identifier +P5698 property_type external-identifier +P5699 property_type external-identifier +P5700 property_type external-identifier +P5703 property_type string +P5704 property_type external-identifier +P5705 property_type external-identifier +P5706 property_type quantity +P5707 property_type item +P5708 property_type quantity +P5709 property_type quantity +P5710 property_type external-identifier +P5711 property_type external-identifier +P5712 property_type external-identifier +P5713 property_type item +P5714 property_type external-identifier +P5715 property_type url +P5716 property_type external-identifier +P5717 property_type external-identifier +P5718 property_type external-identifier +P5719 property_type external-identifier +P5720 property_type external-identifier +P5721 property_type external-identifier +P5722 property_type external-identifier +P5723 property_type external-identifier +P5724 property_type 
external-identifier +P5725 property_type external-identifier +P5726 property_type external-identifier +P5727 property_type external-identifier +P5731 property_type external-identifier +P5732 property_type external-identifier +P5733 property_type external-identifier +P5734 property_type external-identifier +P5735 property_type external-identifier +P5736 property_type external-identifier +P5737 property_type external-identifier +P5738 property_type external-identifier +P5739 property_type external-identifier +P5740 property_type external-identifier +P5742 property_type external-identifier +P5743 property_type external-identifier +P5744 property_type external-identifier +P5745 property_type external-identifier +P5746 property_type external-identifier +P5747 property_type external-identifier +P5748 property_type external-identifier +P5749 property_type external-identifier +P5750 property_type external-identifier +P5752 property_type external-identifier +P5753 property_type item +P5755 property_type external-identifier +P5756 property_type external-identifier +P5757 property_type external-identifier +P5758 property_type external-identifier +P5759 property_type external-identifier +P5760 property_type external-identifier +P5761 property_type external-identifier +P5762 property_type external-identifier +P5763 property_type external-identifier +P5764 property_type external-identifier +P5765 property_type external-identifier +P5768 property_type external-identifier +P5769 property_type item +P5770 property_type external-identifier +P5771 property_type external-identifier +P5772 property_type external-identifier +P5773 property_type external-identifier +P5774 property_type external-identifier +P5775 property_type string +P5776 property_type external-identifier +P5777 property_type external-identifier +P5778 property_type string +P5779 property_type external-identifier +P5780 property_type external-identifier +P5782 property_type external-identifier +P5783 property_type external-identifier +P5784 property_type external-identifier +P5785 property_type external-identifier +P5786 property_type external-identifier +P5787 property_type external-identifier +P5788 property_type external-identifier +P5789 property_type external-identifier +P5790 property_type external-identifier +P5791 property_type external-identifier +P5792 property_type external-identifier +P5793 property_type external-identifier +P5794 property_type external-identifier +P5795 property_type external-identifier +P5796 property_type external-identifier +P5797 property_type external-identifier +P5798 property_type string +P5799 property_type external-identifier +P5800 property_type item +P5801 property_type external-identifier +P5802 property_type item +P5803 property_type external-identifier +P3625 property_type string +P3626 property_type external-identifier +P3627 property_type external-identifier +P3628 property_type external-identifier +P3629 property_type quantity +P3630 property_type external-identifier +P3631 property_type external-identifier +P3632 property_type external-identifier +P3633 property_type external-identifier +P3634 property_type external-identifier +P3635 property_type external-identifier +P3636 property_type external-identifier +P3637 property_type external-identifier +P3638 property_type external-identifier +P3639 property_type external-identifier +P3640 property_type external-identifier +P3641 property_type external-identifier +P3642 property_type external-identifier +P3643 property_type item +P3644 property_type 
external-identifier
+P3645	property_type	external-identifier
+P3646	property_type	external-identifier
+P3647	property_type	external-identifier
+P3648	property_type	item
+P3650	property_type	item
+P3651	property_type	external-identifier
+P3652	property_type	external-identifier
+P3653	property_type	external-identifier
+P3654	property_type	external-identifier
+P3655	property_type	external-identifier
+P3656	property_type	external-identifier
+P3657	property_type	external-identifier
+P3658	property_type	external-identifier
+P3659	property_type	external-identifier
+P3660	property_type	external-identifier
+P3661	property_type	external-identifier
+P3662	property_type	external-identifier
+P3663	property_type	external-identifier
+P3664	property_type	external-identifier
+P3665	property_type	external-identifier
+P3666	property_type	external-identifier
+P3667	property_type	external-identifier
+P3668	property_type	external-identifier
+P3669	property_type	external-identifier
+P3670	property_type	external-identifier
+P3671	property_type	external-identifier
+P3672	property_type	external-identifier
+P3673	property_type	external-identifier
+P3674	property_type	external-identifier
+P3675	property_type	external-identifier
+P3676	property_type	external-identifier
+P3677	property_type	external-identifier
+P3678	property_type	external-identifier
+P3679	property_type	item
+P3680	property_type	item
+P3681	property_type	external-identifier
+P3682	property_type	external-identifier
+P3683	property_type	external-identifier
+P3684	property_type	external-identifier
+P3685	property_type	external-identifier
+P3686	property_type	external-identifier
+P3687	property_type	external-identifier
+P3689	property_type	external-identifier
+P3690	property_type	external-identifier
+P3691	property_type	external-identifier
+P3692	property_type	external-identifier
+P3693	property_type	external-identifier
+P3694	property_type	external-identifier
+P3695	property_type	external-identifier
+P3696	property_type	external-identifier
+P3697	property_type	external-identifier
+P3698	property_type	external-identifier
+P3699	property_type	external-identifier
+P3700	property_type	external-identifier
+P3701	property_type	item
+P3702	property_type	external-identifier
+P3703	property_type	external-identifier
+P3704	property_type	external-identifier
+P3705	property_type	external-identifier
+P3706	property_type	external-identifier
+P3707	property_type	external-identifier
+P3708	property_type	external-identifier
+P3709	property_type	item
+P3710	property_type	external-identifier
+P3711	property_type	external-identifier
+P3712	property_type	item
+P3713	property_type	item
+P3714	property_type	external-identifier
+P3715	property_type	external-identifier
+P3716	property_type	item
+P3717	property_type	external-identifier
+P3718	property_type	external-identifier
+P3719	property_type	item
+P3720	property_type	external-identifier
+P3721	property_type	string
+P3723	property_type	external-identifier
+P3724	property_type	external-identifier
+P3725	property_type	external-identifier
+P3726	property_type	external-identifier
+P3727	property_type	external-identifier
+P4748	property_type	external-identifier
+P4749	property_type	external-identifier
+P4750	property_type	external-identifier
+P4751	property_type	external-identifier
+P4752	property_type	external-identifier
+P4753	property_type	external-identifier
+P4754	property_type	external-identifier
+P4755	property_type	external-identifier
+P4756	property_type	external-identifier
+P4757	property_type	external-identifier
+P4758	property_type	external-identifier
+P4759	property_type	external-identifier
+P4760	property_type	external-identifier
+P4761	property_type	external-identifier
+P4762	property_type	external-identifier
+P4763	property_type	external-identifier
+P4764	property_type	external-identifier
+P4765	property_type	url
+P4766	property_type	external-identifier
+P4768	property_type	external-identifier
+P4769	property_type	external-identifier
+P4770	property_type	item
+P4771	property_type	external-identifier
+P4772	property_type	external-identifier
+P4773	property_type	external-identifier
+P4774	property_type	item
+P4775	property_type	string
+P4776	property_type	string
+P4777	property_type	item
+P4778	property_type	external-identifier
+P4779	property_type	external-identifier
+P4780	property_type	external-identifier
+P4781	property_type	external-identifier
+P4782	property_type	external-identifier
+P4783	property_type	external-identifier
+P4784	property_type	external-identifier
+P4785	property_type	external-identifier
+P4786	property_type	external-identifier
+P4787	property_type	external-identifier
+P4788	property_type	item
+P4789	property_type	external-identifier
+P4790	property_type	external-identifier
+P4791	property_type	item
+P4792	property_type	item
+P4793	property_type	external-identifier
+P4794	property_type	item
+P4795	property_type	external-identifier
+P4796	property_type	external-identifier
+P4797	property_type	external-identifier
+P4798	property_type	external-identifier
+P4799	property_type	external-identifier
+P4800	property_type	external-identifier
+P4801	property_type	external-identifier
+P4802	property_type	external-identifier
+P4803	property_type	external-identifier
+P4804	property_type	external-identifier
+P4805	property_type	item
+P4806	property_type	external-identifier
+P4807	property_type	external-identifier
+P4808	property_type	external-identifier
+P4809	property_type	item
+P4810	property_type	item
+P4811	property_type	external-identifier
+P4812	property_type	external-identifier
+P4813	property_type	external-identifier
+P4814	property_type	external-identifier
+P4815	property_type	quantity
+P4816	property_type	external-identifier
+P4818	property_type	external-identifier
+P4819	property_type	external-identifier
+P4820	property_type	external-identifier
+P4821	property_type	external-identifier
+P4822	property_type	external-identifier
+P4823	property_type	external-identifier
+P4824	property_type	external-identifier
+P4825	property_type	quantity
+P4826	property_type	quantity
+P4827	property_type	external-identifier
+P4829	property_type	external-identifier
+P4830	property_type	external-identifier
+P4831	property_type	external-identifier
+P4832	property_type	external-identifier
+P4833	property_type	external-identifier
+P4834	property_type	external-identifier
+P4835	property_type	external-identifier
+P4836	property_type	external-identifier
+P4837	property_type	string
+P4838	property_type	external-identifier
+P4839	property_type	external-identifier
+P4840	property_type	external-identifier
+P4841	property_type	quantity
+P4842	property_type	external-identifier
+P4843	property_type	item
+P4844	property_type	item
+P4845	property_type	external-identifier
+P4846	property_type	external-identifier
+P4847	property_type	external-identifier
+P4848	property_type	external-identifier
+P4849	property_type	external-identifier
+P4850	property_type	item
+P5270	property_type	external-identifier
+P5271	property_type	external-identifier
+P5272	property_type	external-identifier
+P5273	property_type	external-identifier
+P5274	property_type	external-identifier
+P5275	property_type	external-identifier
+P5276	property_type	string
+P5277	property_type	item
+P5278	property_type	item
+P5279	property_type	string
+P5280	property_type	item
+P5281	property_type	quantity
+P5282	property_type	url
+P5283	property_type	external-identifier
+P5284	property_type	external-identifier
+P5285	property_type	external-identifier
+P5286	property_type	string
+P5287	property_type	external-identifier
+P5288	property_type	external-identifier
+P5289	property_type	external-identifier
+P5290	property_type	external-identifier
+P5291	property_type	external-identifier
+P5292	property_type	external-identifier
+P5293	property_type	external-identifier
+P5294	property_type	external-identifier
+P5295	property_type	external-identifier
+P5296	property_type	external-identifier
+P5297	property_type	external-identifier
+P5298	property_type	external-identifier
+P5299	property_type	external-identifier
+P5300	property_type	external-identifier
+P5301	property_type	external-identifier
+P5302	property_type	external-identifier
+P5303	property_type	external-identifier
+P5304	property_type	item
+P5305	property_type	url
+P5306	property_type	external-identifier
+P5307	property_type	item
+P5308	property_type	external-identifier
+P5309	property_type	external-identifier
+P5310	property_type	external-identifier
+P5311	property_type	external-identifier
+P5312	property_type	external-identifier
+P5313	property_type	external-identifier
+P5314	property_type	item
+P5315	property_type	external-identifier
+P5316	property_type	external-identifier
+P5317	property_type	item
+P5318	property_type	external-identifier
+P5319	property_type	external-identifier
+P5320	property_type	external-identifier
+P5321	property_type	external-identifier
+P5323	property_type	item
+P5324	property_type	external-identifier
+P5325	property_type	external-identifier
+P5326	property_type	item
+P5327	property_type	external-identifier
+P5328	property_type	item
+P5329	property_type	external-identifier
+P5330	property_type	item
+P5331	property_type	external-identifier
+P5332	property_type	external-identifier
+P5333	property_type	external-identifier
+P5334	property_type	external-identifier
+P5335	property_type	external-identifier
+P5336	property_type	external-identifier
+P5337	property_type	external-identifier
+P5338	property_type	external-identifier
+P5339	property_type	external-identifier
+P5340	property_type	external-identifier
+P5341	property_type	external-identifier
+P5343	property_type	external-identifier
+P5344	property_type	external-identifier
+P5345	property_type	external-identifier
+P5346	property_type	external-identifier
+P5348	property_type	quantity
+P5349	property_type	quantity
+P5350	property_type	string
+P5351	property_type	string
+P5352	property_type	string
+P5353	property_type	item
+P5354	property_type	external-identifier
+P5355	property_type	external-identifier
+P5356	property_type	external-identifier
+P5357	property_type	external-identifier
+P5358	property_type	external-identifier
+P5359	property_type	external-identifier
+P5360	property_type	external-identifier
+P5361	property_type	external-identifier
+P5362	property_type	external-identifier
+P5363	property_type	external-identifier
+P5364	property_type	external-identifier
+P5365	property_type	external-identifier
+P5366	property_type	external-identifier
+P5368	property_type	external-identifier
+P5369	property_type	external-identifier
+P5370	property_type	external-identifier
+P5371	property_type	external-identifier
+P5372	property_type	external-identifier
+P5373	property_type	external-identifier
+P1560	property_type	item
+P1561	property_type	quantity
+P1562	property_type	external-identifier
+P1563	property_type	external-identifier
+P1564	property_type	external-identifier
+P1565	property_type	external-identifier
+P1566	property_type	external-identifier
+P1567	property_type	external-identifier
+P1568	property_type	item
+P1571	property_type	item
+P1573	property_type	external-identifier
+P1574	property_type	item
+P1575	property_type	external-identifier
+P1576	property_type	item
+P1577	property_type	external-identifier
+P1578	property_type	external-identifier
+P1579	property_type	external-identifier
+P1580	property_type	external-identifier
+P1581	property_type	url
+P1582	property_type	item
+P1583	property_type	external-identifier
+P1584	property_type	external-identifier
+P1585	property_type	external-identifier
+P1586	property_type	external-identifier
+P1587	property_type	external-identifier
+P1588	property_type	string
+P1589	property_type	item
+P1590	property_type	quantity
+P1591	property_type	item
+P1592	property_type	item
+P1593	property_type	item
+P1594	property_type	item
+P1595	property_type	item
+P1596	property_type	item
+P1598	property_type	item
+P1599	property_type	external-identifier
+P1600	property_type	external-identifier
+P1601	property_type	external-identifier
+P1602	property_type	external-identifier
+P1603	property_type	quantity
+P1604	property_type	item
+P1605	property_type	item
+P1606	property_type	item
+P1607	property_type	external-identifier
+P1608	property_type	external-identifier
+P1609	property_type	external-identifier
+P1610	property_type	external-identifier
+P1611	property_type	item
+P1612	property_type	string
+P1613	property_type	url
+P1614	property_type	external-identifier
+P1615	property_type	external-identifier
+P1616	property_type	external-identifier
+P1617	property_type	external-identifier
+P1618	property_type	string
+P1619	property_type	time
+P1620	property_type	item
+P1621	property_type	string
+P1622	property_type	item
+P1624	property_type	external-identifier
+P1625	property_type	item
+P1626	property_type	external-identifier
+P1627	property_type	external-identifier
+P1628	property_type	url
+P1629	property_type	item
+P1630	property_type	string
+P1631	property_type	external-identifier
+P1632	property_type	external-identifier
+P1635	property_type	monolingualtext
+P1636	property_type	time
+P1637	property_type	item
+P1638	property_type	monolingualtext
+P1639	property_type	item
+P1640	property_type	item
+P1641	property_type	quantity
+P1642	property_type	item
+P1643	property_type	item
+P1644	property_type	external-identifier
+P1645	property_type	external-identifier
+P1647	property_type	string
+P1648	property_type	external-identifier
+P1649	property_type	external-identifier
+P1650	property_type	external-identifier
+P1651	property_type	external-identifier
+P1652	property_type	item
+P1653	property_type	external-identifier
+P1654	property_type	item
+P1656	property_type	item
+P1657	property_type	item
+P1659	property_type	string
+P1660	property_type	item
+P1661	property_type	quantity
+P1662	property_type	external-identifier
+P1663	property_type	external-identifier
+P1664	property_type	external-identifier
+P1665	property_type	external-identifier
+P1666	property_type	external-identifier
+P1667	property_type	external-identifier
+P1668	property_type	external-identifier
+P1669	property_type	external-identifier
+P3728	property_type	external-identifier
+P3729	property_type	item
+P3730	property_type	item
+P3731	property_type	external-identifier
+P3732	property_type	external-identifier
+P3733	property_type	external-identifier
+P3734	property_type	item
+P3735	property_type	external-identifier
+P3736	property_type	external-identifier
+P3737	property_type	quantity
+P3738	property_type	quantity
+P3739	property_type	item
+P3740	property_type	quantity
+P3741	property_type	item
+P3742	property_type	external-identifier
+P3743	property_type	external-identifier
+P3744	property_type	quantity
+P3745	property_type	external-identifier
+P3746	property_type	external-identifier
+P3747	property_type	external-identifier
+P3748	property_type	external-identifier
+P3749	property_type	external-identifier
+P3750	property_type	external-identifier
+P3751	property_type	external-identifier
+P3752	property_type	string
+P3753	property_type	string
+P3754	property_type	string
+P3755	property_type	string
+P3756	property_type	string
+P3757	property_type	string
+P3758	property_type	external-identifier
+P3759	property_type	external-identifier
+P3760	property_type	external-identifier
+P3761	property_type	string
+P3762	property_type	external-identifier
+P3763	property_type	external-identifier
+P3764	property_type	item
+P3765	property_type	external-identifier
+P3766	property_type	external-identifier
+P3767	property_type	external-identifier
+P3768	property_type	external-identifier
+P3769	property_type	external-identifier
+P3770	property_type	external-identifier
+P3771	property_type	item
+P3772	property_type	item
+P3773	property_type	item
+P3774	property_type	item
+P3775	property_type	item
+P3776	property_type	item
+P3777	property_type	item
+P3778	property_type	item
+P3779	property_type	item
+P3780	property_type	item
+P3781	property_type	item
+P3782	property_type	external-identifier
+P3783	property_type	external-identifier
+P3784	property_type	external-identifier
+P3785	property_type	external-identifier
+P3786	property_type	external-identifier
+P3787	property_type	external-identifier
+P3788	property_type	external-identifier
+P3789	property_type	external-identifier
+P3790	property_type	external-identifier
+P3791	property_type	external-identifier
+P3792	property_type	quantity
+P3793	property_type	string
+P3794	property_type	external-identifier
+P3795	property_type	external-identifier
+P3796	property_type	external-identifier
+P3797	property_type	external-identifier
+P3798	property_type	external-identifier
+P3799	property_type	external-identifier
+P3800	property_type	external-identifier
+P3801	property_type	external-identifier
+P3802	property_type	external-identifier
+P3803	property_type	item
+P3804	property_type	external-identifier
+P3805	property_type	external-identifier
+P3806	property_type	external-identifier
+P3807	property_type	external-identifier
+P3808	property_type	external-identifier
+P3809	property_type	external-identifier
+P3810	property_type	external-identifier
+P3811	property_type	external-identifier
+P3812	property_type	external-identifier
+P3813	property_type	external-identifier
+P3814	property_type	external-identifier
+P3815	property_type	item
+P3816	property_type	item
+P3817	property_type	external-identifier
+P3818	property_type	item
+P3819	property_type	external-identifier
+P3820	property_type	external-identifier
+P3821	property_type	external-identifier
+P3822	property_type	item
+P3823	property_type	item
+P3824	property_type	external-identifier
+P3825	property_type	external-identifier
+P3826	property_type	external-identifier
+P3827	property_type	external-identifier
+P5804	property_type	item
+P5805	property_type	item
+P5806	property_type	external-identifier
+P5807	property_type	external-identifier
+P5808	property_type	external-identifier
+P5809	property_type	external-identifier
+P5810	property_type	string
+P5811	property_type	quantity
+P5813	property_type	external-identifier
+P5814	property_type	external-identifier
+P5815	property_type	external-identifier
+P5816	property_type	item
+P5817	property_type	item
+P5818	property_type	external-identifier
+P5819	property_type	external-identifier
+P5820	property_type	external-identifier
+P5821	property_type	external-identifier
+P5822	property_type	quantity
+P5823	property_type	external-identifier
+P5824	property_type	item
+P5825	property_type	string
+P5826	property_type	item
+P5827	property_type	external-identifier
+P5828	property_type	item
+P5829	property_type	external-identifier
+P5830	property_type	string
+P5831	property_type	monolingualtext
+P5832	property_type	item
+P5833	property_type	external-identifier
+P5834	property_type	external-identifier
+P5835	property_type	external-identifier
+P5836	property_type	external-identifier
+P5838	property_type	external-identifier
+P5839	property_type	external-identifier
+P5840	property_type	external-identifier
+P5841	property_type	item
+P5842	property_type	external-identifier
+P5843	property_type	external-identifier
+P5844	property_type	external-identifier
+P5845	property_type	external-identifier
+P5846	property_type	external-identifier
+P5847	property_type	external-identifier
+P5848	property_type	external-identifier
+P5849	property_type	external-identifier
+P5851	property_type	external-identifier
+P5852	property_type	item
+P5858	property_type	string
+P5859	property_type	external-identifier
+P5860	property_type	external-identifier
+P5862	property_type	external-identifier
+P5863	property_type	external-identifier
+P5864	property_type	external-identifier
+P5865	property_type	external-identifier
+P5866	property_type	external-identifier
+P5867	property_type	external-identifier
+P5868	property_type	external-identifier
+P5869	property_type	item
+P5870	property_type	external-identifier
+P5871	property_type	external-identifier
+P5872	property_type	item
+P5873	property_type	item
+P5874	property_type	external-identifier
+P5875	property_type	external-identifier
+P5876	property_type	external-identifier
+P5877	property_type	external-identifier
+P5878	property_type	string
+P5879	property_type	external-identifier
+P5880	property_type	item
+P5881	property_type	item
+P5882	property_type	external-identifier
+P5883	property_type	external-identifier
+P5884	property_type	external-identifier
+P5885	property_type	external-identifier
+P5886	property_type	item
+P5887	property_type	external-identifier
+P5888	property_type	external-identifier
+P5890	property_type	external-identifier
+P5891	property_type	external-identifier
+P5892	property_type	external-identifier
+P5893	property_type	quantity
+P5894	property_type	quantity
+P5895	property_type	quantity
+P5896	property_type	quantity
+P5897	property_type	quantity
+P5898	property_type	quantity
+P5899	property_type	quantity
+P5900	property_type	quantity
+P5901	property_type	string
+P5902	property_type	external-identifier
+P5903	property_type	external-identifier
+P5904	property_type	external-identifier
+P5905	property_type	external-identifier
+P5906	property_type	external-identifier
+P5908	property_type	external-identifier
+P5909	property_type	external-identifier
+P5910	property_type	string
+P5911	property_type	item
+P5912	property_type	external-identifier
+P5913	property_type	item
+P5914	property_type	external-identifier
+P1670	property_type	external-identifier
+P1671	property_type	string
+P1672	property_type	item
+P1673	property_type	string
+P1674	property_type	quantity
+P1675	property_type	quantity
+P1676	property_type	quantity
+P1677	property_type	item
+P1678	property_type	item
+P1679	property_type	external-identifier
+P1680	property_type	monolingualtext
+P1683	property_type	monolingualtext
+P1684	property_type	monolingualtext
+P1685	property_type	string
+P1686	property_type	item
+P1687	property_type	string
+P1689	property_type	quantity
+P1690	property_type	external-identifier
+P1691	property_type	external-identifier
+P1692	property_type	string
+P1693	property_type	external-identifier
+P1694	property_type	external-identifier
+P1695	property_type	external-identifier
+P1696	property_type	string
+P1697	property_type	quantity
+P1699	property_type	external-identifier
+P1700	property_type	external-identifier
+P1702	property_type	external-identifier
+P1703	property_type	item
+P1704	property_type	item
+P1705	property_type	monolingualtext
+P1706	property_type	item
+P1707	property_type	external-identifier
+P1708	property_type	external-identifier
+P1709	property_type	url
+P1710	property_type	external-identifier
+P1711	property_type	external-identifier
+P1712	property_type	external-identifier
+P1713	property_type	url
+P1714	property_type	external-identifier
+P1715	property_type	external-identifier
+P1716	property_type	item
+P1717	property_type	external-identifier
+P1721	property_type	string
+P1725	property_type	quantity
+P1726	property_type	external-identifier
+P1727	property_type	external-identifier
+P1728	property_type	external-identifier
+P1729	property_type	external-identifier
+P1730	property_type	external-identifier
+P1731	property_type	item
+P1732	property_type	external-identifier
+P1733	property_type	external-identifier
+P1734	property_type	time
+P1735	property_type	external-identifier
+P1736	property_type	external-identifier
+P1738	property_type	external-identifier
+P1739	property_type	external-identifier
+P1740	property_type	item
+P1741	property_type	external-identifier
+P1743	property_type	external-identifier
+P1744	property_type	external-identifier
+P1745	property_type	external-identifier
+P1746	property_type	external-identifier
+P1747	property_type	external-identifier
+P1748	property_type	string
+P1749	property_type	external-identifier
+P1750	property_type	item
+P1751	property_type	external-identifier
+P1752	property_type	quantity
+P1753	property_type	item
+P1754	property_type	item
+P1755	property_type	external-identifier
+P1760	property_type	external-identifier
+P1761	property_type	external-identifier
+P1762	property_type	string
+P1763	property_type	external-identifier
+P1764	property_type	external-identifier
+P1766	property_type	string
+P1769	property_type	external-identifier
+P1770	property_type	external-identifier
+P1771	property_type	external-identifier
+P1772	property_type	external-identifier
+P1774	property_type	item
+P1775	property_type	item
+P1776	property_type	item
+P1777	property_type	item
+P1778	property_type	item
+P1779	property_type	item
+P1780	property_type	item
+P1782	property_type	string
+P1785	property_type	string
+P1786	property_type	string
+P1787	property_type	string
+P1788	property_type	external-identifier
+P1789	property_type	item
+P1791	property_type	item
+P1792	property_type	item
+P1793	property_type	string
+P1794	property_type	external-identifier
+P7486	property_type	item
+P7487	property_type	external-identifier
+P7488	property_type	external-identifier
+P7489	property_type	external-identifier
+P7490	property_type	external-identifier
+P7491	property_type	external-identifier
+P7492	property_type	external-identifier
+P7493	property_type	external-identifier
+P7494	property_type	external-identifier
+P7495	property_type	external-identifier
+P7496	property_type	external-identifier
+P7497	property_type	external-identifier
+P7498	property_type	external-identifier
+P7499	property_type	external-identifier
+P7500	property_type	item
+P7501	property_type	item
+P7502	property_type	external-identifier
+P7503	property_type	external-identifier
+P7504	property_type	external-identifier
+P7505	property_type	external-identifier
+P7506	property_type	external-identifier
+P7507	property_type	external-identifier
+P7508	property_type	item
+P7509	property_type	external-identifier
+P7510	property_type	url
+P7511	property_type	external-identifier
+P7512	property_type	external-identifier
+P7513	property_type	external-identifier
+P7514	property_type	item
+P7515	property_type	external-identifier
+P7516	property_type	external-identifier
+P7517	property_type	external-identifier
+P7518	property_type	external-identifier
+P7519	property_type	external-identifier
+P7520	property_type	external-identifier
+P7521	property_type	external-identifier
+P7522	property_type	external-identifier
+P7523	property_type	external-identifier
+P7524	property_type	external-identifier
+P7525	property_type	external-identifier
+P7526	property_type	external-identifier
+P7527	property_type	quantity
+P7528	property_type	item
+P7529	property_type	external-identifier
+P7530	property_type	external-identifier
+P7531	property_type	external-identifier
+P7532	property_type	string
+P7533	property_type	external-identifier
+P7534	property_type	external-identifier
+P7535	property_type	monolingualtext
+P7536	property_type	external-identifier
+P7537	property_type	external-identifier
+P7538	property_type	external-identifier
+P7539	property_type	external-identifier
+P7540	property_type	external-identifier
+P7541	property_type	external-identifier
+P7542	property_type	external-identifier
+P7543	property_type	external-identifier
+P7544	property_type	external-identifier
+P7545	property_type	external-identifier
+P7546	property_type	external-identifier
+P7547	property_type	external-identifier
+P7548	property_type	external-identifier
+P7549	property_type	external-identifier
+P7550	property_type	external-identifier
+P7551	property_type	external-identifier
+P7552	property_type	external-identifier
+P7553	property_type	external-identifier
+P7554	property_type	external-identifier
+P7555	property_type	external-identifier
+P7556	property_type	external-identifier
+P7558	property_type	external-identifier
+P7559	property_type	external-identifier
+P7560	property_type	external-identifier
+P7561	property_type	item
+P7562	property_type	external-identifier
+P7563	property_type	external-identifier
+P7564	property_type	external-identifier
+P7565	property_type	external-identifier
+P7566	property_type	external-identifier
+P7567	property_type	external-identifier
+P7568	property_type	external-identifier
+P7569	property_type	url
+P7570	property_type	external-identifier
+P7571	property_type	external-identifier
+P7572	property_type	external-identifier
+P7573	property_type	item
+P7574	property_type	external-identifier
+P7575	property_type	external-identifier
+P7576	property_type	external-identifier
+P7577	property_type	external-identifier
+P7578	property_type	external-identifier
+P7579	property_type	external-identifier
+P7580	property_type	external-identifier
+P7581	property_type	string
+P7582	property_type	item
+P7583	property_type	external-identifier
+P7584	property_type	quantity
+P7585	property_type	external-identifier
+P7586	property_type	external-identifier
+P3828	property_type	item
+P3829	property_type	external-identifier
+P3830	property_type	external-identifier
+P3831	property_type	item
+P3832	property_type	external-identifier
+P3833	property_type	item
+P3834	property_type	item
+P3835	property_type	external-identifier
+P3836	property_type	external-identifier
+P3837	property_type	external-identifier
+P3838	property_type	external-identifier
+P3839	property_type	external-identifier
+P3840	property_type	quantity
+P3841	property_type	external-identifier
+P3842	property_type	item
+P3843	property_type	external-identifier
+P3844	property_type	external-identifier
+P3845	property_type	external-identifier
+P3846	property_type	external-identifier
+P3847	property_type	external-identifier
+P3848	property_type	external-identifier
+P3849	property_type	external-identifier
+P3850	property_type	external-identifier
+P3851	property_type	external-identifier
+P3852	property_type	external-identifier
+P3853	property_type	external-identifier
+P3854	property_type	external-identifier
+P3855	property_type	external-identifier
+P3856	property_type	external-identifier
+P3857	property_type	external-identifier
+P3858	property_type	item
+P3859	property_type	external-identifier
+P3860	property_type	external-identifier
+P3861	property_type	external-identifier
+P3862	property_type	external-identifier
+P3863	property_type	external-identifier
+P3864	property_type	quantity
+P3865	property_type	item
+P3866	property_type	external-identifier
+P3867	property_type	external-identifier
+P3868	property_type	external-identifier
+P3869	property_type	external-identifier
+P3870	property_type	external-identifier
+P3871	property_type	item
+P3872	property_type	quantity
+P3874	property_type	external-identifier
+P3875	property_type	external-identifier
+P3876	property_type	item
+P3877	property_type	external-identifier
+P3878	property_type	string
+P3879	property_type	string
+P3880	property_type	string
+P3881	property_type	external-identifier
+P3882	property_type	external-identifier
+P3883	property_type	external-identifier
+P3884	property_type	external-identifier
+P3885	property_type	external-identifier
+P3886	property_type	quantity
+P3887	property_type	external-identifier
+P3888	property_type	external-identifier
+P3889	property_type	external-identifier
+P3890	property_type	external-identifier
+P3891	property_type	quantity
+P3892	property_type	external-identifier
+P3893	property_type	time
+P3894	property_type	external-identifier
+P3895	property_type	external-identifier
+P3896	property_type	string
+P3897	property_type	external-identifier
+P3898	property_type	external-identifier
+P3899	property_type	external-identifier
+P3900	property_type	external-identifier
+P3901	property_type	external-identifier
+P3902	property_type	item
+P3903	property_type	string
+P3904	property_type	external-identifier
+P3906	property_type	external-identifier
+P3907	property_type	external-identifier
+P3908	property_type	external-identifier
+P3909	property_type	monolingualtext
+P3910	property_type	external-identifier
+P3911	property_type	external-identifier
+P3912	property_type	item
+P3913	property_type	external-identifier
+P3914	property_type	external-identifier
+P3915	property_type	external-identifier
+P3916	property_type	external-identifier
+P3917	property_type	quantity
+P3918	property_type	external-identifier
+P3919	property_type	item
+P3920	property_type	external-identifier
+P3921	property_type	string
+P3922	property_type	string
+P3923	property_type	external-identifier
+P3924	property_type	external-identifier
+P3925	property_type	external-identifier
+P3926	property_type	external-identifier
+P3927	property_type	external-identifier
+P3928	property_type	external-identifier
+P3929	property_type	external-identifier
+P6437	property_type	item
+P6438	property_type	quantity
+P6439	property_type	item
+P6440	property_type	item
+P6441	property_type	external-identifier
+P6442	property_type	external-identifier
+P6443	property_type	external-identifier
+P6444	property_type	external-identifier
+P6445	property_type	external-identifier
+P6446	property_type	external-identifier
+P6447	property_type	external-identifier
+P6448	property_type	external-identifier
+P6449	property_type	external-identifier
+P6450	property_type	external-identifier
+P6451	property_type	external-identifier
+P6452	property_type	item
+P6453	property_type	external-identifier
+P6454	property_type	external-identifier
+P6455	property_type	external-identifier
+P6456	property_type	external-identifier
+P6457	property_type	external-identifier
+P6458	property_type	external-identifier
+P6459	property_type	external-identifier
+P6460	property_type	external-identifier
+P6461	property_type	external-identifier
+P6462	property_type	external-identifier
+P6463	property_type	external-identifier
+P6464	property_type	external-identifier
+P6465	property_type	external-identifier
+P6466	property_type	external-identifier
+P6467	property_type	external-identifier
+P6468	property_type	external-identifier
+P6469	property_type	external-identifier
+P6470	property_type	external-identifier
+P6471	property_type	external-identifier
+P6472	property_type	external-identifier
+P6473	property_type	external-identifier
+P6474	property_type	external-identifier
+P6475	property_type	external-identifier
+P6476	property_type	external-identifier
+P6477	property_type	item
+P6478	property_type	external-identifier
+P6479	property_type	external-identifier
+P6480	property_type	external-identifier
+P6481	property_type	external-identifier
+P6482	property_type	external-identifier
+P6483	property_type	external-identifier
+P6484	property_type	external-identifier
+P6485	property_type	external-identifier
+P6486	property_type	external-identifier
+P6487	property_type	external-identifier
+P6488	property_type	external-identifier
+P6489	property_type	external-identifier
+P6490	property_type	external-identifier
+P6491	property_type	external-identifier
+P6492	property_type	external-identifier
+P6493	property_type	external-identifier
+P6494	property_type	external-identifier
+P6495	property_type	external-identifier
+P6496	property_type	external-identifier
+P6497	property_type	quantity
+P6498	property_type	quantity
+P6499	property_type	quantity
+P6500	property_type	url
+P6501	property_type	external-identifier
+P6502	property_type	external-identifier
+P6503	property_type	external-identifier
+P6504	property_type	external-identifier
+P6506	property_type	external-identifier
+P6507	property_type	string
+P6509	property_type	quantity
+P6510	property_type	quantity
+P6512	property_type	external-identifier
+P6513	property_type	external-identifier
+P6514	property_type	external-identifier
+P6515	property_type	external-identifier
+P6516	property_type	external-identifier
+P6517	property_type	external-identifier
+P6518	property_type	external-identifier
+P6519	property_type	external-identifier
+P6520	property_type	external-identifier
+P6521	property_type	external-identifier
+P6524	property_type	item
+P6525	property_type	external-identifier
+P6526	property_type	external-identifier
+P6527	property_type	external-identifier
+P6528	property_type	external-identifier
+P6529	property_type	string
+P6530	property_type	item
+P6531	property_type	item
+P6532	property_type	item
+P6533	property_type	item
+P6534	property_type	item
+P6535	property_type	external-identifier
+P6536	property_type	external-identifier
+P6537	property_type	external-identifier
+P6538	property_type	external-identifier
+P6539	property_type	external-identifier
+P6540	property_type	item
+P6541	property_type	url
+P3930	property_type	external-identifier
+P3931	property_type	item
+P3932	property_type	external-identifier
+P3933	property_type	external-identifier
+P3934	property_type	quantity
+P3935	property_type	external-identifier
+P3936	property_type	external-identifier
+P3937	property_type	external-identifier
+P3938	property_type	item
+P3939	property_type	external-identifier
+P3940	property_type	external-identifier
+P3941	property_type	external-identifier
+P3942	property_type	external-identifier
+P3943	property_type	external-identifier
+P3944	property_type	external-identifier
+P3945	property_type	external-identifier
+P3946	property_type	external-identifier
+P3948	property_type	external-identifier
+P3949	property_type	external-identifier
+P3950	property_type	url
+P3951	property_type	external-identifier
+P3952	property_type	external-identifier
+P3953	property_type	external-identifier
+P3954	property_type	external-identifier
+P3955	property_type	external-identifier
+P3956	property_type	external-identifier
+P3957	property_type	external-identifier
+P3958	property_type	external-identifier
+P3959	property_type	external-identifier
+P3960	property_type	external-identifier
+P3961	property_type	external-identifier
+P3962	property_type	external-identifier
+P3963	property_type	external-identifier
+P3964	property_type	external-identifier
+P3965	property_type	external-identifier
+P3966	property_type	item
+P3967	property_type	item
+P3968	property_type	external-identifier
+P3969	property_type	item
+P3970	property_type	string
+P3971	property_type	external-identifier
+P3972	property_type	external-identifier
+P3973	property_type	external-identifier
+P3974	property_type	external-identifier
+P3975	property_type	item
+P3976	property_type	external-identifier
+P3977	property_type	external-identifier
+P3978	property_type	external-identifier
+P3979	property_type	external-identifier
+P3980	property_type	external-identifier
+P3981	property_type	external-identifier
+P3982	property_type	external-identifier
+P3983	property_type	quantity
+P3984	property_type	external-identifier
+P3985	property_type	item
+P3986	property_type	external-identifier
+P3987	property_type	external-identifier
+P3988	property_type	external-identifier
+P3989	property_type	item
+P3990	property_type	external-identifier
+P3991	property_type	external-identifier
+P3992	property_type	external-identifier
+P3993	property_type	external-identifier
+P3994	property_type	string
+P3995	property_type	external-identifier
+P3996	property_type	external-identifier
+P3997	property_type	external-identifier
+P3998	property_type	external-identifier
+P3999	property_type	time
+P4000	property_type	item
+P4001	property_type	url
+P4002	property_type	item
+P4003	property_type	external-identifier
+P4004	property_type	string
+P4005	property_type	external-identifier
+P4006	property_type	item
+P4007	property_type	external-identifier
+P4008	property_type	external-identifier
+P4009	property_type	external-identifier
+P4010	property_type	quantity
+P4011	property_type	external-identifier
+P4012	property_type	external-identifier
+P4013	property_type	external-identifier
+P4014	property_type	external-identifier
+P4015	property_type	external-identifier
+P4016	property_type	external-identifier
+P4017	property_type	external-identifier
+P4018	property_type	external-identifier
+P4019	property_type	external-identifier
+P4020	property_type	string
+P4021	property_type	external-identifier
+P4022	property_type	external-identifier
+P4023	property_type	external-identifier
+P4024	property_type	external-identifier
+P4025	property_type	external-identifier
+P4026	property_type	external-identifier
+P4027	property_type	external-identifier
+P4028	property_type	external-identifier
+P4029	property_type	external-identifier
+P4030	property_type	external-identifier
+P5915	property_type	external-identifier
+P5916	property_type	external-identifier
+P5917	property_type	external-identifier
+P5918	property_type	external-identifier
+P5920	property_type	string
+P5921	property_type	external-identifier
+P5922	property_type	external-identifier
+P5923	property_type	item
+P5925	property_type	external-identifier
+P5926	property_type	external-identifier
+P5927	property_type	external-identifier
+P5928	property_type	external-identifier
+P5929	property_type	quantity
+P5930	property_type	external-identifier
+P5931	property_type	external-identifier
+P5932	property_type	external-identifier
+P5933	property_type	external-identifier
+P5934	property_type	external-identifier
+P5935	property_type	external-identifier
+P5936	property_type	external-identifier
+P5937	property_type	external-identifier
+P5938	property_type	external-identifier
+P5939	property_type	external-identifier
+P5940	property_type	item
+P5941	property_type	external-identifier
+P5942	property_type	external-identifier
+P5944	property_type	external-identifier
+P5945	property_type	external-identifier
+P5946	property_type	external-identifier
+P5947	property_type	quantity
+P5948	property_type	external-identifier
+P5949	property_type	string
+P5950	property_type	external-identifier
+P5951	property_type	external-identifier
+P5952	property_type	external-identifier
+P5953	property_type	external-identifier
+P5954	property_type	external-identifier
+P5955	property_type	external-identifier
+P5956	property_type	external-identifier
+P5957	property_type	external-identifier
+P5958	property_type	external-identifier
+P5959	property_type	external-identifier
+P5960	property_type	external-identifier
+P5961	property_type	item
+P5962	property_type	string
+P5963	property_type	external-identifier
+P5964	property_type	external-identifier
+P5965	property_type	external-identifier
+P5966	property_type	external-identifier
+P5967	property_type	item
+P5968	property_type	external-identifier
+P5969	property_type	external-identifier
+P5970	property_type	item
+P5971	property_type	external-identifier
+P5972	property_type	string
+P5973	property_type	string
+P5974	property_type	string
+P5975	property_type	string
+P5976	property_type	string
+P5977	property_type	string
+P5978	property_type	string
+P5979	property_type	string
+P5980	property_type	string
+P5981	property_type	external-identifier
+P5982	property_type	quantity
+P5983	property_type	external-identifier
+P5984	property_type	external-identifier
+P5985	property_type	external-identifier
+P5986	property_type	external-identifier
+P5987	property_type	external-identifier
+P5988	property_type	external-identifier
+P5989	property_type	external-identifier
+P5990	property_type	external-identifier
+P5991	property_type	quantity
+P5992	property_type	quantity
+P5993	property_type	quantity
+P5994	property_type	string
+P5995	property_type	item
+P5996	property_type	item
+P5997	property_type	string
+P5998	property_type	item
+P5999	property_type	external-identifier
+P6000	property_type	quantity
+P6001	property_type	item
+P6002	property_type	external-identifier
+P6003	property_type	external-identifier
+P6004	property_type	external-identifier
+P6005	property_type	external-identifier
+P6006	property_type	external-identifier
+P6007	property_type	external-identifier
+P6008	property_type	external-identifier
+P6009	property_type	external-identifier
+P6010	property_type	external-identifier
+P6011	property_type	external-identifier
+P6012	property_type	external-identifier
+P6013	property_type	external-identifier
+P6014	property_type	quantity
+P6015	property_type	external-identifier
+P6016	property_type	external-identifier
+P6017	property_type	external-identifier
+P1795	property_type	external-identifier
+P1796	property_type	string
+P1798	property_type	external-identifier
+P1799	property_type	external-identifier
+P1800	property_type	external-identifier
+P1801	property_type	string
+P1802	property_type	external-identifier
+P1803	property_type	external-identifier
+P1804	property_type	external-identifier
+P1806	property_type	external-identifier
+P1807	property_type	external-identifier
+P1808	property_type	external-identifier
+P1809	property_type	item
+P1810	property_type	string
+P1811	property_type	item
+P1813	property_type	monolingualtext
+P1814	property_type	string
+P1815	property_type	string
+P1816	property_type	external-identifier
+P1817	property_type	item
+P1818	property_type	external-identifier
+P1819	property_type	external-identifier
+P1820	property_type	string
+P1821	property_type	external-identifier
+P1822	property_type	external-identifier
+P1823	property_type	external-identifier
+P1824	property_type	string
+P1825	property_type	external-identifier
+P1826	property_type	external-identifier
+P1827	property_type	external-identifier
+P1828	property_type	external-identifier
+P1829	property_type	external-identifier
+P1830	property_type	item
+P1831	property_type	quantity
+P1832	property_type	external-identifier
+P1833	property_type	quantity
+P1836	property_type	string
+P1837	property_type	external-identifier
+P1838	property_type	external-identifier
+P1839	property_type	external-identifier
+P1840	property_type	item
+P1841	property_type	external-identifier
+P1842	property_type	external-identifier
+P1843	property_type	monolingualtext
+P1844	property_type	external-identifier
+P1845	property_type	string
+P1846	property_type	string
+P1847	property_type	external-identifier
+P1848	property_type	external-identifier
+P1849	property_type	external-identifier
+P1850	property_type	external-identifier
+P1851	property_type	item
+P1852	property_type	external-identifier
+P1853	property_type	item
+P1854	property_type	external-identifier
+P1855	property_type	item
+P1866	property_type	external-identifier
+P1867	property_type	quantity
+P1868	property_type	quantity
+P1869	property_type	external-identifier
+P1870	property_type	external-identifier
+P1871	property_type	external-identifier
+P1872	property_type	quantity
+P1873	property_type	quantity
+P1874	property_type	external-identifier
+P1875	property_type	item
+P1876	property_type	item
+P1877	property_type	item
+P1878	property_type	item
+P1879	property_type	item
+P1880	property_type	item
+P1881	property_type	item
+P1882	property_type	external-identifier
+P1883	property_type	external-identifier
+P1884	property_type	item
+P1885	property_type	item
+P1886	property_type	external-identifier
+P1887	property_type	item
+P1888	property_type	external-identifier
+P1889	property_type	item
+P1890	property_type	external-identifier
+P1891	property_type	item
+P1893	property_type	external-identifier
+P1894	property_type	external-identifier
+P1895	property_type	external-identifier
+P1896	property_type	url
+P1897	property_type	item
+P1898	property_type	item
+P1899	property_type	external-identifier
+P1900	property_type	external-identifier
+P1901	property_type	external-identifier
+P1902	property_type	external-identifier
+P1903	property_type	item
+P1906	property_type	item
+P1907	property_type	external-identifier
+P1908	property_type	external-identifier
+P1909	property_type	item
+P1910	property_type	item
+P1911	property_type	item
+P1912	property_type	item
+P7587	property_type	external-identifier
+P7588	property_type	time
+P7589	property_type	time
+P7590	property_type	external-identifier
+P7591	property_type	external-identifier
+P7592	property_type	external-identifier
+P7593	property_type	external-identifier
+P7594	property_type	external-identifier
+P7595	property_type	external-identifier
+P7596	property_type	external-identifier
+P7597	property_type	external-identifier
+P7598	property_type	string
+P7599	property_type	string
+P7600	property_type	item
+P7601	property_type	item
+P7602	property_type	external-identifier
+P7603	property_type	item
+P7604	property_type	item
+P7605	property_type	string
+P7606	property_type	external-identifier
+P7607	property_type	external-identifier
+P7608	property_type	external-identifier
+P7609	property_type	external-identifier
+P7610	property_type	external-identifier
+P7611	property_type	external-identifier
+P7612	property_type	external-identifier
+P7613	property_type	external-identifier
+P7614	property_type	external-identifier
+P7615	property_type	external-identifier
+P7616	property_type	external-identifier
+P7617	property_type	external-identifier
+P7618	property_type	external-identifier
+P7619	property_type	external-identifier
+P7620	property_type	external-identifier
+P7621	property_type	external-identifier
+P7622	property_type	external-identifier
+P7623	property_type	external-identifier
+P7624	property_type	external-identifier
+P7625	property_type	external-identifier
+P7626	property_type	external-identifier
+P7627	property_type	external-identifier
+P7630	property_type	external-identifier
+P7631	property_type	external-identifier
+P7632	property_type	external-identifier
+P7633	property_type	external-identifier
+P7634	property_type	external-identifier
+P7635	property_type	external-identifier
+P7636	property_type	external-identifier
+P7637	property_type	external-identifier
+P7638	property_type	external-identifier
+P7639	property_type	external-identifier
+P7641	property_type	external-identifier
+P7642	property_type	external-identifier
+P7643	property_type	item
+P7644	property_type	external-identifier
+P7645	property_type	external-identifier
+P7646	property_type	external-identifier
+P7647	property_type	external-identifier
+P7648	property_type	external-identifier
+P7649	property_type	external-identifier
+P7650	property_type	external-identifier
+P7651	property_type	external-identifier
+P7652	property_type	external-identifier
+P7653	property_type	external-identifier
+P7654	property_type	external-identifier
+P7655	property_type	external-identifier
+P7656	property_type	external-identifier
+P7657	property_type	external-identifier
+P7658	property_type	external-identifier
+P7659	property_type	external-identifier
+P7660	property_type	external-identifier
+P7661	property_type	external-identifier
+P7662	property_type	external-identifier
+P7663	property_type	external-identifier
+P7665	property_type	external-identifier
+P7666	property_type	external-identifier
+P7667	property_type	external-identifier
+P7668	property_type	quantity
+P7669	property_type	external-identifier
+P7670	property_type	external-identifier
+P7671	property_type	external-identifier
+P7672	property_type	external-identifier
+P7673	property_type	external-identifier
+P7674	property_type	external-identifier
+P7675	property_type	external-identifier
+P7676	property_type	external-identifier
+P7677	property_type	external-identifier
+P7678	property_type	external-identifier
+P7679	property_type	external-identifier
+P7680	property_type	external-identifier
+P7681	property_type	external-identifier
+P7682	property_type	external-identifier
+P7683	property_type	external-identifier
+P7684	property_type	external-identifier
+P7685	property_type	external-identifier
+P7686	property_type	external-identifier
+P7687	property_type	external-identifier
+P7688	property_type	external-identifier
+P7689	property_type	external-identifier
+P7690	property_type	external-identifier
+P6542	property_type	external-identifier
+P6543	property_type	quantity
+P6544	property_type	quantity
+P6545	property_type	quantity
+P6546	property_type	quantity
+P6547	property_type	quantity
+P6548	property_type	external-identifier
+P6549	property_type	external-identifier
+P6550	property_type	external-identifier
+P6551	property_type	external-identifier
+P6552	property_type	external-identifier
+P6553	property_type	string
+P6554	property_type	external-identifier
+P6555	property_type	external-identifier
+P6556	property_type	external-identifier
+P6557	property_type	external-identifier
+P6558	property_type	external-identifier
+P6559	property_type	external-identifier
+P6560	property_type	external-identifier
+P6561	property_type	external-identifier
+P6562	property_type	external-identifier
+P6563	property_type	item
+P6564	property_type	external-identifier
+P6565	property_type	external-identifier
+P6566	property_type	external-identifier
+P6567	property_type	external-identifier
+P6568	property_type	item
+P6569	property_type	item
+P6570	property_type	quantity
+P6571	property_type	string
+P6572	property_type	external-identifier
+P6573	property_type	external-identifier
+P6574	property_type	external-identifier
+P6575	property_type	external-identifier
+P6576	property_type	external-identifier
+P6577	property_type	external-identifier
+P6578	property_type	external-identifier
+P6579	property_type	external-identifier
+P6580	property_type	external-identifier
+P6581	property_type	external-identifier
+P6582	property_type	external-identifier
+P6583	property_type	external-identifier
+P6584	property_type	external-identifier
+P6585	property_type	external-identifier
+P6586	property_type	string
+P6587	property_type	item
+P6589	property_type	quantity
+P6590	property_type	quantity
+P6591	property_type	quantity
+P6592	property_type	string
+P6593	property_type	string
+P6594	property_type	external-identifier
+P6595	property_type	external-identifier
+P6596	property_type	external-identifier
+P6597	property_type	external-identifier
+P6598	property_type	external-identifier
+P6599	property_type	external-identifier
+P6600	property_type	external-identifier
+P6601	property_type	external-identifier
+P6602	property_type	external-identifier
+P6603	property_type	external-identifier
+P6604	property_type	string
+P6605	property_type	external-identifier
+P6606	property_type	item
+P6607	property_type	monolingualtext
+P6608	property_type	item
+P6609	property_type	string
+P6610	property_type	external-identifier
+P6611	property_type	external-identifier
+P6612	property_type	external-identifier
+P6613	property_type	external-identifier
+P6614	property_type	external-identifier
+P6615	property_type	external-identifier
+P6616	property_type	external-identifier
+P6617	property_type	external-identifier
+P6618	property_type	external-identifier
+P6619	property_type	external-identifier
+P6620	property_type	external-identifier
+P6621	property_type	external-identifier
+P6622	property_type	external-identifier
+P6623	property_type	external-identifier
+P6624	property_type	external-identifier
+P6625	property_type	external-identifier
+P6626	property_type	external-identifier
+P6627	property_type	external-identifier
+P6628	property_type	external-identifier
+P6629	property_type	external-identifier
+P6630	property_type	external-identifier
+P6631	property_type	external-identifier
+P6632	property_type	external-identifier
+P6633	property_type	external-identifier
+P6634	property_type	external-identifier
+P6635	property_type	external-identifier
+P6636	property_type	external-identifier
+P6637	property_type	external-identifier
+P6639	property_type	quantity
+P6640	property_type	external-identifier
+P6641	property_type	external-identifier
+P6643	property_type	external-identifier
+P6644	property_type	external-identifier
+P7792	property_type	external-identifier
+P7793	property_type	string
+P7794	property_type	external-identifier
+P7795	property_type	external-identifier
+P7796	property_type	external-identifier
+P7797	property_type	external-identifier
+P7798	property_type	external-identifier
+P7799	property_type	external-identifier
+P7800	property_type	external-identifier
+P7801	property_type	external-identifier
+P7802	property_type	external-identifier
+P7803	property_type	external-identifier
+P7804	property_type	external-identifier
+P7805	property_type	external-identifier
+P7806	property_type	external-identifier
+P7807	property_type	external-identifier
+P7808	property_type	external-identifier
+P7809	property_type	external-identifier
+P7810	property_type	external-identifier
+P7811	property_type	external-identifier
+P7812	property_type	external-identifier
+P7813	property_type	external-identifier
+P7814	property_type	external-identifier
+P7815	property_type	external-identifier
+P7816	property_type	external-identifier
+P7817	property_type	external-identifier
+P7818	property_type	external-identifier
+P7819	property_type	external-identifier
+P7820	property_type	external-identifier
+P7821	property_type	external-identifier
+P7822	property_type	external-identifier
+P7823	property_type	external-identifier
+P7824	property_type	external-identifier
+P7825	property_type	external-identifier
+P7826	property_type	external-identifier
+P7827	property_type	external-identifier
+P7828	property_type	external-identifier
+P7829	property_type	external-identifier
+P7830	property_type	external-identifier
+P7831	property_type	external-identifier
+P7832	property_type	external-identifier
+P7834	property_type	external-identifier
+P7835	property_type	external-identifier
+P7836	property_type	external-identifier
+P7837	property_type	external-identifier
+P7838	property_type	external-identifier
+P7839	property_type	external-identifier
+P7840	property_type	external-identifier
+P7841	property_type	external-identifier
+P7842	property_type	external-identifier
+P7843	property_type	external-identifier
+P7844	property_type	external-identifier
+P7845	property_type	external-identifier
+P7846	property_type	external-identifier
+P7847	property_type	external-identifier
+P7848	property_type	external-identifier
+P7849	property_type	external-identifier
+P7850	property_type	external-identifier
+P7851	property_type	external-identifier
+P7852	property_type	external-identifier
+P7853	property_type	external-identifier
+P7854	property_type	external-identifier
+P7855	property_type	string
+P7856	property_type	external-identifier
+P7857	property_type	external-identifier
+P7858	property_type	external-identifier
+P7859	property_type	external-identifier
+P7860	property_type	external-identifier
+P7861	property_type	item
+P7862	property_type	quantity
+P7863	property_type	quantity
+P7864	property_type	external-identifier
+P7865	property_type	external-identifier
+P7866	property_type	external-identifier
+P7867	property_type	item
+P7868	property_type	external-identifier
+P7869	property_type	external-identifier
+P7870	property_type	external-identifier
+P7871	property_type	external-identifier
+P7872	property_type	external-identifier
+P7873	property_type	external-identifier
+P7874	property_type	external-identifier
+P7875	property_type	external-identifier
+P7876	property_type	external-identifier
+P7877	property_type	external-identifier
+P7878	property_type	external-identifier
+P7879	property_type	external-identifier
+P7880	property_type	external-identifier
+P7881	property_type	external-identifier
+P7882	property_type	external-identifier
+P7883	property_type	external-identifier
+P7884	property_type	external-identifier
+P7885	property_type	external-identifier
+P7886	property_type	external-identifier
+P7887	property_type	quantity
+P7888	property_type	item
+P7889	property_type	external-identifier
+P7890	property_type	url
+P7891	property_type	external-identifier
+P7892	property_type	external-identifier
+P6018	property_type	external-identifier
+P6019	property_type	external-identifier
+P6020	property_type	external-identifier
+P6021	property_type	external-identifier
+P6022	property_type	item
+P6023	property_type	external-identifier
+P6024	property_type	external-identifier
+P6025	property_type	external-identifier
+P6028	property_type	external-identifier
+P6030	property_type	external-identifier
+P6032	property_type	external-identifier
+P6033	property_type	external-identifier
+P6034	property_type	external-identifier
+P6035	property_type	external-identifier
+P6036	property_type	external-identifier
+P6037	property_type	external-identifier
+P6038	property_type	external-identifier
+P6039	property_type	external-identifier
+P6040	property_type	external-identifier
+P6041	property_type	external-identifier
+P6042	property_type	external-identifier
+P6043	property_type	external-identifier
+P6044	property_type	external-identifier
+P6045	property_type	external-identifier
+P6046	property_type	external-identifier
+P6047	property_type	external-identifier
+P6048	property_type	external-identifier
+P6049	property_type	external-identifier
+P6050	property_type	external-identifier
+P6051	property_type	external-identifier
+P6052	property_type	external-identifier
+P6053	property_type	external-identifier
+P6054	property_type	external-identifier
+P6055	property_type	external-identifier
+P6056	property_type	external-identifier
+P6057	property_type	external-identifier
+P6058	property_type	external-identifier
+P6059	property_type	external-identifier
+P6060	property_type	external-identifier
+P6061	property_type	external-identifier
+P6062	property_type	external-identifier
+P6063	property_type	external-identifier
+P6064	property_type	external-identifier
+P6065	property_type	external-identifier
+P6066	property_type	external-identifier
+P6067	property_type	external-identifier
+P6068	property_type	external-identifier
+P6069	property_type	quantity
+P6070	property_type	external-identifier
+P6071	property_type	external-identifier
+P6072	property_type	string
+P6073	property_type	quantity
+P6075	property_type	quantity
+P6076	property_type	quantity
+P6077	property_type	external-identifier
+P6078	property_type	external-identifier
+P6079	property_type	external-identifier
+P6080	property_type	external-identifier
+P6081	property_type	external-identifier
+P6082	property_type	external-identifier
+P6083	property_type	external-identifier
+P6084	property_type	item
+P6086	property_type	item
+P6087	property_type	item
+P6088	property_type	quantity
+P6089	property_type	quantity
+P6090	property_type	external-identifier
+P6091	property_type	external-identifier
+P6092	property_type	external-identifier
+P6093	property_type	external-identifier
+P6094	property_type	external-identifier
+P6095	property_type	item
+P6096	property_type	external-identifier
+P6097	property_type	external-identifier
+P6098	property_type	external-identifier
+P6099	property_type	item
+P6100	property_type	external-identifier
+P6101	property_type	external-identifier
+P6102	property_type	external-identifier
+P6103	property_type	external-identifier
+P6104	property_type	item
+P6105	property_type	external-identifier
+P6106	property_type	item
+P6107	property_type	url
+P6108	property_type	url
+P6109	property_type	external-identifier
+P6110	property_type	external-identifier
+P6111	property_type	external-identifier
+P6112	property_type	item
+P6113	property_type	external-identifier
+P6114	property_type	external-identifier
+P6115	property_type	external-identifier
+P6116	property_type	item
+P6117	property_type	external-identifier
+P6118	property_type	item
+P6119	property_type	external-identifier
+P6120	property_type	external-identifier
+P6122	property_type	external-identifier
+P6123	property_type	external-identifier
+P6124	property_type	external-identifier
+P7691	property_type	external-identifier
+P7692	property_type	external-identifier
+P7693	property_type	external-identifier
+P7694	property_type	external-identifier
+P7695	property_type	external-identifier
+P7696	property_type	external-identifier
+P7697	property_type	external-identifier
+P7698	property_type	external-identifier
+P7699	property_type	external-identifier
+P7700	property_type	external-identifier
+P7701	property_type	external-identifier
+P7702	property_type	external-identifier
+P7703	property_type	external-identifier
+P7704	property_type	external-identifier
+P7705	property_type	url
+P7706	property_type	string
+P7707	property_type	url
+P7708	property_type	external-identifier
+P7709	property_type	external-identifier
+P7710	property_type	external-identifier
+P7711	property_type	external-identifier
+P7712	property_type	external-identifier
+P7713	property_type	external-identifier
+P7714	property_type	external-identifier
+P7715	property_type	external-identifier
+P7716	property_type	external-identifier
+P7717	property_type	external-identifier
+P7718	property_type	external-identifier
+P7719	property_type	item
+P7720	property_type	external-identifier
+P7721	property_type	external-identifier
+P7722	property_type	external-identifier
+P7723	property_type	external-identifier
+P7724	property_type	external-identifier
+P7725	property_type	quantity
+P7726	property_type	external-identifier
+P7727	property_type	item
+P7729	property_type	external-identifier
+P7730	property_type	external-identifier
+P7731	property_type	external-identifier
+P7732	property_type	external-identifier
+P7733	property_type	external-identifier
+P7734	property_type	external-identifier
+P7735	property_type	external-identifier
+P7736	property_type	external-identifier
+P7737	property_type	external-identifier
+P7738	property_type	external-identifier
+P7739	property_type	external-identifier
+P7740	property_type	external-identifier
+P7741	property_type	external-identifier
+P7742	property_type	external-identifier
+P7743	property_type	external-identifier
+P7744	property_type	external-identifier
+P7745	property_type	external-identifier
+P7746	property_type	external-identifier
+P7747	property_type	external-identifier
+P7748	property_type	external-identifier
+P7749	property_type	external-identifier
+P7750	property_type	external-identifier
+P7751	property_type	external-identifier
+P7752	property_type	external-identifier
+P7753	property_type	external-identifier
+P7754	property_type	external-identifier
+P7755	property_type	external-identifier
+P7756	property_type	external-identifier
+P7757	property_type	external-identifier
+P7758	property_type	external-identifier
+P7759	property_type	external-identifier
+P7760	property_type	external-identifier
+P7761	property_type	external-identifier
+P7762	property_type	external-identifier
+P7763	property_type	item
+P7764	property_type	external-identifier
+P7765	property_type	external-identifier
+P7766	property_type	external-identifier
+P7767	property_type	item
+P7768	property_type	external-identifier
+P7769	property_type	external-identifier
+P7770	property_type	quantity
+P7771	property_type	external-identifier
+P7772	property_type	external-identifier
+P7773	property_type	external-identifier
+P7774	property_type	external-identifier
+P7775	property_type	external-identifier
+P7776	property_type	external-identifier
+P7777	property_type	external-identifier
+P7778	property_type	external-identifier
+P7779	property_type	item
+P7780	property_type	external-identifier
+P7781	property_type	item
+P7782	property_type	item
+P7783	property_type	external-identifier
+P7784	property_type	external-identifier
+P7785	property_type	external-identifier
+P7786	property_type	external-identifier
+P7787	property_type	quantity
+P7788	property_type	external-identifier
+P7789	property_type	external-identifier
+P7790	property_type	external-identifier
+P7791	property_type	external-identifier
+P7893	property_type	external-identifier
+P7894	property_type	external-identifier
+P7895	property_type	external-identifier
+P7896	property_type	external-identifier
+P7897	property_type	external-identifier
+P7898	property_type	external-identifier
+P7899	property_type	external-identifier
+P7900	property_type	external-identifier
+P7901	property_type	external-identifier
+P7902	property_type	external-identifier
+P7903	property_type	item
+P7904	property_type	item
+P7905	property_type	external-identifier
+P7906	property_type	external-identifier
+P7907	property_type	external-identifier
+P7908	property_type	external-identifier
+P7909	property_type	external-identifier
+P7910	property_type	external-identifier
+P7911	property_type	external-identifier
+P7912	property_type	external-identifier
+P7913	property_type	external-identifier
+P7914	property_type	external-identifier
+P7915	property_type	external-identifier
+P7916	property_type	external-identifier
+P7917	property_type	external-identifier
+P7918	property_type	external-identifier
+P7919	property_type	external-identifier
+P7920	property_type	external-identifier
+P7921	property_type	external-identifier
+P7922	property_type	external-identifier
+P7923	property_type	external-identifier
+P7924	property_type	external-identifier
+P7925	property_type	external-identifier
+P7926	property_type	external-identifier
+P7927	property_type	external-identifier
+P7928	property_type	external-identifier
+P7929	property_type	external-identifier
+P7930	property_type	url
+P7931	property_type	external-identifier
+P7932	property_type	external-identifier
+P7934	property_type	external-identifier
+P7935	property_type	external-identifier
+P7936	property_type	item
+P7937	property_type	item
+P7938	property_type	item
+P7939	property_type	external-identifier
+P7940	property_type	external-identifier
+P7941	property_type	external-identifier
+P7942	property_type	external-identifier
+P7943	property_type	external-identifier
+P7944	property_type	external-identifier
+P7945	property_type	external-identifier
+P7946	property_type	external-identifier
+P7947	property_type	external-identifier
+P7948	property_type	external-identifier
+P7949	property_type	external-identifier
+P7950	property_type	external-identifier
+P7951	property_type	external-identifier
+P7952	property_type	external-identifier
+P7953	property_type	external-identifier
+P7954	property_type	external-identifier
+P7955	property_type	external-identifier
+P7956	property_type	external-identifier
+P7957	property_type	external-identifier
+P7958	property_type	external-identifier
+P7959	property_type	item
+P7960	property_type	external-identifier
+P7961	property_type	external-identifier
+P7962	property_type	external-identifier
+P7963	property_type	external-identifier
+P7964	property_type	string
+P7965	property_type	external-identifier
+P7966	property_type	external-identifier
+P7967	property_type	external-identifier
+P7968	property_type	external-identifier
+P7969	property_type	external-identifier
+P7970	property_type	external-identifier
+P7971	property_type	quantity
+P7972	property_type	external-identifier
+P7973	property_type	string
+P7974	property_type	external-identifier
+P7975	property_type	external-identifier
+P7976	property_type	external-identifier
+P7977	property_type	external-identifier
+P7978	property_type	external-identifier
+P7979	property_type	external-identifier
+P7980	property_type	external-identifier
+P7981	property_type	external-identifier
+P7982	property_type	external-identifier
+P7983	property_type	external-identifier
+P7984	property_type	item
+P7985	property_type	external-identifier
+P7986	property_type	external-identifier
+P7987	property_type	external-identifier
+P7988	property_type	external-identifier
+P7989	property_type	external-identifier
+P7990	property_type	external-identifier
+P7991	property_type	external-identifier
+P7992	property_type	external-identifier
+P7993	property_type	external-identifier
+P6125	property_type	quantity
+P6126	property_type	external-identifier
+P6127	property_type	external-identifier
+P6128	property_type	external-identifier
+P6130	property_type	external-identifier
+P6131	property_type	external-identifier
+P6132	property_type	external-identifier
+P6133	property_type	external-identifier
+P6134	property_type	external-identifier
+P6135	property_type	external-identifier
+P6136	property_type	external-identifier
+P6137	property_type	external-identifier
+P6138	property_type	external-identifier
+P6139	property_type	external-identifier
+P6140	property_type	external-identifier
+P6141	property_type	external-identifier
+P6142	property_type	external-identifier
+P6143	property_type	external-identifier
+P6144	property_type	external-identifier
+P6145	property_type	external-identifier
+P6146	property_type	external-identifier
+P6147	property_type	external-identifier
+P6148	property_type	external-identifier
+P6149	property_type	item
+P6150	property_type	external-identifier
+P6151	property_type	external-identifier
+P6152	property_type	external-identifier
+P6153	property_type	item
+P6154	property_type	external-identifier
+P6155	property_type	external-identifier
+P6156	property_type	external-identifier
+P6157	property_type	external-identifier
+P6158	property_type	external-identifier
+P6159	property_type	external-identifier
+P6160	property_type	external-identifier
+P6161	property_type	external-identifier
+P6162	property_type	external-identifier
+P6163	property_type	external-identifier
+P6164	property_type	external-identifier
+P6165	property_type	external-identifier
+P6166	property_type	item
+P6167	property_type	external-identifier
+P6168	property_type	external-identifier
+P6169	property_type	external-identifier
+P6170	property_type	external-identifier
+P6171	property_type	external-identifier
+P6172	property_type	external-identifier
+P6173	property_type	external-identifier
+P6174	property_type	external-identifier
+P6175	property_type	external-identifier
+P6176	property_type	external-identifier
+P6177	property_type	external-identifier
+P6178	property_type	external-identifier
+P6179	property_type	external-identifier
+P6180	property_type	external-identifier
+P6181	property_type	external-identifier
+P6182	property_type	external-identifier
+P6183	property_type	external-identifier
+P6184	property_type	item
+P6185	property_type	item
+P6186	property_type	item
+P6187	property_type	external-identifier
+P6188	property_type	external-identifier
+P6189	property_type	external-identifier
+P6190	property_type	external-identifier
+P6191	property_type	item
+P6192	property_type	external-identifier
+P6193	property_type	item
+P6194	property_type	external-identifier
+P6195	property_type	item
+P6196	property_type	external-identifier
+P6197	property_type	external-identifier
+P6198	property_type	external-identifier
+P6199	property_type	external-identifier
+P6200	property_type	external-identifier
+P6201	property_type	external-identifier
+P6202	property_type	external-identifier
+P6204	property_type	external-identifier
+P6205	property_type	external-identifier
+P6206	property_type	external-identifier
+P6208	property_type	monolingualtext
+P6209	property_type	external-identifier
+P6210	property_type	external-identifier
+P6211	property_type	external-identifier
+P6212	property_type	item
+P6213	property_type	external-identifier
+P6214	property_type	external-identifier
+P6215	property_type	external-identifier
+P6216	property_type	item
+P6217	property_type	external-identifier
+P6218	property_type	external-identifier
+P6219	property_type	external-identifier
+P6220	property_type	external-identifier
+P6221	property_type	external-identifier
+P6222	property_type	external-identifier
+P6223	property_type	external-identifier
+P6224	property_type	item
+P6225	property_type	external-identifier
+P6226	property_type	external-identifier
+P6227	property_type	external-identifier
+P6645	property_type	external-identifier
+P6646	property_type
external-identifier +P6647 property_type external-identifier +P6648 property_type string +P6649 property_type external-identifier +P6650 property_type external-identifier +P6652 property_type external-identifier +P6653 property_type external-identifier +P6654 property_type external-identifier +P6655 property_type string +P6656 property_type external-identifier +P6657 property_type item +P6658 property_type item +P6659 property_type external-identifier +P6660 property_type external-identifier +P6661 property_type external-identifier +P6662 property_type external-identifier +P6663 property_type external-identifier +P6664 property_type external-identifier +P6665 property_type external-identifier +P6666 property_type external-identifier +P6667 property_type external-identifier +P6668 property_type external-identifier +P6669 property_type external-identifier +P6670 property_type string +P6671 property_type external-identifier +P6672 property_type external-identifier +P6673 property_type external-identifier +P6674 property_type external-identifier +P6676 property_type external-identifier +P6677 property_type external-identifier +P6678 property_type external-identifier +P6679 property_type external-identifier +P6680 property_type external-identifier +P6681 property_type external-identifier +P6682 property_type external-identifier +P6683 property_type external-identifier +P6684 property_type item +P6685 property_type string +P6686 property_type string +P6687 property_type external-identifier +P6688 property_type external-identifier +P6689 property_type external-identifier +P6690 property_type external-identifier +P6691 property_type external-identifier +P6692 property_type external-identifier +P6693 property_type external-identifier +P6694 property_type external-identifier +P6695 property_type quantity +P6696 property_type external-identifier +P6697 property_type quantity +P6698 property_type external-identifier +P6699 property_type external-identifier +P6700 property_type external-identifier +P6701 property_type external-identifier +P6702 property_type external-identifier +P6703 property_type external-identifier +P6704 property_type external-identifier +P6705 property_type external-identifier +P6706 property_type external-identifier +P6707 property_type quantity +P6708 property_type quantity +P6709 property_type string +P6710 property_type quantity +P6711 property_type quantity +P6712 property_type string +P6713 property_type external-identifier +P6714 property_type external-identifier +P6715 property_type external-identifier +P6716 property_type external-identifier +P6717 property_type external-identifier +P6718 property_type item +P6719 property_type string +P6720 property_type external-identifier +P6721 property_type external-identifier +P6722 property_type external-identifier +P6723 property_type external-identifier +P6724 property_type external-identifier +P6725 property_type external-identifier +P6726 property_type external-identifier +P6727 property_type external-identifier +P6728 property_type external-identifier +P6729 property_type external-identifier +P6730 property_type external-identifier +P6731 property_type item +P6732 property_type external-identifier +P6733 property_type string +P6734 property_type external-identifier +P6735 property_type external-identifier +P6736 property_type external-identifier +P6737 property_type external-identifier +P6738 property_type external-identifier +P6739 property_type external-identifier +P6740 property_type external-identifier +P6741 property_type 
external-identifier +P6742 property_type external-identifier +P6743 property_type external-identifier +P6744 property_type external-identifier +P6745 property_type external-identifier +P6746 property_type external-identifier +P7994 property_type external-identifier +P7995 property_type external-identifier +P7996 property_type external-identifier +P7997 property_type external-identifier +P7998 property_type external-identifier +P7999 property_type external-identifier +P8000 property_type string +P8001 property_type item +P8002 property_type external-identifier +P8003 property_type external-identifier +P8004 property_type item +P8005 property_type item +P8006 property_type item +P8007 property_type external-identifier +P8008 property_type external-identifier +P8009 property_type string +P8010 property_type quantity +P8011 property_type quantity +P8012 property_type external-identifier +P8013 property_type external-identifier +P8014 property_type external-identifier +P8015 property_type external-identifier +P8016 property_type external-identifier +P8017 property_type string +P8018 property_type external-identifier +P8019 property_type external-identifier +P8020 property_type external-identifier +P8021 property_type external-identifier +P8022 property_type external-identifier +P8023 property_type external-identifier +P8024 property_type external-identifier +P8025 property_type external-identifier +P8026 property_type item +P8027 property_type external-identifier +P8028 property_type external-identifier +P8029 property_type external-identifier +P8030 property_type item +P8031 property_type item +P8032 property_type item +P8033 property_type external-identifier +P8034 property_type external-identifier +P8035 property_type external-identifier +P8036 property_type external-identifier +P8037 property_type external-identifier +P8038 property_type external-identifier +P8039 property_type external-identifier +P8040 property_type external-identifier +P8041 property_type external-identifier +P8042 property_type external-identifier +P8043 property_type external-identifier +P8044 property_type external-identifier +P8045 property_type item +P8046 property_type string +P8047 property_type item +P8048 property_type external-identifier +P8049 property_type quantity +P8050 property_type external-identifier +P8051 property_type external-identifier +P8052 property_type external-identifier +P8053 property_type external-identifier +P8054 property_type string +P8055 property_type external-identifier +P8056 property_type external-identifier +P8057 property_type external-identifier +P8058 property_type item +P8059 property_type external-identifier +P8060 property_type external-identifier +P8061 property_type external-identifier +P8062 property_type external-identifier +P8063 property_type external-identifier +P8064 property_type external-identifier +P8065 property_type external-identifier +P8066 property_type external-identifier +P8067 property_type external-identifier +P8068 property_type external-identifier +P8069 property_type external-identifier +P8070 property_type external-identifier +P8071 property_type external-identifier +P8072 property_type external-identifier +P8073 property_type external-identifier +P8074 property_type external-identifier +P8075 property_type external-identifier +P8076 property_type external-identifier +P8077 property_type external-identifier +P8078 property_type external-identifier +P8079 property_type external-identifier +P8080 property_type external-identifier +P8081 property_type 
external-identifier +P8082 property_type external-identifier +P8083 property_type external-identifier +P8084 property_type external-identifier +P8085 property_type external-identifier +P8086 property_type external-identifier +P8087 property_type external-identifier +P8088 property_type external-identifier +P8089 property_type external-identifier +P8090 property_type external-identifier +P8091 property_type external-identifier +P8092 property_type external-identifier +P8093 property_type quantity +P8094 property_type external-identifier +P8095 property_type external-identifier +P8096 property_type external-identifier +P8097 property_type item +P8098 property_type external-identifier +P8099 property_type external-identifier +P8100 property_type external-identifier +P8101 property_type external-identifier +P8102 property_type external-identifier +P8103 property_type external-identifier +P8104 property_type external-identifier +P8105 property_type external-identifier +P8106 property_type external-identifier +P8107 property_type item +P8108 property_type external-identifier +P8109 property_type external-identifier +P8110 property_type external-identifier +P8111 property_type item +P8112 property_type string +P8113 property_type external-identifier +P8114 property_type external-identifier +P8115 property_type item +P8116 property_type external-identifier +P8117 property_type external-identifier +P8118 property_type external-identifier +P8119 property_type external-identifier +P8120 property_type external-identifier +P8121 property_type external-identifier +P8122 property_type external-identifier +P8123 property_type external-identifier +P8124 property_type external-identifier +P8125 property_type external-identifier +P8126 property_type external-identifier +P8127 property_type item +P8128 property_type external-identifier +P8129 property_type external-identifier +P8130 property_type external-identifier +P8131 property_type item +P8132 property_type external-identifier +P6228 property_type external-identifier +P6229 property_type external-identifier +P6230 property_type external-identifier +P6231 property_type external-identifier +P6232 property_type external-identifier +P6233 property_type external-identifier +P6234 property_type external-identifier +P6235 property_type external-identifier +P6237 property_type item +P6238 property_type external-identifier +P6239 property_type external-identifier +P6240 property_type external-identifier +P6241 property_type item +P6242 property_type external-identifier +P6243 property_type item +P6244 property_type external-identifier +P6245 property_type external-identifier +P6246 property_type external-identifier +P6247 property_type external-identifier +P6248 property_type external-identifier +P6249 property_type quantity +P6250 property_type external-identifier +P6251 property_type monolingualtext +P6252 property_type external-identifier +P6253 property_type external-identifier +P6254 property_type string +P6255 property_type external-identifier +P6256 property_type external-identifier +P6257 property_type quantity +P6258 property_type quantity +P6259 property_type item +P6260 property_type quantity +P6261 property_type quantity +P6262 property_type external-identifier +P6263 property_type external-identifier +P6264 property_type external-identifier +P6265 property_type external-identifier +P6266 property_type external-identifier +P6267 property_type external-identifier +P6268 property_type external-identifier +P6269 property_type url +P6271 property_type 
item +P6272 property_type quantity +P6274 property_type quantity +P6275 property_type item +P6276 property_type external-identifier +P6277 property_type external-identifier +P6278 property_type external-identifier +P6279 property_type external-identifier +P6280 property_type string +P6281 property_type external-identifier +P6282 property_type external-identifier +P6283 property_type external-identifier +P6284 property_type external-identifier +P6285 property_type external-identifier +P6286 property_type external-identifier +P6287 property_type external-identifier +P6288 property_type external-identifier +P6289 property_type external-identifier +P6290 property_type external-identifier +P6291 property_type item +P6292 property_type external-identifier +P6293 property_type external-identifier +P6294 property_type external-identifier +P6295 property_type external-identifier +P6296 property_type external-identifier +P6297 property_type external-identifier +P6298 property_type external-identifier +P6299 property_type external-identifier +P6300 property_type external-identifier +P6301 property_type external-identifier +P6302 property_type external-identifier +P6303 property_type external-identifier +P6304 property_type external-identifier +P6305 property_type external-identifier +P6306 property_type external-identifier +P6307 property_type external-identifier +P6308 property_type external-identifier +P6309 property_type external-identifier +P6310 property_type external-identifier +P6311 property_type external-identifier +P6312 property_type external-identifier +P6313 property_type external-identifier +P6314 property_type external-identifier +P6315 property_type external-identifier +P6316 property_type external-identifier +P6317 property_type external-identifier +P6318 property_type external-identifier +P6319 property_type external-identifier +P6320 property_type external-identifier +P6321 property_type external-identifier +P6322 property_type external-identifier +P6323 property_type external-identifier +P6324 property_type external-identifier +P6325 property_type external-identifier +P6326 property_type external-identifier +P6327 property_type external-identifier +P6328 property_type external-identifier +P6329 property_type external-identifier +P6330 property_type external-identifier +P6747 property_type external-identifier +P6748 property_type external-identifier +P6749 property_type external-identifier +P6750 property_type external-identifier +P6751 property_type external-identifier +P6752 property_type external-identifier +P6753 property_type quantity +P6754 property_type external-identifier +P6756 property_type external-identifier +P6757 property_type quantity +P6758 property_type item +P6759 property_type external-identifier +P6760 property_type external-identifier +P6761 property_type external-identifier +P6762 property_type external-identifier +P6763 property_type external-identifier +P6764 property_type external-identifier +P6765 property_type external-identifier +P6766 property_type external-identifier +P6767 property_type external-identifier +P6768 property_type external-identifier +P6769 property_type external-identifier +P6770 property_type external-identifier +P6771 property_type external-identifier +P6772 property_type external-identifier +P6773 property_type external-identifier +P6774 property_type external-identifier +P6775 property_type external-identifier +P6776 property_type external-identifier +P6777 property_type external-identifier +P6778 property_type external-identifier 
+P6780 property_type external-identifier +P6781 property_type external-identifier +P6782 property_type external-identifier +P6783 property_type external-identifier +P6784 property_type external-identifier +P6785 property_type external-identifier +P6786 property_type external-identifier +P6787 property_type external-identifier +P6788 property_type external-identifier +P6789 property_type quantity +P6790 property_type quantity +P6791 property_type external-identifier +P6792 property_type external-identifier +P6793 property_type string +P6794 property_type quantity +P6795 property_type external-identifier +P6796 property_type external-identifier +P6797 property_type external-identifier +P6798 property_type string +P6799 property_type external-identifier +P6800 property_type url +P6801 property_type quantity +P6802 property_type string +P6803 property_type item +P6804 property_type external-identifier +P6805 property_type external-identifier +P6806 property_type external-identifier +P6807 property_type external-identifier +P6808 property_type external-identifier +P6809 property_type external-identifier +P6810 property_type external-identifier +P6811 property_type external-identifier +P6812 property_type external-identifier +P6813 property_type external-identifier +P6814 property_type external-identifier +P6815 property_type external-identifier +P6816 property_type external-identifier +P6817 property_type external-identifier +P6818 property_type url +P6819 property_type item +P6820 property_type external-identifier +P6821 property_type external-identifier +P6822 property_type external-identifier +P6823 property_type external-identifier +P6824 property_type string +P6825 property_type external-identifier +P6826 property_type quantity +P6827 property_type external-identifier +P6828 property_type external-identifier +P6829 property_type external-identifier +P6830 property_type external-identifier +P6831 property_type external-identifier +P6832 property_type external-identifier +P6833 property_type monolingualtext +P6835 property_type string +P6836 property_type external-identifier +P6837 property_type external-identifier +P6838 property_type external-identifier +P6839 property_type external-identifier +P6840 property_type item +P6841 property_type external-identifier +P6842 property_type external-identifier +P6843 property_type external-identifier +P6844 property_type external-identifier +P6845 property_type external-identifier +P6846 property_type external-identifier +P6847 property_type external-identifier +P6848 property_type external-identifier +P6849 property_type external-identifier +P6331 property_type external-identifier +P6332 property_type external-identifier +P6333 property_type monolingualtext +P6334 property_type external-identifier +P6335 property_type external-identifier +P6336 property_type external-identifier +P6337 property_type external-identifier +P6338 property_type item +P6339 property_type item +P6340 property_type external-identifier +P6341 property_type external-identifier +P6342 property_type external-identifier +P6343 property_type quantity +P6344 property_type quantity +P6346 property_type monolingualtext +P6347 property_type external-identifier +P6348 property_type external-identifier +P6349 property_type external-identifier +P6350 property_type external-identifier +P6351 property_type external-identifier +P6352 property_type external-identifier +P6353 property_type external-identifier +P6354 property_type quantity +P6355 property_type external-identifier +P6356 
property_type external-identifier +P6357 property_type external-identifier +P6358 property_type external-identifier +P6359 property_type external-identifier +P6360 property_type external-identifier +P6361 property_type external-identifier +P6362 property_type external-identifier +P6363 property_type url +P6364 property_type item +P6365 property_type item +P6366 property_type external-identifier +P6367 property_type external-identifier +P6368 property_type external-identifier +P6369 property_type external-identifier +P6370 property_type external-identifier +P6371 property_type external-identifier +P6372 property_type external-identifier +P6373 property_type external-identifier +P6374 property_type external-identifier +P6375 property_type monolingualtext +P6376 property_type external-identifier +P6377 property_type external-identifier +P6378 property_type url +P6379 property_type item +P6381 property_type external-identifier +P6382 property_type external-identifier +P6383 property_type external-identifier +P6384 property_type external-identifier +P6385 property_type external-identifier +P6386 property_type external-identifier +P6387 property_type external-identifier +P6388 property_type external-identifier +P6389 property_type external-identifier +P6390 property_type external-identifier +P6391 property_type external-identifier +P6392 property_type external-identifier +P6394 property_type external-identifier +P6395 property_type external-identifier +P6398 property_type external-identifier +P6399 property_type external-identifier +P6400 property_type external-identifier +P6401 property_type external-identifier +P6402 property_type external-identifier +P6403 property_type external-identifier +P6404 property_type external-identifier +P6405 property_type external-identifier +P6406 property_type external-identifier +P6407 property_type external-identifier +P6408 property_type external-identifier +P6409 property_type external-identifier +P6410 property_type external-identifier +P6411 property_type external-identifier +P6412 property_type external-identifier +P6413 property_type external-identifier +P6414 property_type external-identifier +P6415 property_type external-identifier +P6416 property_type external-identifier +P6417 property_type external-identifier +P6418 property_type external-identifier +P6419 property_type external-identifier +P6420 property_type external-identifier +P6421 property_type external-identifier +P6422 property_type external-identifier +P6423 property_type external-identifier +P6424 property_type string +P6425 property_type external-identifier +P6426 property_type item +P6427 property_type monolingualtext +P6428 property_type external-identifier +P6429 property_type external-identifier +P6430 property_type external-identifier +P6431 property_type external-identifier +P6432 property_type string +P6433 property_type external-identifier +P6434 property_type external-identifier +P6436 property_type external-identifier +P6850 property_type external-identifier +P6851 property_type external-identifier +P6852 property_type external-identifier +P6853 property_type external-identifier +P6854 property_type external-identifier +P6855 property_type item +P6856 property_type quantity +P6857 property_type external-identifier +P6858 property_type external-identifier +P6859 property_type external-identifier +P6861 property_type external-identifier +P6862 property_type external-identifier +P6863 property_type external-identifier +P6864 property_type external-identifier +P6865 property_type 
external-identifier +P6866 property_type external-identifier +P6867 property_type external-identifier +P6868 property_type external-identifier +P6869 property_type external-identifier +P6870 property_type external-identifier +P6871 property_type external-identifier +P6872 property_type item +P6873 property_type external-identifier +P6874 property_type external-identifier +P6875 property_type item +P6876 property_type quantity +P6877 property_type external-identifier +P6878 property_type external-identifier +P6879 property_type quantity +P6880 property_type external-identifier +P6881 property_type external-identifier +P6882 property_type external-identifier +P6883 property_type string +P6884 property_type item +P6885 property_type item +P6886 property_type item +P6887 property_type item +P6888 property_type external-identifier +P6889 property_type item +P6890 property_type external-identifier +P6891 property_type external-identifier +P6892 property_type external-identifier +P6893 property_type external-identifier +P6894 property_type external-identifier +P6895 property_type external-identifier +P6896 property_type external-identifier +P6897 property_type quantity +P6898 property_type external-identifier +P6899 property_type external-identifier +P6900 property_type external-identifier +P6901 property_type external-identifier +P6902 property_type item +P6903 property_type external-identifier +P6904 property_type external-identifier +P6905 property_type external-identifier +P6906 property_type external-identifier +P6907 property_type external-identifier +P6908 property_type external-identifier +P6909 property_type external-identifier +P6910 property_type external-identifier +P6911 property_type external-identifier +P6912 property_type external-identifier +P6913 property_type external-identifier +P6914 property_type external-identifier +P6915 property_type external-identifier +P6916 property_type external-identifier +P6917 property_type external-identifier +P6918 property_type external-identifier +P6919 property_type external-identifier +P6920 property_type external-identifier +P6921 property_type external-identifier +P6922 property_type external-identifier +P6923 property_type external-identifier +P6924 property_type external-identifier +P6925 property_type external-identifier +P6926 property_type external-identifier +P6927 property_type external-identifier +P6928 property_type external-identifier +P6929 property_type external-identifier +P6930 property_type external-identifier +P6931 property_type external-identifier +P6932 property_type external-identifier +P6933 property_type external-identifier +P6934 property_type external-identifier +P6935 property_type external-identifier +P6936 property_type external-identifier +P6937 property_type external-identifier +P6938 property_type item +P6939 property_type item +P6940 property_type external-identifier +P6941 property_type external-identifier +P6942 property_type item +P6943 property_type external-identifier +P6944 property_type external-identifier +P6945 property_type external-identifier +P6946 property_type external-identifier +P6947 property_type external-identifier +P6948 property_type item +P6949 property_type time +P6950 property_type external-identifier +P6953 property_type external-identifier +P6954 property_type item +P6955 property_type external-identifier +P6956 property_type external-identifier +P6957 property_type external-identifier +P6958 property_type external-identifier +P6959 property_type external-identifier +P6960 
property_type external-identifier +P6962 property_type item +P6963 property_type external-identifier +P6964 property_type external-identifier +P6965 property_type external-identifier +P6966 property_type external-identifier +P6967 property_type external-identifier +P6968 property_type external-identifier +P6969 property_type external-identifier +P6970 property_type external-identifier +P6971 property_type external-identifier +P6972 property_type external-identifier +P6973 property_type external-identifier +P6975 property_type external-identifier +P6976 property_type external-identifier +P6977 property_type item +P6978 property_type item +P6979 property_type external-identifier +P6980 property_type external-identifier +P6981 property_type external-identifier +P6982 property_type external-identifier +P6983 property_type external-identifier +P6984 property_type external-identifier +P6985 property_type external-identifier +P6987 property_type external-identifier +P6988 property_type external-identifier +P6989 property_type external-identifier +P6992 property_type external-identifier +P6993 property_type external-identifier +P6994 property_type external-identifier +P6995 property_type external-identifier +P6996 property_type external-identifier +P6997 property_type external-identifier +P6998 property_type external-identifier +P6999 property_type external-identifier +P7000 property_type external-identifier +P7001 property_type external-identifier +P7002 property_type external-identifier +P7003 property_type external-identifier +P7004 property_type external-identifier +P7005 property_type external-identifier +P7006 property_type external-identifier +P7007 property_type external-identifier +P7008 property_type monolingualtext +P7009 property_type string +P7010 property_type item +P7011 property_type external-identifier +P7012 property_type external-identifier +P7013 property_type external-identifier +P7014 property_type url +P7015 property_type quantity +P7017 property_type external-identifier +P7018 property_type string +P7019 property_type external-identifier +P7020 property_type external-identifier +P7021 property_type external-identifier +P7022 property_type external-identifier +P7023 property_type external-identifier +P7024 property_type external-identifier +P7025 property_type external-identifier +P7026 property_type external-identifier +P7027 property_type external-identifier +P7028 property_type external-identifier +P7029 property_type external-identifier +P7030 property_type external-identifier +P7031 property_type external-identifier +P7032 property_type external-identifier +P7033 property_type external-identifier +P7034 property_type external-identifier +P7035 property_type external-identifier +P7036 property_type external-identifier +P7037 property_type external-identifier +P7038 property_type external-identifier +P7039 property_type external-identifier +P7040 property_type external-identifier +P7041 property_type external-identifier +P7042 property_type external-identifier +P7043 property_type external-identifier +P7044 property_type external-identifier +P7045 property_type item +P7046 property_type external-identifier +P7047 property_type item +P7048 property_type external-identifier +P7049 property_type external-identifier +P7050 property_type external-identifier +P7051 property_type external-identifier +P7052 property_type external-identifier +P7053 property_type external-identifier +P7054 property_type external-identifier +P7055 property_type external-identifier +P7056 
property_type external-identifier +P7057 property_type external-identifier +P7058 property_type external-identifier +P7059 property_type external-identifier +P7060 property_type external-identifier +P7063 property_type external-identifier +P7064 property_type external-identifier +P7065 property_type external-identifier +P7066 property_type external-identifier +P7067 property_type external-identifier +P7068 property_type external-identifier +P7069 property_type string +P7070 property_type external-identifier +P7071 property_type external-identifier +P7072 property_type external-identifier +P7073 property_type external-identifier +P7074 property_type external-identifier +P7075 property_type item +P7076 property_type external-identifier +P7077 property_type external-identifier +P7078 property_type item +P7079 property_type quantity +P7080 property_type quantity +P7081 property_type monolingualtext +P7083 property_type quantity +P7084 property_type item +P7085 property_type external-identifier +P7086 property_type item +P7087 property_type item +P7089 property_type external-identifier +P7090 property_type external-identifier +P7091 property_type external-identifier +P7092 property_type external-identifier +P7093 property_type external-identifier +P7094 property_type external-identifier +P7095 property_type item +P7100 property_type external-identifier +P7101 property_type url +P7102 property_type external-identifier +P7103 property_type time +P7104 property_type time +P7105 property_type external-identifier +P7106 property_type external-identifier +P7107 property_type external-identifier +P7108 property_type item +P7109 property_type external-identifier +P7110 property_type external-identifier +P7111 property_type external-identifier +P7112 property_type external-identifier +P7113 property_type external-identifier +P7114 property_type external-identifier +P7115 property_type external-identifier +P7116 property_type external-identifier +P7117 property_type external-identifier +P7118 property_type external-identifier +P7119 property_type external-identifier +P7120 property_type external-identifier +P7121 property_type external-identifier +P7122 property_type item +P7124 property_type time +P7125 property_type time +P7126 property_type string +P7127 property_type external-identifier +P7128 property_type external-identifier +P7129 property_type external-identifier +P7130 property_type external-identifier +P7131 property_type external-identifier +P7132 property_type external-identifier +P7133 property_type external-identifier +P7134 property_type external-identifier +P7135 property_type external-identifier +P7136 property_type external-identifier +P7137 property_type item +P7138 property_type external-identifier +P7139 property_type external-identifier +P7140 property_type external-identifier +P7141 property_type string +P7142 property_type external-identifier +P7143 property_type external-identifier +P7144 property_type external-identifier +P7145 property_type external-identifier +P7146 property_type external-identifier +P7148 property_type external-identifier +P7149 property_type external-identifier +P7150 property_type monolingualtext +P7151 property_type external-identifier +P7152 property_type item +P7153 property_type item +P7154 property_type external-identifier +P7155 property_type external-identifier +P7156 property_type external-identifier +P7157 property_type external-identifier +P7159 property_type external-identifier +P7160 property_type item +P7161 property_type external-identifier 
+P7162 property_type item +P7163 property_type item +P7164 property_type external-identifier +P7165 property_type item +P7166 property_type external-identifier +P7167 property_type item +P7168 property_type external-identifier +P7169 property_type item +P7170 property_type external-identifier +P7171 property_type external-identifier +P7172 property_type external-identifier +P7173 property_type external-identifier +P7174 property_type item +P7175 property_type external-identifier +P7176 property_type external-identifier +P7177 property_type external-identifier +P7178 property_type external-identifier +P7179 property_type external-identifier +P7180 property_type external-identifier +P7181 property_type external-identifier +P7182 property_type external-identifier +P7183 property_type external-identifier +P7184 property_type external-identifier +P7185 property_type external-identifier +P7186 property_type external-identifier +P7187 property_type external-identifier +P7188 property_type external-identifier +P7189 property_type external-identifier +P7190 property_type external-identifier +P7191 property_type external-identifier +P7192 property_type external-identifier +P7193 property_type external-identifier +P7194 property_type external-identifier +P7195 property_type external-identifier +P7196 property_type external-identifier +P7197 property_type external-identifier +P7198 property_type external-identifier +P7199 property_type external-identifier +P7200 property_type external-identifier +P7201 property_type external-identifier +P7202 property_type external-identifier +P7203 property_type external-identifier +P7204 property_type external-identifier +P7205 property_type external-identifier +P7206 property_type external-identifier +P7207 property_type external-identifier +P7208 property_type external-identifier +P7209 property_type item +P7210 property_type external-identifier +P7211 property_type external-identifier +P7212 property_type external-identifier +P7213 property_type url +P7214 property_type external-identifier +P7215 property_type external-identifier +P7216 property_type external-identifier +P7217 property_type external-identifier +P7218 property_type external-identifier +P7219 property_type string +P7220 property_type string +P7221 property_type string +P7222 property_type external-identifier +P7223 property_type external-identifier +P7224 property_type external-identifier +P7225 property_type external-identifier +P7226 property_type external-identifier +P7227 property_type external-identifier +P7228 property_type item +P7229 property_type external-identifier +P7230 property_type external-identifier +P7231 property_type external-identifier +P7232 property_type external-identifier +P7233 property_type external-identifier +P7234 property_type external-identifier +P7235 property_type string +P7236 property_type external-identifier +P7237 property_type external-identifier +P7238 property_type external-identifier +P7241 property_type external-identifier +P7242 property_type external-identifier +P7243 property_type monolingualtext +P7250 property_type string +P7251 property_type external-identifier +P7252 property_type item +P7253 property_type item +P7254 property_type external-identifier +P7255 property_type external-identifier +P7256 property_type quantity +P7257 property_type external-identifier +P7258 property_type external-identifier +P7259 property_type external-identifier +P7260 property_type external-identifier +P7261 property_type item +P7262 property_type external-identifier 
+P7263 property_type external-identifier +P7264 property_type external-identifier +P7265 property_type external-identifier +P7266 property_type external-identifier +P7267 property_type external-identifier +P7268 property_type external-identifier +P7269 property_type external-identifier +P7270 property_type external-identifier +P7271 property_type external-identifier +P7272 property_type external-identifier +P7273 property_type external-identifier +P7274 property_type external-identifier +P7275 property_type external-identifier +P7276 property_type external-identifier +P7277 property_type external-identifier +P7278 property_type external-identifier +P7279 property_type external-identifier +P7280 property_type external-identifier +P7281 property_type external-identifier +P7282 property_type external-identifier +P7283 property_type external-identifier +P7284 property_type external-identifier +P7285 property_type external-identifier +P7286 property_type external-identifier +P7287 property_type external-identifier +P7288 property_type external-identifier +P7289 property_type external-identifier +P7290 property_type string +P7291 property_type external-identifier +P7292 property_type external-identifier +P7293 property_type external-identifier +P7294 property_type url +P7295 property_type time +P7296 property_type external-identifier +P7297 property_type quantity +P7298 property_type external-identifier +P7299 property_type external-identifier +P7300 property_type external-identifier +P7301 property_type external-identifier +P7302 property_type external-identifier +P7303 property_type external-identifier +P7304 property_type external-identifier +P7305 property_type external-identifier +P7306 property_type external-identifier +P7307 property_type external-identifier +P7308 property_type external-identifier +P7309 property_type item +P7310 property_type external-identifier +P7311 property_type external-identifier +P7312 property_type external-identifier +P7313 property_type external-identifier +P7314 property_type external-identifier +P7315 property_type string +P7316 property_type quantity +P7317 property_type external-identifier +P7318 property_type external-identifier +P7319 property_type external-identifier +P7320 property_type external-identifier +P7321 property_type external-identifier +P7322 property_type external-identifier +P7323 property_type external-identifier +P7324 property_type external-identifier +P7325 property_type external-identifier +P7326 property_type external-identifier +P7327 property_type item +P7328 property_type quantity +P7329 property_type external-identifier +P7330 property_type string +P7331 property_type external-identifier +P7332 property_type external-identifier +P7333 property_type external-identifier +P7334 property_type external-identifier +P7335 property_type external-identifier +P7336 property_type external-identifier +P7337 property_type external-identifier +P7338 property_type string +P7339 property_type external-identifier +P7340 property_type external-identifier +P7341 property_type external-identifier +P7342 property_type external-identifier +P7343 property_type external-identifier +P7344 property_type external-identifier +P7345 property_type external-identifier +P7346 property_type external-identifier +P7347 property_type url +P7348 property_type external-identifier +P7349 property_type external-identifier +P7350 property_type external-identifier +P7351 property_type external-identifier +P7352 property_type external-identifier +P7353 property_type 
external-identifier +P7354 property_type external-identifier +P7355 property_type external-identifier +P7356 property_type external-identifier +P7357 property_type external-identifier +P7358 property_type external-identifier +P7359 property_type external-identifier +P7360 property_type external-identifier +P7361 property_type external-identifier +P7362 property_type external-identifier +P7363 property_type external-identifier +P7364 property_type external-identifier +P7365 property_type external-identifier +P7366 property_type external-identifier +P7367 property_type item +P7368 property_type external-identifier +P7369 property_type external-identifier +P7370 property_type external-identifier +P7371 property_type external-identifier +P7372 property_type external-identifier +P7374 property_type item +P7375 property_type url +P7376 property_type item +P7377 property_type item +P7378 property_type item +P7379 property_type quantity +P7380 property_type string +P7381 property_type external-identifier +P7382 property_type external-identifier +P7383 property_type string +P7384 property_type external-identifier +P7387 property_type external-identifier +P7388 property_type external-identifier +P7389 property_type external-identifier +P7390 property_type external-identifier +P7391 property_type quantity +P7395 property_type external-identifier +P7396 property_type external-identifier +P7397 property_type external-identifier +P7398 property_type external-identifier +P7399 property_type external-identifier +P7400 property_type external-identifier +P7401 property_type external-identifier +P7402 property_type external-identifier +P7403 property_type external-identifier +P7404 property_type external-identifier +P7405 property_type external-identifier +P7406 property_type item +P7407 property_type string +P7408 property_type external-identifier +P7409 property_type external-identifier +P7410 property_type external-identifier +P7411 property_type external-identifier +P7412 property_type external-identifier +P7413 property_type external-identifier +P7414 property_type external-identifier +P7415 property_type string +P7416 property_type string +P7417 property_type string +P7418 property_type string +P7419 property_type item +P7420 property_type string +P7421 property_type string +P7422 property_type quantity +P7423 property_type external-identifier +P7425 property_type external-identifier +P7427 property_type external-identifier +P7428 property_type external-identifier +P7429 property_type external-identifier +P7430 property_type external-identifier +P7431 property_type external-identifier +P7432 property_type external-identifier +P7433 property_type external-identifier +P7434 property_type external-identifier +P7435 property_type external-identifier +P7436 property_type external-identifier +P7437 property_type external-identifier +P7438 property_type external-identifier +P7439 property_type external-identifier +P7440 property_type external-identifier +P7441 property_type external-identifier +P7442 property_type item +P7443 property_type quantity +P7444 property_type external-identifier +P7445 property_type external-identifier +P7446 property_type external-identifier +P7447 property_type external-identifier +P7448 property_type external-identifier +P7449 property_type external-identifier +P7450 property_type external-identifier +P7451 property_type external-identifier +P7452 property_type item +P7453 property_type external-identifier +P7454 property_type external-identifier +P7455 property_type 
+P7485 property_type external-identifier
\ No newline at end of file
diff --git a/kgtk/tests/test_triple_generation.py b/kgtk/tests/test_triple_generation.py
new file mode 100644
index 000000000..2ab9c6fcb
--- /dev/null
+++ b/kgtk/tests/test_triple_generation.py
@@ -0,0 +1,97 @@
+import unittest
+from kgtk.triple_generator import TripleGenerator
+from pathlib import Path
+
+
+class TestTripleGeneration(unittest.TestCase):
+
+    def test_truthy_property_triple_generation(self):
+        property_tsv_file = 'data/P10.tsv'
+        wikidata_property_file = 'data/wikidata_properties.tsv'
+        o = open('data/P10_truthy_tmp.ttl', 'w')
+        generator = TripleGenerator(wikidata_property_file, label_set='label', alias_set='aliases',
+                                    description_set='descriptions', ignore=True, n=100, truthy=True, use_id=True,
+                                    dest_fp=o)
+        for line_num, edge in enumerate(open(property_tsv_file)):
+            if edge.startswith("#"):
+                continue
+            else:
+                generator.entry_point(line_num + 1, edge)
+        generator.finalize()
+        o.close()
+        f1 = open('data/P10_truthy.ttl')
+        f2 = open('data/P10_truthy_tmp.ttl')
+        self.assertEqual(f1.readlines(), f2.readlines())
+        f1.close()
+        f2.close()
+        p = Path('data/P10_truthy_tmp.ttl')
+        p.unlink()
+
+    def test_property_triple_generation(self):
+        property_tsv_file = 'data/P10.tsv'
+        wikidata_property_file = 'data/wikidata_properties.tsv'
+        o = open('data/P10_not_truthy_tmp.ttl', 'w')
+        generator = TripleGenerator(wikidata_property_file, label_set='label', alias_set='aliases',
+                                    description_set='descriptions', ignore=True, n=100, truthy=False, use_id=True,
+                                    dest_fp=o)
+        for line_num, edge in enumerate(open(property_tsv_file)):
+            if edge.startswith("#"):
+                continue
+            else:
+                generator.entry_point(line_num + 1, edge)
+        generator.finalize()
+        o.close()
+        f1 = open('data/P10_not_truthy.ttl')
+        f2 = open('data/P10_not_truthy_tmp.ttl')
+        self.assertEqual(f1.readlines(), f2.readlines())
+        f1.close()
+        f2.close()
+        p = Path('data/P10_not_truthy_tmp.ttl')
+        p.unlink()
+
+    def test_truthy_qnode_triple_generation(self):
+        qnode_tsv_file = 'data/Q57160439.tsv'
+        wikidata_property_file = 'data/wikidata_properties.tsv'
+        o = open('data/Q57160439_truthy_tmp.ttl', 'w')
+        generator = TripleGenerator(wikidata_property_file, label_set='label', alias_set='aliases',
+                                    description_set='descriptions', ignore=True, n=100, truthy=True, use_id=True,
+                                    dest_fp=o)
+        for line_num, edge in enumerate(open(qnode_tsv_file)):
+            if edge.startswith("#"):
+                continue
+            else:
+                generator.entry_point(line_num + 1, edge)
+        generator.finalize()
+
+        o.close()
+
+        f1 = open('data/Q57160439_truthy.ttl')
+        f2 = open('data/Q57160439_truthy_tmp.ttl')
+        self.assertEqual(f1.readlines(), f2.readlines())
+        f1.close()
+        f2.close()
+        p = Path('data/Q57160439_truthy_tmp.ttl')
+        p.unlink()
+
+    def test_not_truthy_qnode_triple_generation(self):
+        qnode_tsv_file = 'data/Q57160439.tsv'
+        wikidata_property_file = 'data/wikidata_properties.tsv'
+        o = open('data/Q57160439_not_truthy_tmp.ttl', 'w')
+        generator = TripleGenerator(wikidata_property_file, label_set='label', alias_set='aliases',
+                                    description_set='descriptions', ignore=True, n=100, truthy=False, use_id=True,
+                                    dest_fp=o)
+        for line_num, edge in enumerate(open(qnode_tsv_file)):
+            if edge.startswith("#"):
+                continue
+            else:
+                generator.entry_point(line_num + 1, edge)
+        generator.finalize()
+
+        o.close()
+        f1 = open('data/Q57160439_not_truthy.ttl')
+        f2 = open('data/Q57160439_not_truthy_tmp.ttl')
+        self.assertEqual(f1.readlines(), f2.readlines())
+        f1.close()
+        f2.close()
+        p = Path('data/Q57160439_not_truthy_tmp.ttl')
+        p.unlink()
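The four tests above are one golden-file pattern repeated: generate triples for a fixture into a temporary TTL, compare it line-for-line against a checked-in expected TTL, then delete the temporary file. A hypothetical helper method on TestTripleGeneration (illustrative only, not part of this patch; the name assert_ttl_matches is invented) would make that shape explicit:

    # hypothetical: the shared golden-file pattern above, factored out (not in the patch)
    def assert_ttl_matches(self, input_tsv, golden_ttl, tmp_ttl, truthy):
        with open(tmp_ttl, 'w') as o:
            generator = TripleGenerator('data/wikidata_properties.tsv', label_set='label',
                                        alias_set='aliases', description_set='descriptions',
                                        ignore=True, n=100, truthy=truthy, use_id=True, dest_fp=o)
            for line_num, edge in enumerate(open(input_tsv)):
                if not edge.startswith("#"):       # skip comment lines, as the tests do
                    generator.entry_point(line_num + 1, edge)
            generator.finalize()                   # flush while dest_fp is still open
        with open(golden_ttl) as f1, open(tmp_ttl) as f2:
            self.assertEqual(f1.readlines(), f2.readlines())
        Path(tmp_ttl).unlink()                     # remove the temporary output

Each test would then reduce to a single call, e.g. self.assert_ttl_matches('data/P10.tsv', 'data/P10_truthy.ttl', 'data/P10_truthy_tmp.ttl', truthy=True).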

From 32be6affdf243ebd947758992e32b9bba7856f88 Mon Sep 17 00:00:00 2001
From: saggu
Date: Wed, 13 May 2020 13:37:53 -0700
Subject: [PATCH 186/278] unit test for small values

---
 kgtk/tests/data/small_values.tsv     |  6 ++
 kgtk/tests/data/small_values.ttl     | 90 ++++++++++++++++++++++++++++
 kgtk/tests/test_triple_generation.py | 24 ++++++++
 3 files changed, 120 insertions(+)
 create mode 100644 kgtk/tests/data/small_values.tsv
 create mode 100644 kgtk/tests/data/small_values.ttl

diff --git a/kgtk/tests/data/small_values.tsv b/kgtk/tests/data/small_values.tsv
new file mode 100644
index 000000000..e012fa736
--- /dev/null
+++ b/kgtk/tests/data/small_values.tsv
@@ -0,0 +1,6 @@
+node1	property	node2	id
+Q00005550-chemical-MESHC000006	P6897	0	Q00005550-chemical-MESHC000006-P2020013-18300
+Q00005550-chemical-MESHC000006	P7015	7	Q00005550-chemical-MESHC000006-P2020014-18301
+Q00005550-chemical-MESHC000006	P7079	1.9860001065575846e-07	Q00005550-chemical-MESHC000006-P2020015-18302
+Q00005550-chemical-MESHC000006	P7080	0.0004846436908691038	Q00005550-chemical-MESHC000006-P2020016-18304
+Q00005550-chemical-MESHC000006	P7083	0.0	Q00005550-chemical-MESHC000006-P2020017-18303
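The node2 column deliberately mixes an integer, zero, and sub-micro floats, and any reader of this fixture should recover the exact doubles. A stand-alone sketch (plain csv module, not KGTK's own reader; the path matches the fixture above and tab delimiters are assumed):

    # sketch: read the fixture back and parse node2 as floats
    import csv

    with open('kgtk/tests/data/small_values.tsv') as f:
        for row in csv.DictReader(f, delimiter='\t'):
            print(row['property'], float(row['node2']))  # e.g. P7079 1.986e-07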
diff --git a/kgtk/tests/data/small_values.ttl b/kgtk/tests/data/small_values.ttl
new file mode 100644
index 000000000..890ecc620
--- /dev/null
+++ b/kgtk/tests/data/small_values.ttl
@@ -0,0 +1,90 @@
+@prefix wikibase: <http://wikiba.se/ontology#> .
+@prefix wd: <http://www.wikidata.org/entity/> .
+@prefix wdt: <http://www.wikidata.org/prop/direct/> .
+@prefix wdtn: <http://www.wikidata.org/prop/direct-normalized/> .
+@prefix wdno: <http://www.wikidata.org/prop/novalue/> .
+@prefix wds: <http://www.wikidata.org/entity/statement/> .
+@prefix wdv: <http://www.wikidata.org/value/> .
+@prefix wdref: <http://www.wikidata.org/reference/> .
+@prefix p: <http://www.wikidata.org/prop/> .
+@prefix pr: <http://www.wikidata.org/prop/reference/> .
+@prefix prv: <http://www.wikidata.org/prop/reference/value/> .
+@prefix prn: <http://www.wikidata.org/prop/reference/value-normalized/> .
+@prefix ps: <http://www.wikidata.org/prop/statement/> .
+@prefix psv: <http://www.wikidata.org/prop/statement/value/> .
+@prefix psn: <http://www.wikidata.org/prop/statement/value-normalized/> .
+@prefix pq: <http://www.wikidata.org/prop/qualifier/> .
+@prefix pqv: <http://www.wikidata.org/prop/qualifier/value/> .
+@prefix pqn: <http://www.wikidata.org/prop/qualifier/value-normalized/> .
+@prefix prov: <http://www.w3.org/ns/prov#> .
+@prefix skos: <http://www.w3.org/2004/02/skos/core#> .
+@prefix schema: <http://schema.org/> .
+
+wd:Q00005550-chemical-MESHC000006 a wikibase:Item ;
+    p:P6897 wds:Q00005550-chemical-MESHC000006-Q00005550-chemical-MESHC000006-P2020013-18300 ;
+    p:P7015 wds:Q00005550-chemical-MESHC000006-Q00005550-chemical-MESHC000006-P2020014-18301 ;
+    p:P7079 wds:Q00005550-chemical-MESHC000006-Q00005550-chemical-MESHC000006-P2020015-18302 ;
+    p:P7080 wds:Q00005550-chemical-MESHC000006-Q00005550-chemical-MESHC000006-P2020016-18304 ;
+    p:P7083 wds:Q00005550-chemical-MESHC000006-Q00005550-chemical-MESHC000006-P2020017-18303 ;
+    wdtn:P6897 wdv:Quantityc0c0c0c0 ;
+    wdtn:P7015 wdv:Quantityc7c0c0c0 ;
+    wdtn:P7079 wdv:Quantityc0-00000019860001065575846c0c0c0 ;
+    wdtn:P7080 wdv:Quantityc0-0004846436908691038c0c0c0 ;
+    wdtn:P7083 wdv:Quantityc0c0c0c0 ;
+    wdt:P6897 0.0 ;
+    wdt:P7015 7.0 ;
+    wdt:P7079 1.9860001065575846E-7 ;
+    wdt:P7080 0.0004846436908691038 ;
+    wdt:P7083 0.0 .
+
+wds:Q00005550-chemical-MESHC000006-Q00005550-chemical-MESHC000006-P2020013-18300 a wikibase:Statement ;
+    wikibase:rank wikibase:BestRank ;
+    ;
+    ps:P6897 0.0 ;
+    psn:P6897 wdv:Quantityc0c0c0c0 ;
+    psv:P6897 wdv:Quantityc0c0c0c0 .
+
+wds:Q00005550-chemical-MESHC000006-Q00005550-chemical-MESHC000006-P2020014-18301 a wikibase:Statement ;
+    wikibase:rank wikibase:BestRank ;
+    ;
+    ps:P7015 7.0 ;
+    psn:P7015 wdv:Quantityc7c0c0c0 ;
+    psv:P7015 wdv:Quantityc7c0c0c0 .
+
+wds:Q00005550-chemical-MESHC000006-Q00005550-chemical-MESHC000006-P2020015-18302 a wikibase:Statement ;
+    wikibase:rank wikibase:BestRank ;
+    ;
+    ps:P7079 1.9860001065575846E-7 ;
+    psn:P7079 wdv:Quantityc0-00000019860001065575846c0c0c0 ;
+    psv:P7079 wdv:Quantityc0-00000019860001065575846c0c0c0 .
+
+wds:Q00005550-chemical-MESHC000006-Q00005550-chemical-MESHC000006-P2020016-18304 a wikibase:Statement ;
+    wikibase:rank wikibase:BestRank ;
+    ;
+    ps:P7080 0.0004846436908691038 ;
+    psn:P7080 wdv:Quantityc0-0004846436908691038c0c0c0 ;
+    psv:P7080 wdv:Quantityc0-0004846436908691038c0c0c0 .
+
+wds:Q00005550-chemical-MESHC000006-Q00005550-chemical-MESHC000006-P2020017-18303 a wikibase:Statement ;
+    wikibase:rank wikibase:BestRank ;
+    ;
+    ps:P7083 0.0 ;
+    psn:P7083 wdv:Quantityc0c0c0c0 ;
+    psv:P7083 wdv:Quantityc0c0c0c0 .
+
+wdv:Quantityc0-00000019860001065575846c0c0c0 a wikibase:QuantityValue ;
+    wikibase:quantityAmount 1.9860001065575846E-7 ;
+    wikibase:quantityNormalized wdv:Quantityc0-00000019860001065575846c0c0c0 .
+
+wdv:Quantityc0-0004846436908691038c0c0c0 a wikibase:QuantityValue ;
+    wikibase:quantityAmount 0.0004846436908691038 ;
+    wikibase:quantityNormalized wdv:Quantityc0-0004846436908691038c0c0c0 .
+
+wdv:Quantityc7c0c0c0 a wikibase:QuantityValue ;
+    wikibase:quantityAmount 7.0 ;
+    wikibase:quantityNormalized wdv:Quantityc7c0c0c0 .
+
+wdv:Quantityc0c0c0c0 a wikibase:QuantityValue ;
+    wikibase:quantityAmount 0.0 ;
+    wikibase:quantityNormalized wdv:Quantityc0c0c0c0 .
+
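Note the lexical shift the golden file pins down: the fixture writes 1.9860001065575846e-07 while the Turtle above writes 1.9860001065575846E-7. Both denote the same IEEE-754 double, and the test added below compares output byte-for-byte, so the generator must reproduce exactly the second form. A quick sanity check (plain Python, illustrative only):

    # sketch: TSV and Turtle lexical forms parse to the same double
    assert float('1.9860001065575846e-07') == float('1.9860001065575846E-7')
    assert float('0.0004846436908691038') == 4.846436908691038e-04
    print('small-value literals denote identical doubles')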
+ diff --git a/kgtk/tests/test_triple_generation.py b/kgtk/tests/test_triple_generation.py index 2ab9c6fcb..91cc7e432 100644 --- a/kgtk/tests/test_triple_generation.py +++ b/kgtk/tests/test_triple_generation.py @@ -95,3 +95,27 @@ def test_not_truthy_qnode_triple_generation(self): f2.close() p = Path('data/Q57160439_not_truthy_tmp.ttl') p.unlink() + + def test_triple_small_values(self): + small_values_file = 'data/small_values.tsv' + wikidata_property_file = 'data/wikidata_properties.tsv' + o = open('data/small_values_tmp.ttl', 'w') + generator = TripleGenerator(wikidata_property_file, label_set='label', alias_set='aliases', + description_set='descriptions', ignore=True, n=100, truthy=True, use_id=True, + dest_fp=o) + for line_num, edge in enumerate(open(small_values_file)): + if edge.startswith("#"): + continue + else: + generator.entry_point(line_num + 1, edge) + generator.finalize() + + o.close() + + f1 = open('data/small_values.ttl') + f2 = open('data/small_values_tmp.ttl') + self.assertEqual(f1.readlines(), f2.readlines()) + f1.close() + f2.close() + p = Path('data/small_values_tmp.ttl') + p.unlink() \ No newline at end of file From 056538b6a30e30cef6ea15124b7d13af8ed08437 Mon Sep 17 00:00:00 2001 From: saggu Date: Wed, 13 May 2020 13:51:47 -0700 Subject: [PATCH 187/278] add travis ci --- .travis.yml | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 .travis.yml diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 000000000..f1b9f1c01 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,16 @@ +sudo: false +dist: trusty +language: python +python: +- 3.7 +install: +- sudo apt-get update +- pip install --upgrade pip +- pip install -r requirements.txt +script: +- python -m spacy download en_core_web_sm +- cd kgtk/tests +- python -m unittest discover +notifications: + slack: + secure: FfFhdBv7FgVTZrA/UUm3EcsH/dOvyOusIJ0o+Y+Ysf9DVeasBjJ2E7xaCMcs0KM8ypQuXTVMJyT88uGiJ3fNU/Sy5C/TEireGOWCqy84et/iFjMfIIHnPb3Nz6yLIrDsrrEufcLm1RDeMtQkvn55FfuLOoelfKe10/eAfR/luscoCr1LLqFxGZizpkYION9FCTlZ1CX+OK13ALuG9hqeCKy+k/PkmuwboQDW1N0Q7JcJTs90Pr02TZp83efePRmayXSjjhiy3npVsBYP/oQPyec1mgCSizn+lkTJJ80yzHe++e7zzpg5XbyLjSoA/ddz8AdRq5wD+BooVegJB0cxnMioEzHvpocIyUC28vGEBTbHCU+songs7z9WJyySTy3G1GaBSbcp6dOVDgTmizouBQbkL4/k+PJUDndsMN7hykDYzvlaVt2HZykiA+sf6EiW2RWPhWThmzo3ACJf30OTK78pUfuh1UcuxHcUz/Ve7V/2pP8wGnD2Imbj78GxKa+hzGQ+7lggExVUpPMCMPAJeFcSjbiLeUVO/muyqoRC6Mr1Y1ZlqL1EcKC9LC6jkXs0XV6jB3DRlr8YHiq6X1GPe0rSzV0/XUain9WY5jO15P8HBU2Pv4Y4hsU8LjVqBbX5r22Xquv6KaLQZVIJUOe2FGzfzYiiIXivYNvtATkxSfY= From 5e9df91e389b245e2e7ff376280274f0c1c42b81 Mon Sep 17 00:00:00 2001 From: saggu Date: Wed, 13 May 2020 13:54:03 -0700 Subject: [PATCH 188/278] travis build initiate --- kgtk/tests/test_triple_generation.py | 1 + 1 file changed, 1 insertion(+) diff --git a/kgtk/tests/test_triple_generation.py b/kgtk/tests/test_triple_generation.py index 91cc7e432..a55a46a3a 100644 --- a/kgtk/tests/test_triple_generation.py +++ b/kgtk/tests/test_triple_generation.py @@ -8,6 +8,7 @@ class TestTripleGeneration(unittest.TestCase): def test_truthy_property_triple_generation(self): property_tsv_file = 'data/P10.tsv' wikidata_property_file = 'data/wikidata_properties.tsv' + o = open('data/P10_truthy_tmp.ttl', 'w') generator = TripleGenerator(wikidata_property_file, label_set='label', alias_set='aliases', description_set='descriptions', ignore=True, n=100, truthy=True, use_id=True, From b95567b0ee8bf4edda3ab84ae8af69328149ead1 Mon Sep 17 00:00:00 2001 From: saggu Date: Wed, 13 May 2020 13:57:42 -0700 Subject: [PATCH 189/278] update 
python version for travis --- .travis.yml | 2 +- kgtk/tests/test_triple_generation.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index f1b9f1c01..debb77c4b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,7 +2,7 @@ sudo: false dist: trusty language: python python: -- 3.7 +- 3.6.8 install: - sudo apt-get update - pip install --upgrade pip diff --git a/kgtk/tests/test_triple_generation.py b/kgtk/tests/test_triple_generation.py index a55a46a3a..91cc7e432 100644 --- a/kgtk/tests/test_triple_generation.py +++ b/kgtk/tests/test_triple_generation.py @@ -8,7 +8,6 @@ class TestTripleGeneration(unittest.TestCase): def test_truthy_property_triple_generation(self): property_tsv_file = 'data/P10.tsv' wikidata_property_file = 'data/wikidata_properties.tsv' - o = open('data/P10_truthy_tmp.ttl', 'w') generator = TripleGenerator(wikidata_property_file, label_set='label', alias_set='aliases', description_set='descriptions', ignore=True, n=100, truthy=True, use_id=True, From bb25904f1342fe07655820250a754b60b120a6f4 Mon Sep 17 00:00:00 2001 From: saggu Date: Wed, 13 May 2020 13:59:32 -0700 Subject: [PATCH 190/278] default python version --- .travis.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index debb77c4b..32ded647b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,8 +1,7 @@ sudo: false dist: trusty language: python -python: -- 3.6.8 + install: - sudo apt-get update - pip install --upgrade pip From 2a8f901b754996f81f0547336a2c077377b0a0b9 Mon Sep 17 00:00:00 2001 From: saggu Date: Wed, 13 May 2020 14:05:46 -0700 Subject: [PATCH 191/278] install kgtk as well --- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index 32ded647b..5cdd0a350 100644 --- a/.travis.yml +++ b/.travis.yml @@ -6,6 +6,7 @@ install: - sudo apt-get update - pip install --upgrade pip - pip install -r requirements.txt +- pip install . script: - python -m spacy download en_core_web_sm - cd kgtk/tests From be7cf926b56caf8db76ca64fe3149296cb65b9a5 Mon Sep 17 00:00:00 2001 From: saggu Date: Wed, 13 May 2020 14:13:11 -0700 Subject: [PATCH 192/278] force reinstall numpy, version conflict --- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index 5cdd0a350..b8bf289b0 100644 --- a/.travis.yml +++ b/.travis.yml @@ -6,6 +6,7 @@ install: - sudo apt-get update - pip install --upgrade pip - pip install -r requirements.txt +- pip install --force-reinstall numpy --no-cache - pip install . 
script:
- python -m spacy download en_core_web_sm
- cd kgtk/tests
- python -m unittest discover

From b752504faade3aa1bc64f63c0d04639e751261b7 Mon Sep 17 00:00:00 2001
From: saggu
Date: Wed, 13 May 2020 14:21:12 -0700
Subject: [PATCH 193/278] python=3.7, dist=xenial

---
 .travis.yml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/.travis.yml b/.travis.yml
index b8bf289b0..3aaa00505 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,6 +1,8 @@
sudo: false
-dist: trusty
+dist: xenial
language: python
+python:
+- 3.7

install:
- sudo apt-get update

From 6152e25bd149aac6c96c001a2ae3482ccf8f7dd7 Mon Sep 17 00:00:00 2001
From: saggu
Date: Wed, 13 May 2020 14:28:56 -0700
Subject: [PATCH 194/278] python==3.7.7

---
 .travis.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.travis.yml b/.travis.yml
index 3aaa00505..99ecf8d31 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -2,7 +2,7 @@ sudo: false
dist: xenial
language: python
python:
-- 3.7
+- 3.7.7

install:
- sudo apt-get update

From f3b51b3b9e22ce1d0b080b2a494d94eb17b5c79f Mon Sep 17 00:00:00 2001
From: Craig Milo Rogers
Date: Wed, 13 May 2020 15:45:34 -0700
Subject: [PATCH 195/278] Better fallback options. Better feedback on actual option values.

---
 kgtk/cli/cat.py | 8 +++
 kgtk/cli/clean_data.py | 8 +++
 kgtk/cli/ifexists.py | 25 +++++++---
 kgtk/cli/ifnotexists.py | 25 +++++++---
 kgtk/cli/join.py | 26 ++++++++--
 kgtk/cli/validate.py | 10 +++-
 kgtk/io/kgtkreader.py | 105 ++++++++++++++++++++++++++++--------
 kgtk/join/ifexists.py | 10 +++-
 kgtk/join/kgtkcat.py | 4 ++
 kgtk/join/kgtkjoiner.py | 12 ++++-
 10 files changed, 191 insertions(+), 42 deletions(-)

diff --git a/kgtk/cli/cat.py b/kgtk/cli/cat.py
index e911c98c6..618213a17 100644
--- a/kgtk/cli/cat.py
+++ b/kgtk/cli/cat.py
@@ -53,6 +53,7 @@ def run(input_file_paths: typing.List[Path],

errors_to_stdout: bool = False,
errors_to_stderr: bool = True,
+ show_options: bool = False,
verbose: bool = False,
very_verbose: bool = False,

@@ -71,6 +72,13 @@ def run(input_file_paths: typing.List[Path],
reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

+ # Show the final option structures for debugging and documentation.
+ if show_options:
+ print("input: %s" % " ".join((str(input_file_path) for input_file_path in input_file_paths)), file=error_file)
+ print("--output-file=%s" % str(output_file_path), file=error_file)
+ reader_options.show(out=error_file)
+ print("=======", file=error_file, flush=True)
+
try:
kc: KgtkCat = KgtkCat(input_file_paths=input_file_paths,
output_path=output_file_path,
diff --git a/kgtk/cli/clean_data.py b/kgtk/cli/clean_data.py
index bb497eea4..aa22c512b 100644
--- a/kgtk/cli/clean_data.py
+++ b/kgtk/cli/clean_data.py
@@ -42,6 +42,7 @@ def run(input_file: typing.Optional[Path],
output_file: typing.Optional[Path],
errors_to_stdout: bool = False,
errors_to_stderr: bool = False,
+ show_options: bool = False,
verbose: bool = False,
very_verbose: bool = False,
**kwargs # Whatever KgtkReaderOptions and KgtkValueOptions want.
@@ -56,6 +57,13 @@ def run(input_file: typing.Optional[Path],
reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

+ # Show the final option structures for debugging and documentation.
+ if show_options: + print("input: %s" % (str(input_file) if input_file is not None else "-"), file=error_file) + print("output: %s" % (str(output_file) if output_file is not None else "-"), file=error_file) + reader_options.show(out=error_file) + print("=======", file=error_file, flush=True) + if verbose: if input_file is not None: print("Cleaning data from '%s'" % str(input_file), file=error_file, flush=True) diff --git a/kgtk/cli/ifexists.py b/kgtk/cli/ifexists.py index 69718fbc7..288926305 100644 --- a/kgtk/cli/ifexists.py +++ b/kgtk/cli/ifexists.py @@ -50,18 +50,14 @@ def h(msg: str)->str: parser.add_argument("-o", "--output-file", dest="output_kgtk_file", help="The KGTK file to write", type=Path, default=None) - # This argument is retained for compatability with earlier versions of this command. - parser.add_argument( "--error-limit", dest="error_limit", - help=h("The maximum number of errors per input fule (default=%(default)s)"), - default=KgtkReaderOptions.ERROR_LIMIT_DEFAULT) - parser.add_argument( "--field-separator", dest="field_separator", help=h("Separator for multifield keys (default=%(default)s)") , default=IfExists.FIELD_SEPARATOR_DEFAULT) KgtkReader.add_debug_arguments(parser, expert=_expert) - KgtkReaderOptions.add_arguments(parser, mode_options=True, who="input", expert=_expert) - KgtkReaderOptions.add_arguments(parser, mode_options=True, who="filter", expert=_expert) + KgtkReaderOptions.add_arguments(parser, mode_options=True, expert=_expert) + KgtkReaderOptions.add_arguments(parser, mode_options=True, who="input", expert=_expert, defaults=False) + KgtkReaderOptions.add_arguments(parser, mode_options=True, who="filter", expert=_expert, defaults=False) KgtkValueOptions.add_arguments(parser, expert=_expert) def run(input_kgtk_file: typing.Optional[Path], @@ -74,6 +70,7 @@ def run(input_kgtk_file: typing.Optional[Path], errors_to_stdout: bool = False, errors_to_stderr: bool = True, + show_options: bool = False, verbose: bool = False, very_verbose: bool = False, @@ -91,6 +88,20 @@ def run(input_kgtk_file: typing.Optional[Path], filter_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs, who="filter", fallback=True) value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs) + # Show the final option structures for debugging and documentation. 
+ if show_options: + print("input: %s" % (str(input_kgtk_file) if input_kgtk_file is not None else "-"), file=error_file) + if input_keys is not None: + print("--input-keys=%s" % " ".join(input_keys), file=error_file) + print("--filter-on=%s" % (str(filter_kgtk_file) if filter_kgtk_file is not None else "-"), file=error_file) + if filter_keys is not None: + print("--filter-keys=%s" % " ".join(filter_keys), file=error_file) + print("--output-file=%s" % (str(output_kgtk_file) if output_kgtk_file is not None else "-"), file=error_file) + print("--field-separator='%s'" % str(field_separator), file=error_file) + input_reader_options.show(out=error_file, who="input") + filter_reader_options.show(out=error_file, who="filter") + print("=======", file=error_file, flush=True) + try: ie: IfExists = IfExists( input_file_path=input_kgtk_file, diff --git a/kgtk/cli/ifnotexists.py b/kgtk/cli/ifnotexists.py index e49a481f7..dd29b6ec8 100644 --- a/kgtk/cli/ifnotexists.py +++ b/kgtk/cli/ifnotexists.py @@ -49,18 +49,14 @@ def h(msg: str)->str: parser.add_argument( "--filter-keys", "--right-keys", dest="filter_keys", help="The key columns in the filter-on file.", nargs='*') - # This argument is retained for compatability with earlier versions of this command. - parser.add_argument( "--error-limit", dest="error_limit", - help=h("The maximum number of errors per input fule (default=%(default)s)"), - default=KgtkReaderOptions.ERROR_LIMIT_DEFAULT) - parser.add_argument( "--field-separator", dest="field_separator", help=h("Separator for multifield keys"), default=IfExists.FIELD_SEPARATOR_DEFAULT) KgtkReader.add_debug_arguments(parser, expert=_expert) - KgtkReaderOptions.add_arguments(parser, mode_options=True, who="input", expert=_expert) - KgtkReaderOptions.add_arguments(parser, mode_options=True, who="filter", expert=_expert) + KgtkReaderOptions.add_arguments(parser, mode_options=True, expert=_expert) + KgtkReaderOptions.add_arguments(parser, mode_options=True, who="input", expert=_expert, defaults=False) + KgtkReaderOptions.add_arguments(parser, mode_options=True, who="filter", expert=_expert, defaults=False) KgtkValueOptions.add_arguments(parser, expert=_expert) def run(input_kgtk_file: typing.Optional[Path], @@ -73,6 +69,7 @@ def run(input_kgtk_file: typing.Optional[Path], errors_to_stdout: bool = False, errors_to_stderr: bool = True, + show_options: bool = False, verbose: bool = False, very_verbose: bool = False, @@ -90,6 +87,20 @@ def run(input_kgtk_file: typing.Optional[Path], filter_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs, who="filter", fallback=True) value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs) + # Show the final option structures for debugging and documentation. 
+ if show_options: + print("input: %s" % (str(input_kgtk_file) if input_kgtk_file is not None else "-"), file=error_file) + if input_keys is not None: + print("--input-keys=%s" % " ".join(input_keys), file=error_file) + print("--filter-on=%s" % (str(filter_kgtk_file) if filter_kgtk_file is not None else "-"), file=error_file) + if filter_keys is not None: + print("--filter-keys=%s" % " ".join(filter_keys), file=error_file) + print("--output-file=%s" % (str(output_kgtk_file) if output_kgtk_file is not None else "-"), file=error_file) + print("--field-separator='%s'" % str(field_separator), file=error_file) + input_reader_options.show(out=error_file, who="input") + filter_reader_options.show(out=error_file, who="filter") + print("=======", file=error_file, flush=True) + try: ie: IfExists = IfExists( input_file_path=input_kgtk_file, diff --git a/kgtk/cli/join.py b/kgtk/cli/join.py index 7e88b5449..3be24680b 100644 --- a/kgtk/cli/join.py +++ b/kgtk/cli/join.py @@ -19,9 +19,10 @@ def parser(): return { 'help': 'Join two KGTK files', 'description': """Join two KGTK edge files or two KGTK node files. + Join keys are extracted from one or both input files and stored in memory, -then the data is processed in a second pass. -stdin will not work as an input file if two passes are needed. +then the data files are processed in a second pass. stdin will not work as an +input file if join keys are needed from it. The output file contains the union of the columns in the two input files, adjusted for predefined name aliasing. @@ -31,6 +32,16 @@ def parser(): Specify both to get a full outer join (equivalent to cat). Specify neither to get an inner join. +By default, node files are joined on the id column, while edge files are joined +on the node1 column. The label and node2 columns may be added to the edge file +join criteria. Alternatively, the left and right file join columns may be +listed explicitly. + +To join an edge file to a node file, or to join quasi-KGTK files, use the +following option (enable expert mode for more information): + +--mode=NONE + Expert mode provides additional command arguments. """ } @@ -87,8 +98,8 @@ def h(msg: str)->str: # files, or for all files. KgtkReader.add_debug_arguments(parser, expert=_expert) KgtkReaderOptions.add_arguments(parser, mode_options=True, expert=_expert) - KgtkReaderOptions.add_arguments(parser, mode_options=True, who="left", expert=_expert) - KgtkReaderOptions.add_arguments(parser, mode_options=True, who="right", expert=_expert) + KgtkReaderOptions.add_arguments(parser, mode_options=True, who="left", expert=_expert, defaults=False) + KgtkReaderOptions.add_arguments(parser, mode_options=True, who="right", expert=_expert, defaults=False) KgtkValueOptions.add_arguments(parser, expert=_expert) def run(left_file_path: typing.Optional[Path], @@ -106,6 +117,7 @@ def run(left_file_path: typing.Optional[Path], errors_to_stdout: bool = False, errors_to_stderr: bool = True, + show_options: bool = False, verbose: bool = False, very_verbose: bool = False, @@ -144,6 +156,12 @@ def run(left_file_path: typing.Optional[Path], right_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs, who="right", fallback=True) value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs) + # Show the final option structures for debugging and documentation. + if show_options: + # TODO: left_file_path, right_file_path, --join-on-label, etc. 
+ left_reader_options.show(out=error_file, who="left")
+ right_reader_options.show(out=error_file, who="right")
+
try:
kr: KgtkJoiner = KgtkJoiner(
left_file_path=left_file_path,
diff --git a/kgtk/cli/validate.py b/kgtk/cli/validate.py
index 9e16bd90d..299b7018b 100644
--- a/kgtk/cli/validate.py
+++ b/kgtk/cli/validate.py
@@ -40,7 +40,7 @@ def add_arguments_extended(parser: KGTKArgumentParser, parsed_shared_args: Names
help="Process only the header of the input file.", action="store_true")

KgtkReader.add_debug_arguments(parser, expert=_expert)
- KgtkReaderOptions.add_arguments(parser, mode_options=True, validate_by_default=True, expert=True)
+ KgtkReaderOptions.add_arguments(parser, mode_options=True, validate_by_default=True, expert=_expert)
KgtkValueOptions.add_arguments(parser, expert=True)

@@ -48,6 +48,7 @@ def run(kgtk_files: typing.Optional[typing.List[typing.Optional[Path]]],
errors_to_stdout: bool = False,
errors_to_stderr: bool = False,
header_only: bool = False,
+ show_options: bool = False,
verbose: bool = False,
very_verbose: bool = False,
**kwargs # Whatever KgtkReaderOptions and KgtkValueOptions want.
@@ -65,6 +66,13 @@ def run(kgtk_files: typing.Optional[typing.List[typing.Optional[Path]]],
reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

+ # Show the final option structures for debugging and documentation.
+ if show_options:
+ print("input: %s" % " ".join((str(kgtk_file) for kgtk_file in kgtk_files)), file=error_file)
+ print("--header-only=%s" % str(header_only), file=error_file)
+ reader_options.show(out=error_file)
+ print("=======", file=error_file, flush=True)
+
try:
kgtk_file: typing.Optional[Path]
for kgtk_file in kgtk_files:
diff --git a/kgtk/io/kgtkreader.py b/kgtk/io/kgtkreader.py
index 23f8e6c0c..fdcd882d4 100644
--- a/kgtk/io/kgtkreader.py
+++ b/kgtk/io/kgtkreader.py
@@ -103,6 +103,7 @@ def add_arguments(cls,
mode_options: bool = False,
validate_by_default: bool = False,
expert: bool = False,
+ defaults: bool = True,
who: str = ""):

# This helper function makes it easy to suppress options from
@@ -114,6 +115,25 @@ def h(msg: str)->str:
else:
return SUPPRESS

+ # This helper function decides whether or not to include defaults
+ # in argument declarations. If we plan to make arguments with
+ # prefixes and fallbacks, the fallbacks (the ones without prefixes)
+ # should get default values, while the prefixed arguments should
+ # not get defaults.
+ #
+ # At the present time, boolean arguments can't use fallbacks.
+ #
+ # Note: In obscure circumstances (EnumNameAction, I'm looking at you),
+ # explicitly setting "default=None" may fail, whereas omitting the
+ # "default=" phrase succeeds.
+ #
+ # TODO: continue researching these issues.
+ def d(default: typing.Any)->typing.Mapping[str, typing.Any]:
+ if defaults:
+ return {"default": default}
+ else:
+ return { }
+
prefix1: str = "--" if len(who) == 0 else "--" + who + "-"
prefix2: str = "" if len(who) == 0 else who + "_"
prefix3: str = "" if len(who) == 0 else who + ": "
@@ -124,8 +144,9 @@ def h(msg: str)->str:
fgroup.add_argument(prefix1 + "column-separator",
dest=prefix2 + "column_separator",
help=h(prefix3 + "Column separator (default=)."), # TODO: provide the default with escapes, e.g. \t
- type=str, default=KgtkFormat.COLUMN_SEPARATOR)
+ type=str, **d(default=KgtkFormat.COLUMN_SEPARATOR))

+ # TODO: use an Enum or add choices.
fgroup.add_argument(prefix1 + "compression-type", dest=prefix2 + "compression_type", help=h(prefix3 + "Specify the compression type (default=%(default)s).")) @@ -133,7 +154,7 @@ def h(msg: str)->str: fgroup.add_argument(prefix1 + "error-limit", dest=prefix2 + "error_limit", help=h(prefix3 + "The maximum number of errors to report before failing (default=%(default)s)"), - type=int, default=cls.ERROR_LIMIT_DEFAULT) + type=int, **d(default=cls.ERROR_LIMIT_DEFAULT)) fgroup.add_argument(prefix1 + "gzip-in-parallel", dest=prefix2 + "gzip_in_parallel", @@ -143,13 +164,13 @@ def h(msg: str)->str: fgroup.add_argument(prefix1 + "gzip-queue-size", dest=prefix2 + "gzip_queue_size", help=h(prefix3 + "Queue size for parallel gzip (default=%(default)s)."), - type=int, default=cls.GZIP_QUEUE_SIZE_DEFAULT) + type=int, **d(default=cls.GZIP_QUEUE_SIZE_DEFAULT)) if mode_options: fgroup.add_argument(prefix1 + "mode", dest=prefix2 + "mode", help=h(prefix3 + "Determine the KGTK file mode (default=%(default)s)."), - type=KgtkReaderMode, action=EnumNameAction, default=KgtkReaderMode.AUTO) + type=KgtkReaderMode, action=EnumNameAction, **d(KgtkReaderMode.AUTO)) hgroup: _ArgumentGroup = parser.add_argument_group(h(prefix3 + "Header parsing"), h("Options affecting " + prefix4 + "header parsing")) @@ -162,7 +183,7 @@ def h(msg: str)->str: hgroup.add_argument(prefix1 + "header-error-action", dest=prefix2 + "header_error_action", help=h(prefix3 + "The action to take when a header error is detected. Only ERROR or EXIT are supported (default=%(default)s)."), - type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXIT) + type=ValidationAction, action=EnumNameAction, **d(default=ValidationAction.EXIT)) hgroup.add_argument(prefix1 + "skip-first-record", dest=prefix2 + "skip_first_record", @@ -172,7 +193,7 @@ def h(msg: str)->str: hgroup.add_argument(prefix1 + "unsafe-column-name-action", dest=prefix2 + "unsafe_column_name_action", help=h(prefix3 + "The action to take when a column name is unsafe (default=%(default)s)."), - type=ValidationAction, action=EnumNameAction, default=ValidationAction.REPORT) + type=ValidationAction, action=EnumNameAction, **d(default=ValidationAction.REPORT)) lgroup: _ArgumentGroup = parser.add_argument_group(h(prefix3 + "Line parsing"), h("Options affecting " + prefix4 + "data line parsing")) @@ -200,17 +221,17 @@ def h(msg: str)->str: lgroup.add_argument(prefix1 + "blank-required-field-line-action", dest=prefix2 + "blank_required_field_line_action", help=h(prefix3 + "The action to take when a line with a blank node1, node2, or id field (per mode) is detected (default=%(default)s)."), - type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXCLUDE) + type=ValidationAction, action=EnumNameAction, **d(default=ValidationAction.EXCLUDE)) lgroup.add_argument(prefix1 + "comment-line-action", dest=prefix2 + "comment_line_action", help=h(prefix3 + "The action to take when a comment line is detected (default=%(default)s)."), - type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXCLUDE) + type=ValidationAction, action=EnumNameAction, **d(default=ValidationAction.EXCLUDE)) lgroup.add_argument(prefix1 + "empty-line-action", dest=prefix2 + "empty_line_action", help=h(prefix3 + "The action to take when an empty line is detected (default=%(default)s)."), - type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXCLUDE) + type=ValidationAction, action=EnumNameAction, **d(default=ValidationAction.EXCLUDE)) lgroup.add_argument(prefix1 + 
"fill-short-lines", dest=prefix2 + "fill_short_lines", @@ -220,17 +241,17 @@ def h(msg: str)->str: lgroup.add_argument(prefix1 + "invalid-value-action", dest=prefix2 + "invalid_value_action", help=h(prefix3 + "The action to take when a data cell value is invalid (default=%(default)s)."), - type=ValidationAction, action=EnumNameAction, default=ValidationAction.REPORT) + type=ValidationAction, action=EnumNameAction, **d(default=ValidationAction.REPORT)) lgroup.add_argument(prefix1 + "long-line-action", dest=prefix2 + "long_line_action", help=h(prefix3 + "The action to take when a long line is detected (default=%(default)s)."), - type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXCLUDE) + type=ValidationAction, action=EnumNameAction, **d(default=ValidationAction.EXCLUDE)) lgroup.add_argument(prefix1 + "short-line-action", dest=prefix2 + "short_line_action", help=h(prefix3 + "The action to take when a short line is detected (default=%(default)s)."), - type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXCLUDE) + type=ValidationAction, action=EnumNameAction, **d(default=ValidationAction.EXCLUDE)) lgroup.add_argument(prefix1 + "truncate-long-lines", dest=prefix2 + "truncate_long_lines", @@ -240,7 +261,7 @@ def h(msg: str)->str: lgroup.add_argument(prefix1 + "whitespace-line-action", dest=prefix2 + "whitespace_line_action", help=h(prefix3 + "The action to take when a whitespace line is detected (default=%(default)s)."), - type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXCLUDE) + type=ValidationAction, action=EnumNameAction, **d(default=ValidationAction.EXCLUDE)) @classmethod # Build the value parsing option structure. @@ -257,9 +278,9 @@ def from_dict(cls, # TODO: Figure out how to type check this method. 
def lookup(name: str, default): prefixed_name = prefix + name - if prefixed_name in d: + if prefixed_name in d and d[prefixed_name] is not None: return d[prefixed_name] - elif fallback and name in d: + elif fallback and name in d and d[name] is not None: return d[name] else: return default @@ -304,6 +325,34 @@ def from_args(cls, )->'KgtkReaderOptions': return cls.from_dict(vars(args), who=who, mode=mode, fallback=fallback) + def show(self, who: str="", out: typing.TextIO=sys.stderr): + prefix: str = "--" if len(who) == 0 else "--" + who + "-" + print("%smode=%s" % (prefix, self.mode.name), file=out) + print("%scolumn-separator='%s'" % (prefix, self.column_separator), file=out) + if self.force_column_names is not None: + print("%sforce_column_names=%s" % (prefix, " ".join(self.force_column_names)), file=out) + print("%sskip_first_record=%s" % (prefix, str(self.skip_first_record)), file=out) + print("%serror-limit=%s" % (prefix, str(self.error_limit)), file=out) + print("%srepair-and-validate-lines=%s" % (prefix, str(self.repair_and_validate_lines)), file=out) + print("%srepair-and-validate-values=%s" % (prefix, str(self.repair_and_validate_values)), file=out) + print("%sempty-line-action=%s" % (prefix, self.empty_line_action.name), file=out) + print("%scomment-line-action=%s" % (prefix, self.comment_line_action.name), file=out) + print("%swhitespace-line-action=%s" % (prefix, self.whitespace_line_action.name), file=out) + print("%sblank-required-field-line-action=%s" % (prefix, self.blank_required_field_line_action.name), file=out) + print("%sshort-line-action=%s" % (prefix, self.short_line_action.name), file=out) + print("%slong-line-action=%s" % (prefix, self.long_line_action.name), file=out) + print("%sheader-error-action=%s" % (prefix, self.header_error_action.name), file=out) + print("%sunsafe-column-name-action=%s" % (prefix, self.unsafe_column_name_action.name), file=out) + print("%sinvalid-value-action=%s" % (prefix, self.invalid_value_action.name), file=out) + print("%sfill-short-lines=%s" % (prefix, str(self.fill_short_lines)), file=out) + print("%struncate-long-lines=%s" % (prefix, str(self.truncate_long_lines)), file=out) + if self.compression_type is not None: + print("%scompression-type=%s" % (prefix, str(self.compression_type)), file=out) + print("%sgzip-in-parallel=%s" % (prefix, str(self.gzip_in_parallel)), file=out) + print("%sgzip-queue-size=%s" % (prefix, str(self.gzip_queue_size)), file=out) + + + DEFAULT_KGTK_READER_OPTIONS: KgtkReaderOptions = KgtkReaderOptions() @@ -959,6 +1008,12 @@ def h(msg: str)->str: # Avoid the argparse bug that prevents these two arguments from having # their help messages suppressed directly. + # + # TODO: Is there a better fix? + # + # TODO: replace --errors-to-stdout and --errors-to-stderr with + # --errors-to=stdout and --errors-to=stderr, using either an enum + # or choices. That will avoid the argparse bug, too. 
if expert: errors_to = egroup.add_mutually_exclusive_group() errors_to.add_argument( "--errors-to-stdout", dest="errors_to_stdout", @@ -968,17 +1023,19 @@ def h(msg: str)->str: help="Send errors to stderr instead of stdout", action="store_true") else: - egroup.add_argument( "--errors-to-stdout", dest="errors_to_stdout", - help=h("Send errors to stdout instead of stderr"), - action="store_true") egroup.add_argument( "--errors-to-stderr", dest="errors_to_stderr", help=h("Send errors to stderr instead of stdout"), action="store_true") + egroup.add_argument( "--errors-to-stdout", dest="errors_to_stdout", + help=h("Send errors to stdout instead of stderr"), + action="store_true") + + egroup.add_argument( "--show-options", dest="show_options", help="Print the options selected (default=%(default)s).", action='store_true') - egroup.add_argument("-v", "--verbose", dest="verbose", help="Print additional progress messages.", action='store_true') + egroup.add_argument("-v", "--verbose", dest="verbose", help="Print additional progress messages (default=%(default)s).", action='store_true') egroup.add_argument( "--very-verbose", dest="very_verbose", - help=h("Print additional progress messages."), + help=h("Print additional progress messages (default=%(default)s)."), action='store_true') def main(): @@ -1003,7 +1060,7 @@ def main(): KgtkReaderOptions.add_arguments(parser, mode_options=True, validate_by_default=True, expert=True) KgtkValueOptions.add_arguments(parser, expert=True) - args = parser.parse_args() + args: Namespace = parser.parse_args() error_file: typing.TextIO = sys.stdout if args.errors_to_stdout else sys.stderr @@ -1011,6 +1068,12 @@ def main(): reader_options: KgtkReaderOptions = KgtkReaderOptions.from_args(args) value_options: KgtkValueOptions = KgtkValueOptions.from_args(args) + if args.show_options: + print("--test=%s" % str(args.test), file=error_file) + print("--test-validate=%s" % str(args.test_validate), file=error_file) + reader_options.show(out=error_file) + print("=======", file=error_file, flush=True) + kr: KgtkReader = KgtkReader.open(args.kgtk_file, error_file = error_file, options=reader_options, diff --git a/kgtk/join/ifexists.py b/kgtk/join/ifexists.py index 26f1f1965..48e7d2edc 100644 --- a/kgtk/join/ifexists.py +++ b/kgtk/join/ifexists.py @@ -71,7 +71,7 @@ def get_primary_key_column(self, kr: KgtkReader, who: str)->typing.List[int]: raise ValueError("The node1 column is missing from the %s node file." % who) return [ kr.node1_column_idx ] else: - raise ValueError("The %s file is neither edge nore node." % who) + raise ValueError("The %s file is neither edge nor node." % who) def get_edge_key_columns(self, kr: KgtkReader, who: str)-> typing.List[int]: if not kr.is_edge_file: @@ -97,6 +97,9 @@ def get_key_columns(self, supplied_keys: typing.Optional[typing.List[str]], kr: if supplied_keys is not None and len(supplied_keys) > 0: return self.get_supplied_key_columns(supplied_keys, kr, who) + if not (kr.is_node_file or kr.is_edge_file): + raise ValueError("The %s file is a quasi-KGTK file. Please supply its keys." % who) + if kr.is_node_file or other_kr.is_node_file: return self.get_primary_key_column(kr, who) @@ -230,6 +233,11 @@ def main(): filter_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_args(args, who="filter") value_options: KgtkValueOptions = KgtkValueOptions.from_args(args) + # Show the final option structures for debugging and documentation. 
+ if show_options: + input_reader_options.show(out=error_file, who="input") + filter_reader_options.show(out=error_file, who="filter") + ie: IfExists = IfExists( input_file_path=args.input_file_path, input_keys=args.input_keys, diff --git a/kgtk/join/kgtkcat.py b/kgtk/join/kgtkcat.py index cb0d6ba8d..944c4cabe 100644 --- a/kgtk/join/kgtkcat.py +++ b/kgtk/join/kgtkcat.py @@ -171,6 +171,10 @@ def main(): reader_options: KgtkReaderOptions = KgtkReaderOptions.from_args(args) value_options: KgtkValueOptions = KgtkValueOptions.from_args(args) + # Show the final option structures for debugging and documentation. + if show_options: + reader_options.show(out=error_file) + kc: KgtkCat = KgtkCat(input_file_paths=args.input_file_paths, output_path=args.output_file_path, reader_options=reader_options, diff --git a/kgtk/join/kgtkjoiner.py b/kgtk/join/kgtkjoiner.py index a1662ae0e..e7d7574da 100644 --- a/kgtk/join/kgtkjoiner.py +++ b/kgtk/join/kgtkjoiner.py @@ -131,7 +131,7 @@ def build_join_idx_list(self, kr: KgtkReader, who: str, join_columns: typing.Opt print("Joining on id (index %s in the %s input file)" % (join_idx, who), file=self.error_file, flush=True) join_idx_list.append(join_idx) else: - raise ValueError("Unknown file type in build_join_idx_list(...)") + raise ValueError("Quasi-KGTK files require an explicit list of join columns") # join_on_label and join_on_node2 may be specified if self.join_on_label or self.join_on_node2: @@ -224,6 +224,11 @@ def ok_to_join(self, left_kr: KgtkReader, right_kr: KgtkReader)->bool: print("Both input files are node files.", file=self.error_file, flush=True) return True + elif (not (left_kr.is_node_file or left_kr.is_edge_file)) or (not(right_kr.is_edge_file or right_kr.is_node_file)): + if self.verbose: + print("One or both input files are quasi-KGTK files.", file=self.error_file, flush=True) + return True + else: print("Cannot join edge and node files.", file=self.error_file, flush=True) return False @@ -375,6 +380,11 @@ def main(): right_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_args(args, who=KgtkJoiner.RIGHT) value_options: KgtkValueOptions = KgtkValueOptions.from_args(args) + # Show the final option structures for debugging and documentation. + if args.show_options: + left_reader_options.show(out=error_file, who="left") + right_reader_options.show(out=error_file, who="right") + ej: KgtkJoiner = KgtkJoiner(left_file_path=args.left_file_path, right_file_path=args.right_file_path, output_path=args.output_file_path, From cfecb9768467c2ea5f5f4e86e1bf341f4b69c74f Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Wed, 13 May 2020 16:29:46 -0700 Subject: [PATCH 196/278] Better feedback for separation characters. 
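Swapping str() for repr() when echoing separator options makes whitespace separators visible instead of printing the raw character. For example, in plain Python:

    sep = "\t"
    print("--field-separator='%s'" % str(sep))   # embeds a real tab between the quotes
    print("--field-separator=%s" % repr(sep))    # prints --field-separator='\t', visibly escaped

The hunks below apply this to the separator feedback lines.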
--- kgtk/cli/ifexists.py | 2 +- kgtk/cli/ifnotexists.py | 2 +- kgtk/cli/join.py | 15 +++++++++++++++ kgtk/io/kgtkreader.py | 2 +- 4 files changed, 18 insertions(+), 3 deletions(-) diff --git a/kgtk/cli/ifexists.py b/kgtk/cli/ifexists.py index 288926305..ec24e8104 100644 --- a/kgtk/cli/ifexists.py +++ b/kgtk/cli/ifexists.py @@ -97,7 +97,7 @@ def run(input_kgtk_file: typing.Optional[Path], if filter_keys is not None: print("--filter-keys=%s" % " ".join(filter_keys), file=error_file) print("--output-file=%s" % (str(output_kgtk_file) if output_kgtk_file is not None else "-"), file=error_file) - print("--field-separator='%s'" % str(field_separator), file=error_file) + print("--field-separator=%s" % repr(field_separator), file=error_file) input_reader_options.show(out=error_file, who="input") filter_reader_options.show(out=error_file, who="filter") print("=======", file=error_file, flush=True) diff --git a/kgtk/cli/ifnotexists.py b/kgtk/cli/ifnotexists.py index dd29b6ec8..ba0d31242 100644 --- a/kgtk/cli/ifnotexists.py +++ b/kgtk/cli/ifnotexists.py @@ -96,7 +96,7 @@ def run(input_kgtk_file: typing.Optional[Path], if filter_keys is not None: print("--filter-keys=%s" % " ".join(filter_keys), file=error_file) print("--output-file=%s" % (str(output_kgtk_file) if output_kgtk_file is not None else "-"), file=error_file) - print("--field-separator='%s'" % str(field_separator), file=error_file) + print("--field-separator='%s'" % repr(field_separator), file=error_file) input_reader_options.show(out=error_file, who="input") filter_reader_options.show(out=error_file, who="filter") print("=======", file=error_file, flush=True) diff --git a/kgtk/cli/join.py b/kgtk/cli/join.py index 3be24680b..aa0e8c939 100644 --- a/kgtk/cli/join.py +++ b/kgtk/cli/join.py @@ -159,6 +159,21 @@ def run(left_file_path: typing.Optional[Path], # Show the final option structures for debugging and documentation. if show_options: # TODO: left_file_path, right_file_path, --join-on-label, etc. 
+ print("left: %s" % (str(left_file_path) if left_file_path is not None else "-"), file=error_file)
+ print("right: %s" % (str(right_file_path) if right_file_path is not None else "-"), file=error_file)
+ print("--output-file=%s" % (str(output_file_path) if output_file_path is not None else "-"), file=error_file)
+ print("--left-join=%s" % str(left_join), file=error_file)
+ print("--right-join=%s" % str(right_join), file=error_file)
+ print("--join-on-label=%s" % str(join_on_label), file=error_file)
+ print("--join-on-node2=%s" % str(join_on_node2), file=error_file)
+ if left_join_columns is not None:
+ print("--left-join-columns=%s" % " ".join(left_join_columns), file=error_file)
+ if right_join_columns is not None:
+ print("--right-join-columns=%s" % " ".join(right_join_columns), file=error_file)
+ if prefix is not None:
+ print("--prefix=%s" % str(prefix), file=error_file)
+ print("--field-separator=%s" % repr(field_separator), file=error_file)
+
left_reader_options.show(out=error_file, who="left")
right_reader_options.show(out=error_file, who="right")

diff --git a/kgtk/io/kgtkreader.py b/kgtk/io/kgtkreader.py
index fdcd882d4..fe6b9fea0 100644
--- a/kgtk/io/kgtkreader.py
+++ b/kgtk/io/kgtkreader.py
@@ -328,7 +328,7 @@ def from_args(cls,
def show(self, who: str="", out: typing.TextIO=sys.stderr):
prefix: str = "--" if len(who) == 0 else "--" + who + "-"
print("%smode=%s" % (prefix, self.mode.name), file=out)
- print("%scolumn-separator='%s'" % (prefix, self.column_separator), file=out)
+ print("%scolumn-separator=%s" % (prefix, repr(self.column_separator)), file=out)
if self.force_column_names is not None:
print("%sforce_column_names=%s" % (prefix, " ".join(self.force_column_names)), file=out)
print("%sskip_first_record=%s" % (prefix, str(self.skip_first_record)), file=out)

From cfe8ab172e5ef2836974429e7b7f15f1eea4847c Mon Sep 17 00:00:00 2001
From: Craig Milo Rogers
Date: Wed, 13 May 2020 16:49:02 -0700
Subject: [PATCH 197/278] Show value options when requested.
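With this commit, commands that accept --show-options dump the value-parsing options alongside the reader options. A typical invocation might produce output along these lines (abridged; the exact lines depend on the defaults in kgtkvalueoptions.py):

    kgtk validate --show-options file.tsv
    # ...
    # --allow-month-or-day-zero=False
    # --minimum-valid-year=1583
    # --maximum-valid-year=2100
    # ...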
--- kgtk/cli/clean_data.py | 1 + kgtk/cli/ifexists.py | 1 + kgtk/cli/ifnotexists.py | 1 + kgtk/cli/join.py | 1 + kgtk/cli/validate.py | 1 + kgtk/io/edgereader.py | 5 +++++ kgtk/io/kgtkreader.py | 1 + kgtk/io/nodereader.py | 5 +++++ kgtk/join/ifexists.py | 1 + kgtk/join/kgtkcat.py | 1 + kgtk/join/kgtkjoiner.py | 1 + kgtk/value/kgtkvalueoptions.py | 33 ++++++++++++++++++++++++--------- 12 files changed, 43 insertions(+), 9 deletions(-) diff --git a/kgtk/cli/clean_data.py b/kgtk/cli/clean_data.py index aa22c512b..51ac44f66 100644 --- a/kgtk/cli/clean_data.py +++ b/kgtk/cli/clean_data.py @@ -62,6 +62,7 @@ def run(input_file: typing.Optional[Path], print("input: %s" % (str(input_file) if input_file is not None else "-"), file=error_file) print("output: %s" % (str(output_file) if output_file is not None else "-"), file=error_file) reader_options.show(out=error_file) + value_options.show(out=error_file) print("=======", file=error_file, flush=True) if verbose: diff --git a/kgtk/cli/ifexists.py b/kgtk/cli/ifexists.py index ec24e8104..fcb4e42c6 100644 --- a/kgtk/cli/ifexists.py +++ b/kgtk/cli/ifexists.py @@ -100,6 +100,7 @@ def run(input_kgtk_file: typing.Optional[Path], print("--field-separator=%s" % repr(field_separator), file=error_file) input_reader_options.show(out=error_file, who="input") filter_reader_options.show(out=error_file, who="filter") + value_options.show(out=error_file) print("=======", file=error_file, flush=True) try: diff --git a/kgtk/cli/ifnotexists.py b/kgtk/cli/ifnotexists.py index ba0d31242..1ee8f6534 100644 --- a/kgtk/cli/ifnotexists.py +++ b/kgtk/cli/ifnotexists.py @@ -99,6 +99,7 @@ def run(input_kgtk_file: typing.Optional[Path], print("--field-separator='%s'" % repr(field_separator), file=error_file) input_reader_options.show(out=error_file, who="input") filter_reader_options.show(out=error_file, who="filter") + value_options.show(out=error_file) print("=======", file=error_file, flush=True) try: diff --git a/kgtk/cli/join.py b/kgtk/cli/join.py index aa0e8c939..0ef4a57e9 100644 --- a/kgtk/cli/join.py +++ b/kgtk/cli/join.py @@ -176,6 +176,7 @@ def run(left_file_path: typing.Optional[Path], left_reader_options.show(out=error_file, who="left") right_reader_options.show(out=error_file, who="right") + value_options.show(out=error_file) try: kr: KgtkJoiner = KgtkJoiner( diff --git a/kgtk/cli/validate.py b/kgtk/cli/validate.py index 299b7018b..0282fa144 100644 --- a/kgtk/cli/validate.py +++ b/kgtk/cli/validate.py @@ -71,6 +71,7 @@ def run(kgtk_files: typing.Optional[typing.List[typing.Optional[Path]]], print("input: %s" % " ".join((str(kgtk_file) for kgtk_file in kgtk_files)), file=error_file) print("--header-only=%s" % str(header_only), file=error_file) reader_options.show(out=error_file) + value_options.show(out=error_file) print("=======", file=error_file, flush=True) try: diff --git a/kgtk/io/edgereader.py b/kgtk/io/edgereader.py index 3225c4579..bef4f29ec 100644 --- a/kgtk/io/edgereader.py +++ b/kgtk/io/edgereader.py @@ -89,6 +89,11 @@ def main(): reader_options: KgtkReaderOptions = KgtkReaderOptions.from_args(args, mode=KgtkReaderMode.EDGE) value_options: KgtkValueOptions = KgtkValueOptions.from_args(args) + if args.show_options: + reader_options.show(out=error_file) + value_options.show(out=error_file) + print("=======", file=error_file, flush=True) + # Force the edge mode: er: EdgeReader = EdgeReader.open_edge_file(args.kgtk_file, error_file=error_file, diff --git a/kgtk/io/kgtkreader.py b/kgtk/io/kgtkreader.py index fe6b9fea0..afdb45c51 100644 --- 
a/kgtk/io/kgtkreader.py +++ b/kgtk/io/kgtkreader.py @@ -1072,6 +1072,7 @@ def main(): print("--test=%s" % str(args.test), file=error_file) print("--test-validate=%s" % str(args.test_validate), file=error_file) reader_options.show(out=error_file) + value_options.show(out=error_file) print("=======", file=error_file, flush=True) kr: KgtkReader = KgtkReader.open(args.kgtk_file, diff --git a/kgtk/io/nodereader.py b/kgtk/io/nodereader.py index bf74beb85..ce389aada 100644 --- a/kgtk/io/nodereader.py +++ b/kgtk/io/nodereader.py @@ -79,6 +79,11 @@ def main(): reader_options: KgtkReaderOptions = KgtkReaderOptions.from_args(args, mode=KgtkReaderMode.NODE) value_options: KgtkValueOptions = KgtkValueOptions.from_args(args) + if args.show_options: + reader_options.show(out=error_file) + value_options.show(out=error_file) + print("=======", file=error_file, flush=True) + nr: NodeReader = NodeReader.open_node_file(args.kgtk_file, error_file=error_file, options=reader_options, diff --git a/kgtk/join/ifexists.py b/kgtk/join/ifexists.py index 48e7d2edc..509639ffc 100644 --- a/kgtk/join/ifexists.py +++ b/kgtk/join/ifexists.py @@ -237,6 +237,7 @@ def main(): if show_options: input_reader_options.show(out=error_file, who="input") filter_reader_options.show(out=error_file, who="filter") + value_options.show(out=error_file) ie: IfExists = IfExists( input_file_path=args.input_file_path, diff --git a/kgtk/join/kgtkcat.py b/kgtk/join/kgtkcat.py index 944c4cabe..4327b78d1 100644 --- a/kgtk/join/kgtkcat.py +++ b/kgtk/join/kgtkcat.py @@ -174,6 +174,7 @@ def main(): # Show the final option structures for debugging and documentation. if show_options: reader_options.show(out=error_file) + value_options.show(out=error_file) kc: KgtkCat = KgtkCat(input_file_paths=args.input_file_paths, output_path=args.output_file_path, diff --git a/kgtk/join/kgtkjoiner.py b/kgtk/join/kgtkjoiner.py index e7d7574da..2f32d1281 100644 --- a/kgtk/join/kgtkjoiner.py +++ b/kgtk/join/kgtkjoiner.py @@ -384,6 +384,7 @@ def main(): if args.show_options: left_reader_options.show(out=error_file, who="left") right_reader_options.show(out=error_file, who="right") + value_options.show(out=error_file) ej: KgtkJoiner = KgtkJoiner(left_file_path=args.left_file_path, right_file_path=args.right_file_path, diff --git a/kgtk/value/kgtkvalueoptions.py b/kgtk/value/kgtkvalueoptions.py index 5cb8e7526..704048954 100644 --- a/kgtk/value/kgtkvalueoptions.py +++ b/kgtk/value/kgtkvalueoptions.py @@ -4,6 +4,7 @@ from argparse import ArgumentParser, Namespace, SUPPRESS import attr +import sys import typing @attr.s(slots=True, frozen=True) @@ -188,6 +189,10 @@ def from_dict(cls, d: dict, who: str = "")->'KgtkValueOptions': additional_language_codes=d.get(prefix + "additional_language_codes", None), minimum_valid_year=d.get(prefix + "minimum_valid_year", cls.MINIMUM_VALID_YEAR), maximum_valid_year=d.get(prefix + "maximum_valid_year", cls.MAXIMUM_VALID_YEAR), + minimum_valid_lat=d.get(prefix + "minimum_valid_lat", cls.MINIMUM_VALID_LAT), + maximum_valid_lat=d.get(prefix + "maximum_valid_lat", cls.MAXIMUM_VALID_LAT), + minimum_valid_lon=d.get(prefix + "minimum_valid_lon", cls.MINIMUM_VALID_LON), + maximum_valid_lon=d.get(prefix + "maximum_valid_lon", cls.MAXIMUM_VALID_LON), escape_list_separators=d.get(prefix + "escape_list_separators", False)) @classmethod @@ -195,6 +200,23 @@ def from_dict(cls, d: dict, who: str = "")->'KgtkValueOptions': def from_args(cls, args: Namespace, who: str = "")->'KgtkValueOptions': return cls.from_dict(vars(args), who=who) + def show(self, who: 
str="", out: typing.TextIO=sys.stderr): + prefix: str = "--" if len(who) == 0 else "--" + who + "-" + print("%sallow-month-or-day-zero=%s" % (prefix, str(self.allow_month_or_day_zero)), file=out) + print("%srepair-month-or-day-zero=%s" % (prefix, str(self.repair_month_or_day_zero)), file=out) + print("%sallow-language-suffixes=%s" % (prefix, str(self.allow_language_suffixes)), file=out) + print("%sallow-lax-strings=%s" % (prefix, str(self.allow_lax_strings)), file=out) + print("%sallow-lax-lq-strings=%s" % (prefix, str(self.allow_lax_lq_strings)), file=out) + if self.additional_language_codes is not None: + print("%sadditional-language-codes=%s" % (prefix, " ".join(self.additional_language_codes)), file=out) + print("%sminimum-valid-year=%d" % (prefix, self.minimum_valid_year), file=out) + print("%smaximum-valid-year=%d" % (prefix, self.maximum_valid_year), file=out) + print("%sminimum-valid-lat=%f" % (prefix, self.minimum_valid_lat), file=out) + print("%smaximum-valid-lat=%f" % (prefix, self.maximum_valid_lat), file=out) + print("%sminimum-valid-lon=%f" % (prefix, self.minimum_valid_lon), file=out) + print("%smaximum-valid-lon=%f" % (prefix, self.maximum_valid_lon), file=out) + print("%sescape-list-separators=%s" % (prefix, str(self.escape_list_separators)), file=out) + DEFAULT_KGTK_VALUE_OPTIONS: KgtkValueOptions = KgtkValueOptions() def main(): @@ -210,15 +232,8 @@ def main(): # Build the value parsing option structure. value_options: KgtkValueOptions = KgtkValueOptions.from_args(args) - print("allow_month_or_day_zero: %s" % str(value_options.allow_month_or_day_zero)) - print("allow_lax_strings: %s" % str(value_options.allow_lax_strings)) - print("allow_lax_lq_strings: %s" % str(value_options.allow_lax_lq_strings)) - print("allow_language_suffixes: %s" % str(value_options.allow_language_suffixes)) - if value_options.additional_language_codes is None: - print("additional_language_codes: None") - else: - print("additional_language_codes: [ %s ]" % ", ".join(value_options.additional_language_codes)) - + value_options.show() + # Test prefixed value option processing. left_value_options: KgtkValueOptions = KgtkValueOptions.from_args(args, who="left") print("left_allow_month_or_day_zero: %s" % str(left_value_options.allow_month_or_day_zero)) From ed5d0a155e89bac8ac95fa4448a69339ab2ed771 Mon Sep 17 00:00:00 2001 From: Rongpeng Date: Wed, 13 May 2020 17:23:28 -0700 Subject: [PATCH 198/278] fixed a value URI parsing error for Globecoordinate --- kgtk/triple_generator.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kgtk/triple_generator.py b/kgtk/triple_generator.py index c79789d3b..2b6edd5f0 100644 --- a/kgtk/triple_generator.py +++ b/kgtk/triple_generator.py @@ -270,9 +270,10 @@ def generate_normal_triple( elif edge_type == GlobeCoordinate: latitude, longitude = node2[1:].split("/") + latitude = float(latitude) + longitude = float(longitude) object = GlobeCoordinate( - latitude, longitude, 0.0001, globe=StringValue("Earth") - ) + latitude, longitude, 0.0001, globe=Item("Q2")) # earth elif edge_type == QuantityValue: # +70[+60,+80]Q743895 From 384bd9820fb31bd70c1e42cd429c47c58509e37b Mon Sep 17 00:00:00 2001 From: Craig Milo Rogers Date: Wed, 13 May 2020 17:57:25 -0700 Subject: [PATCH 199/278] Use composable optional bools. 
--- kgtk/cli/ifexists.py | 10 ++-- kgtk/cli/ifnotexists.py | 10 ++-- kgtk/cli/join.py | 11 ++-- kgtk/cli/validate.py | 4 +- kgtk/io/kgtkreader.py | 33 ++++------- kgtk/join/ifexists.py | 13 +++-- kgtk/join/kgtkjoiner.py | 19 ++++-- kgtk/utils/argparsehelpers.py | 30 ++++++++++ kgtk/value/kgtkvalueoptions.py | 103 +++++++++++++++------------------ 9 files changed, 134 insertions(+), 99 deletions(-) create mode 100644 kgtk/utils/argparsehelpers.py diff --git a/kgtk/cli/ifexists.py b/kgtk/cli/ifexists.py index fcb4e42c6..4b0d055ad 100644 --- a/kgtk/cli/ifexists.py +++ b/kgtk/cli/ifexists.py @@ -42,13 +42,15 @@ def h(msg: str)->str: parser.add_argument( "input_kgtk_file", nargs="?", help="The KGTK file to filter. May be omitted or '-' for stdin.", type=Path) - parser.add_argument( "--input-keys", "--left-keys", dest="input_keys", help="The key columns in the file being filtered.", nargs='*') + parser.add_argument( "--input-keys", "--left-keys", dest="input_keys", + help="The key columns in the file being filtered (default=None).", nargs='*') - parser.add_argument( "--filter-on", dest="filter_kgtk_file", help="The KGTK file to filter against.", type=Path, required=True) + parser.add_argument( "--filter-on", dest="filter_kgtk_file", help="The KGTK file to filter against (required).", type=Path, required=True) - parser.add_argument( "--filter-keys", "--right-keys", dest="filter_keys", help="The key columns in the filter-on file.", nargs='*') + parser.add_argument( "--filter-keys", "--right-keys", dest="filter_keys", + help="The key columns in the filter-on file (default=None).", nargs='*') - parser.add_argument("-o", "--output-file", dest="output_kgtk_file", help="The KGTK file to write", type=Path, default=None) + parser.add_argument("-o", "--output-file", dest="output_kgtk_file", help="The KGTK file to write (required).", type=Path, default=None) parser.add_argument( "--field-separator", dest="field_separator", help=h("Separator for multifield keys (default=%(default)s)") diff --git a/kgtk/cli/ifnotexists.py b/kgtk/cli/ifnotexists.py index 1ee8f6534..854ef86bd 100644 --- a/kgtk/cli/ifnotexists.py +++ b/kgtk/cli/ifnotexists.py @@ -41,13 +41,15 @@ def h(msg: str)->str: parser.add_argument( "input_kgtk_file", nargs="?", help="The KGTK file to filter. 
May be omitted or '-' for stdin.", type=Path)

- parser.add_argument( "--filter-on", dest="_filter_kgtk_file", help="The KGTK file to filter against.", type=Path, required=True)
+ parser.add_argument( "--input-keys", "--left-keys", dest="input_keys",
+ help="The key columns in the file being filtered (default=None).", nargs='*')

- parser.add_argument("-o", "--output-file", dest="output_kgtk_file", help="The KGTK file to write", type=Path, default=None)
+ parser.add_argument( "--filter-on", dest="_filter_kgtk_file", help="The KGTK file to filter against (required).", type=Path, required=True)

- parser.add_argument( "--input-keys", "--left-keys", dest="input_keys", help="The key columns in the file being filtered.", nargs='*')
+ parser.add_argument( "--filter-keys", "--right-keys", dest="filter_keys",
+ help="The key columns in the filter-on file (default=None).", nargs='*')

- parser.add_argument( "--filter-keys", "--right-keys", dest="filter_keys", help="The key columns in the filter-on file.", nargs='*')
+ parser.add_argument("-o", "--output-file", dest="output_kgtk_file", help="The KGTK file to write (required).", type=Path, default=None)

parser.add_argument( "--field-separator", dest="field_separator",
help=h("Separator for multifield keys"),
diff --git a/kgtk/cli/join.py b/kgtk/cli/join.py
index 0ef4a57e9..8fbed6207 100644
--- a/kgtk/cli/join.py
+++ b/kgtk/cli/join.py
@@ -13,6 +13,7 @@
from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
from kgtk.io.kgtkwriter import KgtkWriter
from kgtk.join.kgtkjoiner import KgtkJoiner
+from kgtk.utils.argparsehelpers import optional_bool
from kgtk.value.kgtkvalueoptions import KgtkValueOptions

def parser():
@@ -71,15 +72,16 @@ def h(msg: str)->str:
parser.add_argument( "--join-on-label", dest="join_on_label",
help="If both input files are edge files, include the label column in the join (default=%(default)s).",
- action='store_true')
+ type=optional_bool, nargs='?', const=True, default=False)

parser.add_argument( "--join-on-node2", dest="join_on_node2",
help="If both input files are edge files, include the node2 column in the join (default=%(default)s).",
- action='store_true')
+ type=optional_bool, nargs='?', const=True, default=False)

parser.add_argument( "--left-file-join-columns", dest="left_join_columns", help="Left file join columns (default=None).", nargs='+')

- parser.add_argument( "--left-join", dest="left_join", help="Perform a left outer join (default=%(default)s).", action='store_true')
+ parser.add_argument( "--left-join", dest="left_join", help="Perform a left outer join (default=%(default)s).",
+ type=optional_bool, nargs='?', const=True, default=False)

parser.add_argument("-o", "--output-file", dest="output_file_path", help="The KGTK file to write (default=%(default)s).", type=Path, default="-")
@@ -88,7 +90,8 @@ def h(msg: str)->str:
parser.add_argument( "--right-file-join-columns", dest="right_join_columns", help="Right file join columns (default=None).", nargs='+')

- parser.add_argument( "--right-join", dest="right_join", help="Perform a right outer join (default=%(default)s).", action='store_true')
+ parser.add_argument( "--right-join", dest="right_join", help="Perform a right outer join (default=%(default)s).",
+ type=optional_bool, nargs='?', const=True, default=False)

parser.add_argument( "--field-separator", dest="field_separator",
help=h("Separator for multifield keys (default=%(default)s)")
diff --git a/kgtk/cli/validate.py b/kgtk/cli/validate.py
index 0282fa144..7d88e2fa2 100644
--- a/kgtk/cli/validate.py
+++
b/kgtk/cli/validate.py
@@ -18,6 +18,7 @@

from kgtk.cli_argparse import KGTKArgumentParser
from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
+from kgtk.utils.argparsehelpers import optional_bool
from kgtk.value.kgtkvalueoptions import KgtkValueOptions

def parser():
@@ -37,7 +38,8 @@ def add_arguments_extended(parser: KGTKArgumentParser, parsed_shared_args: Names
parser.add_argument( "kgtk_files", nargs="*", help="The KGTK file(s) to validate. May be omitted or '-' for stdin.", type=Path)

parser.add_argument( "--header-only", dest="header_only",
- help="Process only the header of the input file.", action="store_true")
+ help="Process only the header of the input file.",
+ type=optional_bool, nargs='?', const=True, default=False)

KgtkReader.add_debug_arguments(parser, expert=_expert)
KgtkReaderOptions.add_arguments(parser, mode_options=True, validate_by_default=True, expert=_expert)
diff --git a/kgtk/io/kgtkreader.py b/kgtk/io/kgtkreader.py
index afdb45c51..9de0bfc8e 100644
--- a/kgtk/io/kgtkreader.py
+++ b/kgtk/io/kgtkreader.py
@@ -30,6 +30,7 @@

from kgtk.kgtkformat import KgtkFormat
from kgtk.io.kgtkbase import KgtkBase
+from kgtk.utils.argparsehelpers import optional_bool
from kgtk.utils.closableiter import ClosableIter, ClosableIterTextIOWrapper
from kgtk.utils.enumnameaction import EnumNameAction
from kgtk.utils.gzipprocess import GunzipProcess
@@ -104,7 +105,8 @@ def add_arguments(cls,
validate_by_default: bool = False,
expert: bool = False,
defaults: bool = True,
- who: str = ""):
+ who: str = "",
+ ):

# This helper function makes it easy to suppress options from
# The help message. The options are still there, and initialize
@@ -121,8 +123,6 @@ def h(msg: str)->str:
# should get default values, while the prefixed arguments should
# not get defaults.
#
- # At the present time, boolean arguments can't use fallbacks.
- #
# Note: In obscure circumstances (EnumNameAction, I'm looking at you),
# explicitly setting "default=None" may fail, whereas omitting the
# "default=" phrase succeeds.
@@ -159,7 +159,7 @@ def d(default: typing.Any)->typing.Mapping[str, typing.Any]: fgroup.add_argument(prefix1 + "gzip-in-parallel", dest=prefix2 + "gzip_in_parallel", help=h(prefix3 + "Execute gzip in parallel (default=%(default)s)."), - action='store_true') + type=optional_bool, nargs='?', const=True, **d(default=False)) fgroup.add_argument(prefix1 + "gzip-queue-size", dest=prefix2 + "gzip_queue_size", @@ -188,7 +188,7 @@ def d(default: typing.Any)->typing.Mapping[str, typing.Any]: hgroup.add_argument(prefix1 + "skip-first-record", dest=prefix2 + "skip_first_record", help=h(prefix3 + "Skip the first record when forcing column names (default=%(default)s)."), - action='store_true') + type=optional_bool, nargs='?', const=True, **d(default=False)) hgroup.add_argument(prefix1 + "unsafe-column-name-action", dest=prefix2 + "unsafe_column_name_action", @@ -201,22 +201,12 @@ def d(default: typing.Any)->typing.Mapping[str, typing.Any]: lgroup.add_argument(prefix1 + "repair-and-validate-lines", dest=prefix2 + "repair_and_validate_lines", help=h(prefix3 + "Repair and validate lines (default=%(default)s)."), - action='store_true', default=validate_by_default) - - lgroup.add_argument(prefix1 + "do-not-repair-and-validate-lines", - dest=prefix2 + "repair_and_validate_lines", - help=h(prefix3 + "Do not repair and validate lines."), - action='store_false') + type=optional_bool, nargs='?', const=True, **d(default=validate_by_default)) lgroup.add_argument(prefix1 + "repair-and-validate-values", dest=prefix2 + "repair_and_validate_values", help=h(prefix3 + "Repair and validate values (default=%(default)s)."), - action='store_true', default=validate_by_default) - - lgroup.add_argument(prefix1 + "do-not-repair-and-validate-values", - dest=prefix2 + "repair-and-validate_values", - help=h(prefix3 + "Do not repair and validate values."), - action='store_false') + type=optional_bool, nargs='?', const=True, **d(default=validate_by_default)) lgroup.add_argument(prefix1 + "blank-required-field-line-action", dest=prefix2 + "blank_required_field_line_action", @@ -236,7 +226,7 @@ def d(default: typing.Any)->typing.Mapping[str, typing.Any]: lgroup.add_argument(prefix1 + "fill-short-lines", dest=prefix2 + "fill_short_lines", help=h(prefix3 + "Fill missing trailing columns in short lines with empty values (default=%(default)s)."), - action='store_true') + type=optional_bool, nargs='?', const=True, **d(default=False)) lgroup.add_argument(prefix1 + "invalid-value-action", dest=prefix2 + "invalid_value_action", @@ -256,7 +246,7 @@ def d(default: typing.Any)->typing.Mapping[str, typing.Any]: lgroup.add_argument(prefix1 + "truncate-long-lines", dest=prefix2 + "truncate_long_lines", help=h(prefix3 + "Remove excess trailing columns in long lines (default=%(default)s)."), - action='store_true') + type=optional_bool, nargs='?', const=True, **d(default=False)) lgroup.add_argument(prefix1 + "whitespace-line-action", dest=prefix2 + "whitespace_line_action", @@ -1050,13 +1040,14 @@ def main(): parser = ArgumentParser() parser.add_argument(dest="kgtk_file", help="The KGTK file to read", type=Path, nargs="?") KgtkReader.add_debug_arguments(parser, expert=True) - parser.add_argument( "--test", dest="test_method", help="The test to perform", + parser.add_argument( "--test", dest="test_method", help="The test to perform (default=%(default)s).", choices=["rows", "concise-rows", "kgtk-values", "concise-kgtk-values", "dicts", "concise-dicts", "kgtk-value-dicts", "concise-kgtk-value-dicts"], default="rows") - parser.add_argument( 
"--test-validate", dest="test_validate", help="Validate KgtkValue objects in test.", action='store_true') + parser.add_argument( "--test-validate", dest="test_validate", help="Validate KgtkValue objects in test (default=%(default)s).", + type=optional_bool, nargs='?', const=True, default=False) KgtkReaderOptions.add_arguments(parser, mode_options=True, validate_by_default=True, expert=True) KgtkValueOptions.add_arguments(parser, expert=True) diff --git a/kgtk/join/ifexists.py b/kgtk/join/ifexists.py index 509639ffc..d25dd6c7b 100644 --- a/kgtk/join/ifexists.py +++ b/kgtk/join/ifexists.py @@ -27,6 +27,7 @@ from kgtk.kgtkformat import KgtkFormat from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions from kgtk.io.kgtkwriter import KgtkWriter +from kgtk.utils.argparsehelpers import optional_bool from kgtk.utils.enumnameaction import EnumNameAction from kgtk.utils.validationaction import ValidationAction from kgtk.value.kgtkvalueoptions import KgtkValueOptions @@ -208,16 +209,18 @@ def main(): parser.add_argument(dest="input_file_path", help="The KGTK file with the input data", type=Path, nargs="?") - parser.add_argument( "--filter-on", dest="filter_file_path", help="The KGTK file with the filter data", type=Path, required=True) + parser.add_argument( "--filter-on", dest="filter_file_path", help="The KGTK file with the filter data (required).", type=Path, required=True) parser.add_argument("-o", "--output-file", dest="output_file_path", help="The KGTK file to write (default=%(default)s).", type=Path, default="-") - parser.add_argument( "--field-separator", dest="field_separator", help="Separator for multifield keys", default=IfExists.FIELD_SEPARATOR_DEFAULT) + parser.add_argument( "--field-separator", dest="field_separator", help="Separator for multifield keys (default=%(default)s)", + default=IfExists.FIELD_SEPARATOR_DEFAULT) - parser.add_argument( "--invert", dest="invert", help="Invert the test (if not exists).", action='store_true') + parser.add_argument( "--invert", dest="invert", help="Invert the test (if not exists) (default=%(default)s).", + type=optional_bool, nargs='?', const=True, default=False) - parser.add_argument( "--input-keys", dest="input_keys", help="The key columns in the input file.", nargs='*') - parser.add_argument( "--filter-keys", dest="filter_keys", help="The key columns in the filter file.", nargs='*') + parser.add_argument( "--input-keys", dest="input_keys", help="The key columns in the input file (default=None).", nargs='*') + parser.add_argument( "--filter-keys", dest="filter_keys", help="The key columns in the filter file (default=None).", nargs='*') KgtkReader.add_debug_arguments(parser) KgtkReaderOptions.add_arguments(parser, mode_options=True, who="input") diff --git a/kgtk/join/kgtkjoiner.py b/kgtk/join/kgtkjoiner.py index 2f32d1281..659738af5 100644 --- a/kgtk/join/kgtkjoiner.py +++ b/kgtk/join/kgtkjoiner.py @@ -16,6 +16,7 @@ from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions from kgtk.io.kgtkwriter import KgtkWriter from kgtk.join.kgtkmergecolumns import KgtkMergeColumns +from kgtk.utils.argparsehelpers import optional_bool from kgtk.value.kgtkvalueoptions import KgtkValueOptions @attr.s(slots=True, frozen=True) @@ -356,15 +357,25 @@ def main(): parser.add_argument(dest="right_file_path", help="The right KGTK file to join", type=Path) parser.add_argument( "--field-separator", dest="field_separator", help="Separator for multifield keys", default=KgtkJoiner.FIELD_SEPARATOR_DEFAULT) - parser.add_argument( "--join-on-label", dest="join_on_label", 
help="If both input files are edge files, include the label column in the join.", action='store_true') - parser.add_argument( "--join-on-node2", dest="join_on_node2", help="If both input files are edge files, include the node2 column in the join.", action='store_true') + parser.add_argument( "--join-on-label", dest="join_on_label", + help="If both input files are edge files, include the label column in the join (default=%(default)s).", + type=optional_bool, nargs='?', const=True, default=False) + + parser.add_argument( "--join-on-node2", dest="join_on_node2", + help="If both input files are edge files, include the node2 column in the join (default=%(default)s).", + type=optional_bool, nargs='?', const=True, default=False) + parser.add_argument( "--left-file-join-columns", dest="left_join_columns", help="Left file join columns.", nargs='+') - parser.add_argument( "--left-join", dest="left_join", help="Perform a left outer join.", action='store_true') + + parser.add_argument( "--left-join", dest="left_join", help="Perform a left outer join (default=%(default)s).", + type=optional_bool, nargs='?', const=True, default=False) parser.add_argument("-o", "--output-file", dest="output_file_path", help="The KGTK file to write", type=Path, default=None) parser.add_argument( "--prefix", dest="prefix", help="An optional prefix applied to right file column names in the output file (default=None).") parser.add_argument( "--right-file-join-columns", dest="right_join_columns", help="Right file join columns.", nargs='+') - parser.add_argument( "--right-join", dest="right_join", help="Perform a right outer join.", action='store_true') + + parser.add_argument( "--right-join", dest="right_join", help="Perform a right outer join (default=%(default)s).", + type=optional_bool, nargs='?', const=True, default=False) KgtkReader.add_debug_arguments(parser, expert=True) KgtkReaderOptions.add_arguments(parser, mode_options=True, who=KgtkJoiner.LEFT, expert=True) diff --git a/kgtk/utils/argparsehelpers.py b/kgtk/utils/argparsehelpers.py new file mode 100644 index 000000000..ba09b8370 --- /dev/null +++ b/kgtk/utils/argparsehelpers.py @@ -0,0 +1,30 @@ +"""This argparse type conversion function implements optional boolean arguments. + +--arg +--arg=True +--arg=False + +and other variations. A default value of None is allowed for fallback +argument composition. + +Sample usage: + +parser.add_argument(prefix1 + "gzip-in-parallel", + dest=prefix2 + "gzip_in_parallel", + help=h(prefix3 + "Execute gzip in parallel (default=%(default)s)."), + type=optional_bool, nargs='?', const=True, **d(default=False)) + +""" + +import typing + +def optional_bool(value)->typing.Optional[bool]: + if value is None: + return None + if isinstance(value, bool): + return value + if value.lower() in {'false', 'f', '0', 'no', 'n'}: + return False + elif value.lower() in {'true', 't', '1', 'yes', 'y'}: + return True + raise ValueError(f'{value} is not a valid boolean value') diff --git a/kgtk/value/kgtkvalueoptions.py b/kgtk/value/kgtkvalueoptions.py index 704048954..b4e483a9e 100644 --- a/kgtk/value/kgtkvalueoptions.py +++ b/kgtk/value/kgtkvalueoptions.py @@ -7,6 +7,8 @@ import sys import typing +from kgtk.utils.argparsehelpers import optional_bool + @attr.s(slots=True, frozen=True) class KgtkValueOptions: """ @@ -65,6 +67,7 @@ def add_arguments(cls, who: str = "", desc: str = ".", expert: bool = False, + defaults: bool = True, ): """Add arguments for KgtkValue option processing. 
@@ -90,89 +93,77 @@ def h(msg: str)->str: else: return SUPPRESS + # This helper function decices whether or not to include defaults + # in argument declarations. If we plan to make arguments with + # prefixes and fallbacks, the fallbacks (the ones without prefixes) + # should get defaults value, while the prefixed arguments should + # not get defaults. + # + # Note: In obscure circumstances (EnumNameAction, I'm looking at you), + # explicitly setting "default=None" may fail, whereas omitting the + # "default=" phrase succeeds. + # + # TODO: continue researching these issues. + def d(default: typing.Any)->typing.Mapping[str, typing.Any]: + if defaults: + return {"default": default} + else: + return { } + vgroup = parser.add_argument_group(h(prefix3 + "Data value parsing"), h("Options controlling the parsing and processing of KGTK data values" + desc)) vgroup.add_argument( prefix1 + "additional-language-codes", dest=prefix2 + "additional_language_codes", help=h(prefix3 + "Additional language codes (default=None)."), nargs="*", default=None) - lsgroup= vgroup.add_mutually_exclusive_group() - lsgroup.add_argument( prefix1 + "allow-language-suffixes", dest=prefix2 + "allow_language_suffixes", + vgroup.add_argument( prefix1 + "allow-language-suffixes", dest=prefix2 + "allow_language_suffixes", help=h(prefix3 + "Allow language identifier suffixes starting with a dash (default=%(default)s)."), - action='store_true', default=True) - - lsgroup.add_argument( prefix1 + "disallow-language-suffixes", dest=prefix2 + "allow_language_suffixes", - help=h(prefix3 + "Disallow language identifier suffixes starting with a dash."), - action='store_false') + type=optional_bool, nargs='?', const=True, **d(default=False)) - laxgroup= vgroup.add_mutually_exclusive_group() - laxgroup.add_argument( prefix1 + "allow-lax-strings", dest=prefix2 + "allow_lax_strings", - help=h(prefix3 + "Do not check if double quotes are backslashed inside strings (default=%(default)s)."), - action='store_true', default=False) - laxgroup.add_argument( prefix1 + "disallow-lax-strings", dest=prefix2 + "allow_lax_strings", - help=h(prefix3 + "Check if double quotes are backslashed inside strings."), - action='store_false') + vgroup.add_argument( prefix1 + "allow-lax-strings", dest=prefix2 + "allow_lax_strings", + help=h(prefix3 + "Do not check if double quotes are backslashed inside strings (default=%(default)s)."), + type=optional_bool, nargs='?', const=True, **d(default=False)) - lqgroup= vgroup.add_mutually_exclusive_group() - lqgroup.add_argument( prefix1 + "allow-lax-lq-strings", dest=prefix2 + "allow_lax_lq_strings", - help=h(prefix3 + "Do not check if single quotes are backslashed inside language qualified strings (default=%(default)s)."), - action='store_true', default=False) + vgroup.add_argument( prefix1 + "allow-lax-lq-strings", dest=prefix2 + "allow_lax_lq_strings", + help=h(prefix3 + "Do not check if single quotes are backslashed inside language qualified strings (default=%(default)s)."), + type=optional_bool, nargs='?', const=True, **d(default=False)) - lqgroup.add_argument( prefix1 + "disallow-lax-lq-strings", dest=prefix2 + "allow_lax_lq_strings", - help=h(prefix3 + "Check if single quotes are backslashed inside language qualified strings."), - action='store_false') + vgroup.add_argument( prefix1 + "allow-month-or-day-zero", dest=prefix2 + "allow_month_or_day_zero", + help=h(prefix3 + "Allow month or day zero in dates (default=%(default)s)."), + type=optional_bool, nargs='?', const=True, **d(default=False)) - amd0group= 
vgroup.add_mutually_exclusive_group() - amd0group.add_argument( prefix1 + "allow-month-or-day-zero", dest=prefix2 + "allow_month_or_day_zero", - help=h(prefix3 + "Allow month or day zero in dates (default=%(default)s)."), - action='store_true', default=False) - - amd0group.add_argument( prefix1 + "disallow-month-or-day-zero", dest=prefix2 + "allow_month_or_day_zero", - help=h(prefix3 + "Allow month or day zero in dates."), - action='store_false') - - rmd0group= vgroup.add_mutually_exclusive_group() - rmd0group.add_argument( prefix1 + "repair-month-or-day-zero", dest=prefix2 + "repair_month_or_day_zero", - help=h(prefix3 + "Repair month or day zero in dates (default=%(default)s)."), - action='store_true', default=False) - - rmd0group.add_argument( prefix1 + "no-repair-month-or-day-zero", dest=prefix2 + "repair_month_or_day_zero", - help=h(prefix3 + "Do not repair month or day zero in dates."), - action='store_false') + vgroup.add_argument( prefix1 + "repair-month-or-day-zero", dest=prefix2 + "repair_month_or_day_zero", + help=h(prefix3 + "Repair month or day zero in dates (default=%(default)s)."), + type=optional_bool, nargs='?', const=True, **d(default=False)) vgroup.add_argument( prefix1 + "minimum-valid-year", dest=prefix2 + "minimum_valid_year", help=h(prefix3 + "The minimum valid year in dates (default=%(default)d)."), - type=int, default=cls.MINIMUM_VALID_YEAR) + type=int, **d(default=cls.MINIMUM_VALID_YEAR)) vgroup.add_argument( prefix1 + "maximum-valid-year", dest=prefix2 + "maximum_valid_year", help=h(prefix3 + "The maximum valid year in dates (default=%(default)d)."), - type=int, default=cls.MAXIMUM_VALID_YEAR) + type=int, **d(default=cls.MAXIMUM_VALID_YEAR)) vgroup.add_argument( prefix1 + "minimum-valid-lat", dest=prefix2 + "minimum_valid_lat", - help=h(prefix3 + "The minimum valid latitude (default=%(default)d)."), - type=int, default=cls.MINIMUM_VALID_LAT) + help=h(prefix3 + "The minimum valid latitude (default=%(default)f)."), + type=int, **d(default=cls.MINIMUM_VALID_LAT)) vgroup.add_argument( prefix1 + "maximum-valid-lat", dest=prefix2 + "maximum_valid_lat", - help=h(prefix3 + "The maximum valid latitude (default=%(default)d)."), - type=int, default=cls.MAXIMUM_VALID_LAT) + help=h(prefix3 + "The maximum valid latitude (default=%(default)f)."), + type=int, **d(default=cls.MAXIMUM_VALID_LAT)) vgroup.add_argument( prefix1 + "minimum-valid-lon", dest=prefix2 + "minimum_valid_lon", - help=h(prefix3 + "The minimum valid longitude (default=%(default)d)."), - type=int, default=cls.MINIMUM_VALID_LON) + help=h(prefix3 + "The minimum valid longitude (default=%(default)f)."), + type=int, **d(default=cls.MINIMUM_VALID_LON)) vgroup.add_argument( prefix1 + "maximum-valid-lon", dest=prefix2 + "maximum_valid_lon", - help=h(prefix3 + "The maximum valid longitude (default=%(default)d)."), - type=int, default=cls.MAXIMUM_VALID_LON) - - elsgroup= vgroup.add_mutually_exclusive_group() - elsgroup.add_argument( prefix1 + "escape-list-separators", dest=prefix2 + "escape_list_separators", - help=h(prefix3 + "Escape all list separators instead of splitting on them (default=%(default)s)."), - action='store_true', default=False) + help=h(prefix3 + "The maximum valid longitude (default=%(default)f)."), + type=int, **d(default=cls.MAXIMUM_VALID_LON)) - elsgroup.add_argument( prefix1 + "no-escape-list-separators", dest=prefix2 + "escape_list_separators", - help=h(prefix3 + "Do not escape list separators."), - action='store_false') + vgroup.add_argument( prefix1 + "escape-list-separators", dest=prefix2 + 
"escape_list_separators", + help=h(prefix3 + "Escape all list separators instead of splitting on them (default=%(default)s)."), + type=optional_bool, nargs='?', const=True, **d(default=False)) @classmethod # Build the value parsing option structure. From 6b32b24776e755c5eef8bff9a3fb1298ceaee991 Mon Sep 17 00:00:00 2001 From: greatyyx Date: Thu, 14 May 2020 11:50:14 -0700 Subject: [PATCH 200/278] create kgtk exception auto handler, apply it in filter --- kgtk/cli/filter.py | 8 ++------ kgtk/exceptions.py | 11 ++++++++++- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/kgtk/cli/filter.py b/kgtk/cli/filter.py index 23fc3232f..5fa18d9d6 100644 --- a/kgtk/cli/filter.py +++ b/kgtk/cli/filter.py @@ -28,9 +28,8 @@ def add_arguments(parser): def run(datatype, pattern, input, subj_col, pred_col, obj_col): # import modules locally - import socket import sh # type: ignore - from kgtk.exceptions import KGTKException + from kgtk.exceptions import kgtk_exception_auto_handler props=[subj_col, pred_col, obj_col] @@ -62,8 +61,5 @@ def prepare_filter(property, prop_pattern): elif not sys.stdin.isatty(): sh.mlr('--%slite' % datatype, 'filter', filter_str, _in=sys.stdin, _out=sys.stdout, _err=sys.stderr) - except sh.SignalException_SIGPIPE: - # handles SIGPIPE, if it raises to upper level, it will cause another error - pass except Exception as e: - raise KGTKException(e) + kgtk_exception_auto_handler(e) diff --git a/kgtk/exceptions.py b/kgtk/exceptions.py index c70e03797..2d614320f 100644 --- a/kgtk/exceptions.py +++ b/kgtk/exceptions.py @@ -1,6 +1,7 @@ import sys import warnings import traceback +import sh class KGTKException(BaseException): @@ -11,6 +12,14 @@ def __init__(self, message): self.message = message +def kgtk_exception_auto_handler(e: Exception): + if isinstance(e, (sh.SignalException_SIGPIPE, BrokenPipeError)): + return + elif isinstance(e, KGTKException): + raise e + raise KGTKException(KGTKException.message + str(e)) + + class KGTKArgumentParseException(KGTKException): # same as https://docs.python.org/3/library/argparse.html#argparse.ArgumentParser.error return_code = 2 @@ -34,7 +43,7 @@ def __call__(self, func, *args, **kwargs): if return_code != 0: warnings.warn('Please raise exception instead of returning non-zero value') return return_code - except BrokenPipeError: + except (sh.SignalException_SIGPIPE, BrokenPipeError): pass except BaseException: type_, exc_val, exc_tb = sys.exc_info() From 8d6e7534254d2ccd5f7bf1047d63f17aec3b23c4 Mon Sep 17 00:00:00 2001 From: Daniel Garijo Date: Thu, 14 May 2020 18:16:59 -0700 Subject: [PATCH 201/278] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 07bfa8bed..480f8a2d1 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# KGTK: Knowledge Graph Toolkit +# KGTK: Knowledge Graph Toolkit [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.3605675.svg)](https://doi.org/10.5281/zenodo.3605675) KGTK is a Python library for easy manipulation with knowledge graphs. It provides a flexible framework that allows chaining of common graph operations, such as: extraction of subgraphs, filtering, computation of graph metrics, validation, cleaning, generating embeddings, and so on. Its principal format is TSV, though we do support a number of other inputs. From f1896aa30b0c7fc7dcd17eaa90140cf0ebfb8d87 Mon Sep 17 00:00:00 2001 From: Daniel Garijo Date: Thu, 14 May 2020 18:27:06 -0700 Subject: [PATCH 202/278] fix doi. 
Initial version of doc --- .readthedocs.yml | 15 ++++++ README.md | 2 +- docs/index.md | 18 +++++++ docs/specification.md | 114 ++++++++++++++++++++++++++++++++++++++++++ mkdocs.yml | 13 +++++ 5 files changed, 161 insertions(+), 1 deletion(-) create mode 100644 .readthedocs.yml create mode 100644 docs/index.md create mode 100644 docs/specification.md create mode 100644 mkdocs.yml diff --git a/.readthedocs.yml b/.readthedocs.yml new file mode 100644 index 000000000..b6cb68e87 --- /dev/null +++ b/.readthedocs.yml @@ -0,0 +1,15 @@ +# Required +version: 2 + +# Build documentation with MkDocs +mkdocs: + configuration: mkdocs.yml + +# Optionally build your docs in additional formats such as PDF and ePub +formats: all + +# Optionally set the version of Python and requirements required to build your docs +python: + version: 3.7 + install: + - requirements: docs/requirements.txt diff --git a/README.md b/README.md index 480f8a2d1..a3ab034d3 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# KGTK: Knowledge Graph Toolkit [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.3605675.svg)](https://doi.org/10.5281/zenodo.3605675) +# KGTK: Knowledge Graph Toolkit I](https://zenodo.org/badge/DOI/10.5281/zenodo.3828069.svg)](https://doi.org/10.5281/zenodo.3828069) KGTK is a Python library for easy manipulation with knowledge graphs. It provides a flexible framework that allows chaining of common graph operations, such as: extraction of subgraphs, filtering, computation of graph metrics, validation, cleaning, generating embeddings, and so on. Its principal format is TSV, though we do support a number of other inputs. diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 000000000..547d6ad25 --- /dev/null +++ b/docs/index.md @@ -0,0 +1,18 @@ +# Knowledge Graph Toolkit (KGTK) + +KGTK is a Python library for easy manipulation with knowledge graphs. It provides a flexible framework that allows chaining of common graph operations, such as: extraction of subgraphs, filtering, computation of graph metrics, validation, cleaning, generating embeddings, and so on. Its principal format is TSV, though we do support a number of other inputs. + +## Features + +* Computation of class instances +* Computation of reachable nodes +* Filtering based on property values +* Removal of columns +* Sorting +* Computation of various embeddings +* Cleaning and validation +* Computation of graph metrics +* Joining and concatenation of graphs +* Manipulation of Wikidata data + + diff --git a/docs/specification.md b/docs/specification.md new file mode 100644 index 000000000..daf864a61 --- /dev/null +++ b/docs/specification.md @@ -0,0 +1,114 @@ +The current mapping between OWL and OpenAPI specification (OAS) supported by OBA can be seen below. + +!!! warning + We are currently working on improving the mapping with complex axiomatization of domains and ranges and other property annotations (minimum and maximum cardinality, etc.) + +**Namespaces** used in this document: + + - owl: [http://www.w3.org/2002/07/owl#](http://www.w3.org/2002/07/owl#) + - rdfs: [http://www.w3.org/2000/01/rdf-schema#](http://www.w3.org/2000/01/rdf-schema#) + - skos: [http://www.w3.org/2004/02/skos/core#](http://www.w3.org/2004/02/skos/core#) + - prov: [http://www.w3.org/ns/prov#](http://www.w3.org/ns/prov#) + +## owl:Class + +Each class in the ontology is associated with two paths for the GET operation, one path for POST, PUT and DELETE operations; and a schema. 
For example, consider the class "Plan" from [http://purl.org/net/p-plan](http://purl.org/net/p-plan). The following GET paths would be generated:
+
+```yaml
+/plans:
+  get:
+    description: Gets a list of all instances of Plan (more information in http://purl.org/net/p-plan#Plan)
+    parameters:
+      {...} #omitted for simplicity
+    responses:
+      200:
+        content:
+          application/json:
+            schema:
+              items:
+                $ref: '#/components/schemas/Plan'
+              type: array
+        description: Successful response - returns an array with the instances of Plan.
+        headers:
+          link:
+            description: Information about pagination
+            schema:
+              type: string
+    summary: List all instances of Plan
+```
+
+```yaml
+/plans/{id}:
+  get:
+    description: Gets the details of a given Plan (more information in http://purl.org/net/p-plan#Plan)
+    parameters:
+      {...} #omitted for simplicity, the response is similar to the one above
+```
+And the following Schema would be generated:
+
+```yaml
+Plan:
+  description: A p-plan:Plan is a specific type of prov:Plan. It is composed of smaller steps that use and produce Variables.
+  properties:
+    wasGeneratedBy:
+      {...} #omitted other properties for simplicity.
+```
+
+### rdfs:subClassOf
+
+Subclasses inherit all properties from their respective superclasses. The OpenAPI specification has the `allOf` clause to indicate this behavior. However, this was not supported by any existing generators until very recently, and therefore OBA will iterate through all superclasses to add the appropriate properties for a given schema.
+
+## owl:ObjectProperty
+
+Each object property is added to its corresponding schema definition that uses it as domain. For example, in the P-Plan ontology, `Plan` has a property `isSubPlanOfPlan` which has domain `Plan`. This would be represented as follows in the OpenAPI specification:
+
+```yaml
+Plan:
+  description: A p-plan:Plan is a specific type of prov:Plan. It is composed of smaller steps that use and produce Variables.
+  properties:
+    isSubPlanOfPlan:
+      description: A p-plan:Plan may be a subplan of another bigger p-plan:Plan. p-plan:isSubPlanOfPlan is used to state the link among the two different plans.
+      items:
+        $ref: '#/components/schemas/Plan'
+      nullable: true
+      type: array
+```
+
+## owl:DataTypeProperty
+
+The mapping is similar to that of an object property, except that no schemas will be used as a reference under the `items` field. For example, consider a `dateCreated` property that indicates when an item is created:
+
+```yaml
+dateCreated:
+  description: Creation date of the item
+  items:
+    type: string
+  nullable: true
+  type: array
+```
+
+### rdfs:domain and rdfs:range
+
+For each object and datatype property, OBA will analyze their `rdfs:domain` and `rdfs:range` to assign the property to the right schema (using `rdfs:domain`) and use the appropriate reference or datatype (by inspecting `rdfs:range`). At the moment, cardinality constraints are not taken into account in this mapping.
+
+## Other important considerations
+
+All properties are `nullable` (i.e., optional) and are returned as a list. This is because, from the development perspective, it is easier to deal with lists (even if they have one element) than having to distinguish whether the object returned is a list or not.
+
+Complex unions and intersections are considered of type `object` instead of a particular schema.
+
+## Class and property documentation
+OBA uses `rdfs:comment`, `skos:definition` and `prov:definition` annotations in the ontology for creating definitions of the classes and properties in OBA.
An example can be seen below:
+
+Example:
+```yaml
+Plan:
+  description: A p-plan:Plan is a specific type of prov:Plan. It is composed of smaller steps that use and produce Variables.
+  properties:
+    isSubPlanOfPlan:
+      description: A p-plan:Plan may be a subplan of another bigger p-plan:Plan. p-plan:isSubPlanOfPlan is used to state the link among the two different plans. Note that if p1 is a p-plan:subPlan of p2, p1 will not necessarily be a step of p2. A multistep will represent p1 in p2, and link to p1 with the p-plan.hasStepDecomposition relationship.
+      {...} #Rest of the schema omitted for brevity
+```
+
diff --git a/mkdocs.yml b/mkdocs.yml
new file mode 100644
index 000000000..f737063fe
--- /dev/null
+++ b/mkdocs.yml
@@ -0,0 +1,13 @@
+site_name: KGTK documentation
+nav:
+    - Home: index.md
+    - KGTK file specification: specification.md
+theme:
+    name: material
+
+markdown_extensions:
+    - admonition
+    - codehilite:
+        guess_lang: false
+    - toc:
+        permalink: true

From 211b7c3f50d5907066b3fe3e53448cd30b3f4cd4 Mon Sep 17 00:00:00 2001
From: GreatYYX
Date: Thu, 14 May 2020 18:59:26 -0700
Subject: [PATCH 203/278] fix doi in readme

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index a3ab034d3..beabf0907 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# KGTK: Knowledge Graph Toolkit I](https://zenodo.org/badge/DOI/10.5281/zenodo.3828069.svg)](https://doi.org/10.5281/zenodo.3828069)
+# KGTK: Knowledge Graph Toolkit [![doi](https://zenodo.org/badge/DOI/10.5281/zenodo.3828069.svg)](https://doi.org/10.5281/zenodo.3828069)

 KGTK is a Python library for easy manipulation with knowledge graphs. It provides a flexible framework that allows chaining of common graph operations, such as: extraction of subgraphs, filtering, computation of graph metrics, validation, cleaning, generating embeddings, and so on. Its principal format is TSV, though we do support a number of other inputs.

@@ -98,4 +98,4 @@ More detailed description of the arguments will be added here promptly.
 ### Developer Instructions

-Please refer to [this](README_dev.md)
\ No newline at end of file
+Please refer to [this](README_dev.md)

From c25ab7aa2d88877a30cb67b45bb18d2e4555743e Mon Sep 17 00:00:00 2001
From: Daniel Garijo
Date: Thu, 14 May 2020 19:19:57 -0700
Subject: [PATCH 204/278] Update specification.md

---
 docs/specification.md | 420 +++++++++++++++++++++++++++++++++---------
 1 file changed, 338 insertions(+), 82 deletions(-)

diff --git a/docs/specification.md b/docs/specification.md
index daf864a61..4b776f6e2 100644
--- a/docs/specification.md
+++ b/docs/specification.md
@@ -1,114 +1,370 @@
-The current mapping between OWL and OpenAPI specification (OAS) supported by OBA can be seen below.
+## KGTK File Format
-!!! warning
-    We are currently working on improving the mapping with complex axiomatization of domains and ranges and other property annotations (minimum and maximum cardinality, etc.)
**Authors:** Hans Chalupsky, Craig Roger, Pedro Szekely

**Version:** 2.0

KGTK uses a text-based, columnar file format that aims to be simple, readable, expressive, yet self-describing and easily generatable and parsable by standard tools. The KGTK file design is focused on being able to represent arbitrary knowledge graphs, but can be used to describe any attributed, labeled or unlabeled hypergraph.

## Basic File Structure

**Encoding**: KGTK files are text files that use UTF-8 encoding for Unicode characters.

**Separator characters**: files are TAB-separated multi-column files; values containing TAB characters need to escape them with the `\t` escape sequence.

**Comments**: lines that begin with a #-sign are treated as comments and will be ignored; lines consisting entirely of whitespace will also be ignored.

**Headers**: the first line of each file is interpreted as a header line which needs to list the names of required and optional columns. Column names must be nonblank and unique within a file. Column names must be symbols. Column names should not contain quoted whitespace.

**Newlines and special characters**: each line ends with an end-of-line character or character sequence (such as CR, LF, or CR LF). Text values that need to contain a newline character can encode it via `\n` and/or `\r`. Other escape sequences mirroring those defined by Python are also supported. Backslash can more generally be used to escape characters with special meaning, for example, `\|` to escape a vertical bar in a values list. Leading and trailing whitespace in values, other than inside quoted strings, is disallowed.

**Columns and null values**: each file can have an arbitrary number of columns; however, the number of columns in each content line has to be constant across the file. Specific required columns are described in more detail below. Undefined values can be specified by the empty string, which is a zero-length field (not the empty quoted string).

**Unordered rows**: records in a KGTK file may appear in any order, and may be reordered freely, without changing their semantic meaning. Duplicate records may be created or removed without changing the semantic content of the file. This means that comments and blank lines appearing in a KGTK source file may be removed by certain processing steps that cannot easily preserve them (e.g.
with a sort or join operation).

## Representing Graphs
KGTK defines knowledge graphs (or more generally any attributed graph or hypergraph) as a set of nodes and a set of edges between those nodes. KGTK represents everything of meaning via an edge. Edges themselves can be attributed by having edges asserted about them; thus, KGTK can in fact represent arbitrary hypergraphs. KGTK intentionally does not distinguish attributes or qualifiers on nodes and edges from full-fledged edges; tools operating on KGTK graphs can instead interpret edges differently if they so desire. In KGTK, everything can be a node, and every node can have any type of edge to any other node.

Nodes are described in one or more node files, and edges in one or more edge files. The resulting graph is built from the union of all loaded files. There is some redundancy of representation between node and edge files. In fact, all graphs can be described with just an edge file, and some graphs can be described with just a node file. However, certain aspects can be described more concisely with a node file and others only with an edge file, so both formats are available for use by an application.

Nodes and edges must have unique IDs; however, IDs can be left implicit and will then be system-generated.

## KGTK Data Types
KGTK represents data via nodes and edges between those nodes. Since edges can themselves serve as nodes, those two sets are not disjoint.

KGTK uses two basic data types to represent nodes and edges: symbols and literals. Symbols are names such as `Node42` or `a90b-bc8f`; literals are numbers or quoted strings, for example, `3.1415` or `"John Doe"`. Both symbols and literals may contain internal whitespace (except for unescaped TABs and newline characters).

There is a third type we call structured (or fancy) literals, which are useful to concisely represent things such as dates or locations. For example, `@043.26193/010.92708` represents the location with latitude `043.26193` and longitude `010.92708`. However, this is just shorthand for a location node with latitude and longitude edges leading to those numeric values.

To allow us to easily specify (and parse) an object type without a verbose type declaration or other complex syntactic structure, we adopt the convention that the first character of a value tells us its data type. The table below lists the different sets of first characters and the data types they correspond to, with some examples.

|First Character|Data Type|Examples|
|---------------|---------|--------|
|0-9, +, -, .|Number|1, 42, 3.14e-10, 0.01, .1, 0xff|
|"|String|"John Doe"|
|^, @, ', !|Structured Literal|^10:30, 'Deutsch'@de|
|otherwise|Symbol|Node42, \0ob1|

Note that in the last symbol example the special meaning of 0 was escaped with the backslash character (which does not itself become part of the symbol's name). Without that escape, the value `0ob1` would be interpreted as an illegal octal numeric value.

## Predefined Names
KGTK comes with a small set of predefined column names and edge labels that either need to be used at certain positions in node or edge files, or that are used by KGTK to translate structured literals into their internal representation. The table below lists those names together with their allowable aliases. Aliases are expensive to process; we may want to define a KGTK file profile that excludes aliases.
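As a rough sketch of what alias support implies for an implementation, the Python snippet below normalizes a header line to the canonical predefined names. This is illustrative only, not KGTK's actual reader code, and the alias sets are the ones listed in the table further down:

```python
# Canonical predefined names and their allowable aliases (see the table below).
# Illustrative only; not KGTK's actual implementation.
PREDEFINED_ALIASES = {
    "id": {"id", "ID"},
    "node1": {"node1", "from", "subject"},
    "node2": {"node2", "to", "object"},
    "label": {"label", "predicate", "relation", "relationship"},
}

def normalize_header(columns):
    """Map aliased column names to canonical names, rejecting a header
    that uses two names from the same set of equivalent names."""
    normalized = []
    for column in columns:
        for canonical, aliases in PREDEFINED_ALIASES.items():
            if column in aliases:
                if canonical in normalized:
                    raise ValueError(f"duplicate column for {canonical!r}: {column!r}")
                normalized.append(canonical)
                break
        else:
            normalized.append(column)  # user-defined column, kept unchanged
    return normalized

print(normalize_header(["ID", "from", "predicate", "to", "source"]))
# ['id', 'node1', 'label', 'node2', 'source']
```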
The presence of ID as an alias for id implies that the predefined names are sensitive to case. We might want to consider making column names insensitive to case, although that can also cause processing inefficiencies.

If a predefined name or allowable alias appears as a column name, no other column name may appear from the same set of equivalent names.

|Predefined Name|Allowable Aliases|Description|
|---------------|-----------------|-----------|
|id|ID|Node and edge IDs|
|node1|from, subject|Start node of an edge|
|node2|to, object|End node of an edge|
|label|predicate, relation, relationship|Node or edge label|
|source||Node or edge provenance|
|text, language||Field values for language-qualified strings|
|magnitude, tolerance, unit||Field values for dimensioned numbers|
|latitude, longitude||Field values for locations|
|year, month, day, hour, minute, second, nanosecond, timezone, precision||Field values for times and dates|

## Edge File Format

The edge file is the core representational structure for KGTK graphs. Everything can be specified in the edge file or files. Node files only provide a different point of view that makes the representation of node-centric information more concise.

Edge files specify the set of edges in a knowledge graph. They have three mandatory columns: node1, label, and node2 (or their aliases). The label might be left blank to represent unlabeled graphs [CMR: I am concerned that blank label values may cause syntactic (not semantic) confusion. I think it would be better to use a special value, such as _.]; however, we will ignore lines with blank node1 or node2 (for us that does not correspond to unknown, just missing). [For processing efficiency, we might want to define an edge file profile that disallows comment lines, blank lines, and lines with blank node1 or node2 values.]

An optional edge ID field can be used to name an edge. All additional columns have a user-defined meaning and are optional. Here is a small example edge file:
```
node1 label node2
N1 rdf:type Person
N1 label "Moe"
N2 rdf:type Person
N2 label "Larry"
N3 rdf:type Person
N3 label "Curly"
N1 brotherOf N3
N1 friendOf N2
N1 friendOf N3
N1 diedAtAge 77
```

This file defines three nodes with types and respective labels (all specified via edges), and some relationships between them. We used an RDF-ish type label with an rdf: namespace prefix here, but there is no requirement for that; any other label could have been used. Similarly, type names such as Person could be prefixed with a namespace or use a full URI. Multiple values, as for N1's friends, can be specified via multiple entries or via a special list syntax described below.

Any symbol or literal can serve as a node ID or label, so another representation for this information would be the following:

```
node1 label node2
"Moe" rdf:type Person
"Larry" rdf:type Person
"Curly" rdf:type Person
"Moe" brotherOf "Curly"
"Moe" friendOf "Larry"
"Moe" friendOf "Curly"
77 "death age of" "Moe"
```

The meaning of a column is defined by its column header, so the order of columns does not matter.
The following would be an equivalent representation of the three node types:

```
label node1 node2
rdf:type "Moe" Person
rdf:type "Larry" Person
rdf:type "Curly" Person
```

Additional columns can be used to specify edges about an edge. For example:

```
node1 label node2 creator source
"Moe" rdf:type Person "Hans" Wikipedia
"Larry" rdf:type Person "Hans" Wikipedia
"Curly" rdf:type Person "Hans" Wikipedia
```

Each edge is uniquely identified by its (node1, label, node2) triple (ignoring the order in which these columns were specified in the file). So, additional values about a particular edge can be added by repeating the edge and listing the value. For example:

```
node1 label node2 creator source
"Moe" rdf:type Person "Hans" Wikipedia
"Larry" rdf:type Person "Hans" Wikipedia
"Curly" rdf:type Person "Hans" Wikipedia
# we repeat the edge triple but only list additional
# values where they apply, other columns are left blank:
"Curly" rdf:type Person IMDB
```

To allow us to use edges in both the node1 and node2 positions of an edge, or to use them as arguments in an explicit node1/label/node2 triple, we can name or alias them via an explicit id column. The names or aliases can then be used as stand-ins for the explicit triple. For example:

```
node1 label node2 creator id
"Moe" rdf:type Person "Hans" E1
"Larry" rdf:type Person "Hans" E2
"Curly" rdf:type Person "Hans" E3
E1 source Wikipedia
E2 source Wikipedia
E3 source Wikipedia
E3 source IMDB
# the first creator edge is equivalent to this one:
E1 creator "Hans"
```
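As a hedged illustration of how the triple-based edge identity and the id alias interact, the following standalone Python sketch (not a KGTK tool; the file contents and helper names are made up for this example) expands the extra columns of a TAB-separated edge file into explicit qualifier edges:

```python
import csv
import io

# A made-up, TAB-separated edge file; KGTK quotes are part of the value,
# so csv quoting is disabled below.
EDGE_FILE = 'node1\tlabel\tnode2\tcreator\tid\n"Moe"\trdf:type\tPerson\t"Hans"\tE1\n'

def expand(text):
    """Yield the core triple of each edge, then one explicit edge per
    extra column, keyed by the id alias (or the triple itself)."""
    reader = csv.DictReader(io.StringIO(text), delimiter="\t", quoting=csv.QUOTE_NONE)
    core = {"node1", "label", "node2", "id"}
    for row in reader:
        triple = (row["node1"], row["label"], row["node2"])
        edge_id = row.get("id") or triple  # unnamed edges fall back to the triple
        yield triple
        for column, value in row.items():
            if column not in core and value:
                yield (edge_id, column, value)

for edge in expand(EDGE_FILE):
    print(edge)
# ('"Moe"', 'rdf:type', 'Person')
# ('E1', 'creator', '"Hans"')
```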
- items: - $ref: '#/components/schemas/Plan' - nullable: true - type: array +Columnar edges can themselves be named via IDs, for example: +``` +node1 label node2 creator id +“Moe” rdf:type Person “Hans” E1 +E1 creator “Hans” E11 ``` -## owl:DataTypeProperty +Note that explicit IDs are simply aliases for the internal edge ID based on the triple, they do not replace that ID, they simply point to it. In future versions of KGTK, we might allow edge IDs that are only unique within a file which is OK since they will point to a global ID based on the edge triple. Since edge IDs are simply aliases, an edge can have multiple IDs defined for it, all pointing to the same triple ID. -Similar mapping to an objec property, except that no schemas will be used as reference under the `items` field. For example, consider a `dateCreated` property that indicates when an item is created: +## Multi-valued Edges +As shown above, multi-valued edges can be represented through separate entries in the edge table. Alternatively, there is a list syntax available using the | separator. For example, here is an alternative way to represent the multiple sources for one of the edges: -```yaml -dateCreated: - description: Creation date of the item - items: - type: string - nullable: true - type: array ``` +node1 label node2 creator source +“Curly” rdf:type Person “Hans” Wikipedia|IMDB +``` + +This representation is equivalent to the following: -### rdfs:domain and rdfs:range +``` +node1 label node2 creator source +“Curly” rdf:type Person “Hans” Wikipedia +“Curly” rdf:type Person IMDB +``` +For value lists care must be taken that individual values must either do not contain vertical bars, or if they do, theythat they must be are escaped by backslash escape syntax. -For each object and datatype property, OBA will analyze their `rdfs:domain` and `rdfs:range` to assign the property in the right schema (using `rdfs:domain`) and use the appropriate reference or datatype (by inspecting `rdfs:range`). At the moment, cardinality constraints are not taken into account in this mapping. +List values will provide a valuable conciseness when records are viewed by humans. However, they may impost complexity on tools that use KGTK files. We may want to define a KGTK profile that excludes list values. +Multiple values are combined without ordering using a set semantics, duplicates will simply be ignored. -## Other important considerations +List values are not allowed in node1, label and node2 columns of the edge table. This simplifies parsing and avoids edge IDs being associated with multiple edges. -All properties are `nullable` (i.e., optional) and are returned as a list. This is because from the development perspective, it is easier to deal with lists (even if they have one element) than having to distinguish whether the object returned is a list or not. +## Unlabeled and Undirected Edges +Even though unusual for knowledge graphs, edges might be unlabeled to represent purely structural information more common in standard graph representations. To represent an unlabeled edge, the label column in the edge file can simply be blank. By default, edges are assumed to be directed from node1 to node2. To represent a blank, undirected edge, the special predicate label _ (underscore) can be used. To represent labeled but undirected edges, the edge label needs to start with an _ (underscore), for example, _brotherOf. +Node File Format +Node files allow a more concise node-centric specification of edges. 
They have one mandatory column for the node ID (using the predefined name or its alias(es)). Lines with blank node IDs are ignored. Node files must not contain a node1 column, in order to distinguish node files from edge files, which may contain an id column. We might want to disallow node2 columns from node files, too. All other columns are optional and specify edges where the identified node is node1. Here is a small example that simply adds labels to our three nodes:

```
id label
N1 "Moe"
N2 "Larry"
N3 "Curly"
```

A minimal version of the nodes file above would only contain the id column (e.g., to communicate a set of nodes to some operation). Here is a more elaborate example adding types, creators and sources:

```
id label rdf:type creator source
N1 "Moe" Person "Hans" Wikipedia
N2 "Larry" Person "Hans" Wikipedia
N3 "Curly" Person "Hans" Wikipedia|IMDB
```

The equivalent edge file for the above looks like this. Note that here the creator and source edges are on nodes and not on edges as in our previous examples:

```
node1 label node2
N1 label "Moe"
N1 rdf:type Person
N1 creator "Hans"
N1 source Wikipedia
N2 label "Larry"
N2 rdf:type Person
N2 creator "Hans"
N2 source Wikipedia
N3 label "Curly"
N3 rdf:type Person
N3 creator "Hans"
N3 source Wikipedia
N3 source IMDB
```

This example illustrates that the node table is simply a slightly more concise, node-centric representation that is most useful for dense edges, that is, edges that have values for most or all nodes.

## Edge Collections and Graphs
KGTK does not have a specific graph type to collect or name sets of edges (different from RDF). Instead, edges can be grouped by linking them to collection nodes using the same edge syntax as used for all other edges. For example, the following edge table assigns the three type edges to the collection Stooges via a graph edge each:

```
node1 label node2 graph
"Moe" rdf:type Person Stooges
"Larry" rdf:type Person Stooges
"Curly" rdf:type Person Stooges
```

There is nothing special about the label graph used for those edges; any other name could have been used (for example, memberOf). The above corresponds to the following explicit edge representation:

```
node1 label node2 id
"Moe" rdf:type Person e1
"Larry" rdf:type Person e2
"Curly" rdf:type Person e3
e1 graph Stooges
e2 graph Stooges
e3 graph Stooges
```

By defining collection or graph membership via explicit edges, edges can be in more than one graph.
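To make the equivalence concrete, here is a small hedged sketch (plain Python, not part of KGTK; the row data and ID naming scheme are assumptions for this example) that rewrites a graph column into the explicit membership-edge representation shown above:

```python
import itertools

# The edge table from the example above, as (node1, label, node2, graph) rows.
ROWS = [
    ('"Moe"', "rdf:type", "Person", "Stooges"),
    ('"Larry"', "rdf:type", "Person", "Stooges"),
    ('"Curly"', "rdf:type", "Person", "Stooges"),
]

# Generate an explicit edge ID (e1, e2, ...) for each base edge, then emit
# one membership edge per graph value, mirroring the explicit table above.
counter = itertools.count(1)
base_edges, membership_edges = [], []
for node1, label, node2, graph in ROWS:
    edge_id = f"e{next(counter)}"
    base_edges.append((node1, label, node2, edge_id))
    # Membership edges get an empty id so every line keeps the same columns.
    membership_edges.append((edge_id, "graph", graph, ""))

print("node1\tlabel\tnode2\tid")
for edge in base_edges + membership_edges:
    print("\t".join(edge))
```

Because membership is stated with ordinary edges, adding a second membership edge (for example, `e1 graph OtherGraph`) places the same edge in two graphs.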
+ +To make it possible to define such membership edges about columnar edges, without having to list all of them explicitly, we introduce a special syntax `*