diff --git a/datamart_isi/__init__.py b/datamart_isi/__init__.py index 3c5f8bb..47f0730 100644 --- a/datamart_isi/__init__.py +++ b/datamart_isi/__init__.py @@ -1,3 +1,4 @@ +name = "datamart_isi" from .entries import DatamartSearchResult, DatasetColumn from pkgutil import extend_path __path__ = extend_path(__path__, __name__) \ No newline at end of file diff --git a/datamart_isi/augment.py b/datamart_isi/augment.py index 5ca8f58..d17bdcc 100644 --- a/datamart_isi/augment.py +++ b/datamart_isi/augment.py @@ -11,6 +11,8 @@ from datamart_isi.utilities import connection from SPARQLWrapper import SPARQLWrapper, JSON, POST, URLENCODED from itertools import chain +from datamart_isi.utilities.geospatial_related import GeospatialRelated +from datamart_isi.cache.wikidata_cache import QueryCache class Augment(object): @@ -28,6 +30,7 @@ def __init__(self) -> None: self.qm.setRequestMethod(URLENCODED) self.profiler = Profiler() self.logger = logging.getLogger(__name__) + self.wikidata_cache_manager = QueryCache() def query_by_sparql(self, query: dict, dataset: pd.DataFrame = None) -> typing.Optional[typing.List[dict]]: """ @@ -50,7 +53,7 @@ def query_by_sparql(self, query: dict, dataset: pd.DataFrame = None) -> typing.O return [] return results else: - print("\n\n[ERROR] No query given, query failed!\n\n") + self.logger.error("No query given, query failed!") return [] def parse_sparql_query(self, json_query, dataset) -> str: @@ -112,11 +115,11 @@ def parse_sparql_query(self, json_query, dataset) -> str: if "variables_search" in json_query.keys() and json_query["variables_search"] != {}: if "temporal_variable" in json_query["variables_search"].keys(): tv = json_query["variables_search"]["temporal_variable"] - TemporalGranularity = {'second': 14, 'minute': 13, 'hour': 12, 'day': 11, 'month': 10, 'year': 9} + temporal_granularity = {'second': 14, 'minute': 13, 'hour': 12, 'day': 11, 'month': 10, 'year': 9} start_date = pd.to_datetime(tv["start"]).isoformat() end_date = pd.to_datetime(tv["end"]).isoformat() - granularity = TemporalGranularity[tv["granularity"]] + granularity = temporal_granularity[tv["granularity"]] spaqrl_query += ''' ?variable pq:C2013 ?time_granularity . ?variable pq:C2011 ?start_time . @@ -125,6 +128,21 @@ def parse_sparql_query(self, json_query, dataset) -> str: FILTER(!((?start_time > "''' + end_date + '''"^^xsd:dateTime) || (?end_time < "''' + start_date + '''"^^xsd:dateTime))) ''' + if "geospatial_variable" in json_query["variables_search"].keys(): + geo_variable = json_query["variables_search"]["geospatial_variable"] + qnodes = self.parse_geospatial_query(geo_variable) + if qnodes: + # find similar dataset from datamart + query_part = " ".join(qnodes) + # query_part = "q1494 q1400 q759 q1649 q1522 q1387 q16551" # COMMENT: for testing + spaqrl_query += ''' + ?variable pq:C2006 [ + bds:search """''' + query_part + '''""" ; + bds:relevance ?score_geo ; + ]. 
+ ''' + bind = "?score_geo" if bind == "" else bind + "+ ?score_geo" + # if "title_search" in json_query.keys() and json_query["title_search"] != '': # query_title = json_query["title_search"] # spaqrl_query += ''' @@ -140,6 +158,40 @@ def parse_sparql_query(self, json_query, dataset) -> str: spaqrl_query += "\n }" + "\n" + ORDER + "\n" + LIMIT return spaqrl_query + + def parse_geospatial_query(self, geo_variable): + geo_gra_dict = {'country': 'Q6256', 'state': 'Q7275', 'city': 'Q515', 'county': 'Q28575', + 'postal_code': 'Q37447'} + qnodes = set() + + # located inside a bounding box + if "latitude1" in geo_variable.keys() and "latitude2" in geo_variable.keys(): + geo1_related = GeospatialRelated(float(geo_variable["latitude1"]), float(geo_variable["longitude1"])) + geo1_related.coordinate_transform() # axis transformation + geo2_related = GeospatialRelated(float(geo_variable["latitude2"]), float(geo_variable["longitude2"])) + geo2_related.coordinate_transform() + # find top left point and bottom right point + top_left_point, botm_right_point = geo1_related.distinguish_two_points(geo2_related) + granularity = geo_gra_dict[geo_variable["granularity"]] + + if top_left_point and botm_right_point: + # get Q nodes located inside a geospatial bounding box from wikidata query + sparql_query = "select distinct ?place where \n{\n ?place wdt:P31/wdt:P279* wd:" + granularity + " .\n" \ + + "SERVICE wikibase:box {\n ?place wdt:P625 ?location .\n" \ + + "bd:serviceParam wikibase:cornerWest " + "\"Point(" + str( + top_left_point[0]) + " " + str(top_left_point[1]) + ")\"^^geo:wktLiteral .\n" \ + + "bd:serviceParam wikibase:cornerEast " + "\"Point(" + str( + botm_right_point[0]) + " " + str(botm_right_point[1]) + ")\"^^geo:wktLiteral .\n}\n" \ + + "SERVICE wikibase:label { bd:serviceParam wikibase:language \"en\" }\n}\n" + results = self.wikidata_cache_manager.get_result(sparql_query) + if results: + for each in results: + value = each["place"]["value"] + value = value.split('/')[-1] + qnodes.add(value) + + return qnodes + # # def query(self, # col: pd.Series = None, diff --git a/datamart_isi/cache/general_search_cache.py b/datamart_isi/cache/general_search_cache.py index 09cbfb0..19ba56d 100644 --- a/datamart_isi/cache/general_search_cache.py +++ b/datamart_isi/cache/general_search_cache.py @@ -75,7 +75,19 @@ def add_to_memcache(self, supplied_dataframe, search_result_serialized, augment_ else: raise ValueError("Unsupport type of supplied_data result as " + str(type(supplied_dataframe)) + "!") - path_to_augment_results = os.path.join(config.cache_file_storage_base_loc, hash_key + ".pkl") + # add supplied data for further updating if needed + try: + search_result_json = json.loads(search_result_serialized) + if "wikifier_choice" in search_result_json: + storage_loc = os.path.join(config.cache_file_storage_base_loc, "wikifier_cache") + else: + storage_loc = os.path.join(config.cache_file_storage_base_loc, "general_search_cache") + except: + storage_loc = os.path.join(config.cache_file_storage_base_loc, "other_cache") + + path_to_supplied_dataframe = os.path.join(storage_loc, str(hash_supplied_dataframe) + ".pkl") + path_to_augment_results = os.path.join(storage_loc, hash_key + ".pkl") + with open(path_to_augment_results, "wb") as f: pickle.dump(augment_results, f) @@ -88,18 +100,6 @@ def add_to_memcache(self, supplied_dataframe, search_result_serialized, augment_ if not response_code2: self._logger.warning("Pushing timestamp failed! 
What happened???") - # add supplied data for further updating if needed - if search_result_serialized: - try: - search_result_json = json.loads(search_result_serialized) - if "wikifier_choice" in search_result_json: - storage_loc = os.path.join(config.cache_file_storage_base_loc, "wikifier_cache") - else: - storage_loc = os.path.join(config.cache_file_storage_base_loc, "general_search_cache") - except: - storage_loc = os.path.join(config.cache_file_storage_base_loc, "other_cache") - - path_to_supplied_dataframe = os.path.join(storage_loc, str(hash_supplied_dataframe) + ".pkl") with open(path_to_supplied_dataframe, "wb") as f: pickle.dump(supplied_dataframe, f) response_code3 = self.mc.set("supplied_data" + hash_key, path_to_supplied_dataframe) diff --git a/datamart_isi/cache/metadata_cache.py b/datamart_isi/cache/metadata_cache.py index 10ebf92..faa58af 100644 --- a/datamart_isi/cache/metadata_cache.py +++ b/datamart_isi/cache/metadata_cache.py @@ -3,18 +3,72 @@ import os import logging import json +import pandas as pd from d3m.container.dataset import D3MDatasetLoader from d3m.container import Dataset as d3m_Dataset from d3m.metadata.base import ALL_ELEMENTS from d3m.base import utils as d3m_utils from datamart_isi.config import cache_file_storage_base_loc +from datamart_isi.config import default_temp_path +DEFAULT_TEMP_PATH = default_temp_path _logger = logging.getLogger(__name__) seed_dataset_store_location = os.path.join(cache_file_storage_base_loc, "datasets_cache") +wikifier_target_cache_exist_mark = "wikifier_target_cache_exist_mark" +if not os.path.exists(seed_dataset_store_location): + print(f'Creating directory: {seed_dataset_store_location}') + os.makedirs(seed_dataset_store_location, exist_ok=True) class MetadataCache: + @staticmethod + def get_hash_key(input_data: pd.DataFrame) -> str: + """ + Function used to get the hash key for dataset cache + :param input_data: + :return: the hash key of the input data + """ + data_columns_list = input_data.columns.tolist() + data_columns_list.sort() + hash_generator = hashlib.md5() + + hash_generator.update(str(data_columns_list).encode('utf-8')) + hash_key = str(hash_generator.hexdigest()) + _logger.debug("Current columns are: " + str(data_columns_list)) + _logger.debug("Current dataset's hash key is: " + hash_key) + return hash_key + + @staticmethod + def save_specific_wikifier_targets(current_dataframe, column_to_p_node_dict, + cache_folder: str = seed_dataset_store_location) -> bool: + hash_key = MetadataCache.get_hash_key(current_dataframe) + # delete previous exist wikification target first + MetadataCache.delete_specific_p_nodes_file(current_dataframe) + file_loc = os.path.join(cache_folder, hash_key + "_metadata") + if os.path.exists(file_loc): + with open(file_loc, "r") as f: + current_dataset_metadata_dict = json.load(f) + else: + current_dataset_metadata_dict = dict() + try: + # add wikifier targets to file + current_dataset_metadata_dict[wikifier_target_cache_exist_mark] = True + for i in range(current_dataframe.shape[1]): + current_column_name = current_dataframe.columns[i] + if current_column_name in column_to_p_node_dict: + current_dataset_metadata_dict[current_column_name + "_wikifier_target"] = column_to_p_node_dict[current_column_name] + + with open(file_loc, "w") as f: + json.dump(current_dataset_metadata_dict, f) + _logger.info("Saving wikifier targets to " + file_loc + " success!") + return True + + except Exception as e: + _logger.error("Saving dataset failed!") + _logger.debug(e, exc_info=True) + return False + 
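
For reference, a minimal round-trip of the wikifier-target cache added above, using a toy dataframe; the column names and the P-node value "P17" are illustrative only, and the sketch assumes the default seed-dataset cache folder created at import time:

    import pandas as pd
    from datamart_isi.cache.metadata_cache import MetadataCache

    df = pd.DataFrame({"country": ["Peru", "Chile"], "year": [2018, 2019]})
    key = MetadataCache.get_hash_key(df)                        # key depends only on the sorted column names
    MetadataCache.save_specific_wikifier_targets(df, {"country": "P17"})
    MetadataCache.get_specific_p_nodes(df)                      # -> {"country": "P17"}; returns None when nothing is cached
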
@staticmethod def check_and_get_dataset_real_metadata(input_dataset: d3m_Dataset, cache_folder: str = seed_dataset_store_location): """ @@ -28,25 +82,27 @@ def check_and_get_dataset_real_metadata(input_dataset: d3m_Dataset, cache_folder res_id, input_dataframe = d3m_utils.get_tabular_resource(dataset=input_dataset, resource_id=None) input_columns = input_dataframe.columns.tolist() input_columns.sort() - hash_generator = hashlib.md5() - hash_generator.update(str(input_columns).encode('utf-8')) - hash_key = str(hash_generator.hexdigest()) + hash_key = MetadataCache.get_hash_key(input_dataframe) _logger.debug("Current columns are: " + str(input_columns)) _logger.debug("Current dataset's hash key is: " + hash_key) try: file_loc = os.path.join(cache_folder, hash_key + "_metadata") if os.path.exists(file_loc): - _logger.info("found exist metadata from seed datasets! Will use that") + with open(file_loc, "r") as f: metadata_info = json.load(f) - _logger.info("The hit dataset id is: " + metadata_info["dataset_id"]) - - for i in range(len(input_columns)): - selector = (res_id, ALL_ELEMENTS, i) - current_column_name = input_dataframe.columns[i] - new_semantic_type = metadata_info[current_column_name] - input_dataset.metadata = input_dataset.metadata.update(selector, {"semantic_types": new_semantic_type}) - return True, input_dataset + if "dataset_id" in metadata_info: + _logger.info("found exist metadata! Will use that") + _logger.info("The hit dataset id is: " + metadata_info["dataset_id"]) + for i in range(len(input_columns)): + selector = (res_id, ALL_ELEMENTS, i) + current_column_name = input_dataframe.columns[i] + new_semantic_type = metadata_info[current_column_name] + input_dataset.metadata = input_dataset.metadata.update(selector, {"semantic_types": new_semantic_type}) + return True, input_dataset + else: + _logger.info("Found file but the file do not contains metadata information for columns") + return False, input_dataset else: _logger.warning("No exist metadata from seed datasets found!") return False, input_dataset @@ -92,20 +148,19 @@ def save_metadata_from_dataset(current_dataset: d3m_Dataset, cache_folder: str = :return: a Bool indicate saving success or not """ try: - current_dataset_metadata_dict = dict() res_id, current_dataframe = d3m_utils.get_tabular_resource(dataset=current_dataset, resource_id=None) + hash_key = MetadataCache.get_hash_key(current_dataframe) + file_loc = os.path.join(cache_folder, hash_key + "_metadata") + if os.path.exists(file_loc): + with open(file_loc, "r") as f: + current_dataset_metadata_dict = json.load(f) + else: + current_dataset_metadata_dict = dict() + current_dataset_metadata_dict["dataset_id"] = current_dataset.metadata.query(())['id'] for i in range(current_dataframe.shape[1]): each_metadata = current_dataset.metadata.query((res_id, ALL_ELEMENTS, i)) current_dataset_metadata_dict[current_dataframe.columns[i]] = each_metadata['semantic_types'] - input_columns = current_dataframe.columns.tolist() - input_columns.sort() - hash_generator = hashlib.md5() - hash_generator.update(str(input_columns).encode('utf-8')) - hash_key = str(hash_generator.hexdigest()) - _logger.debug("Current columns are: " + str(input_columns)) - _logger.debug("Current dataset's hash key is: " + hash_key) - file_loc = os.path.join(cache_folder, hash_key + "_metadata") with open(file_loc, "w") as f: json.dump(current_dataset_metadata_dict, f) _logger.info("Saving " + current_dataset_metadata_dict["dataset_id"] + " to " + file_loc + " success!") @@ -115,3 +170,47 @@ def 
save_metadata_from_dataset(current_dataset: d3m_Dataset, cache_folder: str = _logger.error("Saving dataset failed!") _logger.debug(e, exc_info=True) return False + + @staticmethod + def generate_specific_meta_path(supplied_dataframe, cache_folder: str = seed_dataset_store_location): + hash_key = MetadataCache.get_hash_key(supplied_dataframe) + file_loc = os.path.join(cache_folder, hash_key + "_metadata") + return file_loc + + @staticmethod + def get_specific_p_nodes(supplied_dataframe) -> typing.Optional[dict]: + specific_q_nodes_file = MetadataCache.generate_specific_meta_path(supplied_dataframe) + if os.path.exists(specific_q_nodes_file): + with open(specific_q_nodes_file, 'r') as f: + loaded_metadata = json.load(f) + specific_p_nodes_dict = dict() + # if no mark exist, it means this dataset's wikifier cache not saved, so we should return None + if wikifier_target_cache_exist_mark not in loaded_metadata: + return None + # otherwise, find corresponding wikifier targets + # it is possible to return an empty dict to indicate that no columns can be wikified + for key, value in loaded_metadata.items(): + if key.endswith("_wikifier_target"): + column_name = key[:-16] + specific_p_nodes_dict[column_name] = value + return specific_p_nodes_dict + else: + return None + + @staticmethod + def delete_specific_p_nodes_file(supplied_dataframe): + specific_q_nodes_file = MetadataCache.generate_specific_meta_path(supplied_dataframe) + if os.path.exists(specific_q_nodes_file): + with open(specific_q_nodes_file, "r") as f: + loaded_metadata = json.load(f) + keys_need_to_remove = [] + for key in loaded_metadata.keys(): + if key.endswith("_wikifier_target") or key == wikifier_target_cache_exist_mark: + keys_need_to_remove.append(key) + _logger.debug("Following specific wikifier targets will be removed:" + str(keys_need_to_remove)) + for each_key in keys_need_to_remove: + loaded_metadata.pop(each_key) + + with open(specific_q_nodes_file, "w") as f: + json.dump(loaded_metadata, f) + _logger.info("Delete specific p node files on {} success!".format(specific_q_nodes_file)) diff --git a/datamart_isi/config.py b/datamart_isi/config.py index 3836efe..1cecbf6 100644 --- a/datamart_isi/config.py +++ b/datamart_isi/config.py @@ -1,14 +1,11 @@ from . import config_services import os -import socket -host_name = socket.gethostname() - -if host_name == "dsbox02": - home_dir = "/data00/dsbox/datamart" -else: - home_dir = os.getenv("HOME") +home_dir = os.getenv("HOME") +# in the case that no correct home dir found (e.g. 
in docker) +if home_dir == "/": + home_dir = "/tmp" default_datamart_url = config_services.get_default_datamart_url() @@ -47,6 +44,7 @@ # elastic search to fetch FB embeddings wikidata_uri_template = '' +# em_es_url = "http://kg2018a.isi.edu:9200" # em_es_url = "http://sitaware.isi.edu:9200" # em_es_index = "wiki_fb_embeddings_1" # em_es_type = "vectors" @@ -60,3 +58,5 @@ min_longitude_val = -180 max_latitude_val = 90 min_latitude_val = -90 + +maximum_accept_wikifier_size = 2000000 diff --git a/datamart_isi/config_services.py b/datamart_isi/config_services.py index 8427b66..3a05615 100644 --- a/datamart_isi/config_services.py +++ b/datamart_isi/config_services.py @@ -2,6 +2,7 @@ import os import json +import typing config_file = Path(os.path.join(os.path.dirname(__file__), 'datamart-services.json')) @@ -19,19 +20,24 @@ def _get_service_def(service_name) -> dict: return None -def get_service_url(service_name, as_url=True) -> str: +def get_host_port_path(service_name) -> typing.Tuple[str, int, str]: definition = _get_service_def(service_name) if definition is None: print('get_service_url missing definition: ', service_name) raise ValueError(f'Service name not found: {service_name}') default_host = service_defs['server'].get('default_host', '') - host = service_defs['server'].get('host', default_host) + host = definition.get('host', default_host) if host == '': raise ValueError(f'Host for service {service_name} not defined') - port = definition['port'] + port = int(definition['port']) path = definition.get('path', '') + return (host, port, path) + + +def get_service_url(service_name, as_url=True) -> str: + host, port, path = get_host_port_path(service_name) if as_url: if path: url = f'http://{host}:{port}/{path}' diff --git a/datamart_isi/datamart-services-docker-dsbox01.json b/datamart_isi/datamart-services-docker-dsbox01.json new file mode 100644 index 0000000..913d080 --- /dev/null +++ b/datamart_isi/datamart-services-docker-dsbox01.json @@ -0,0 +1,67 @@ +{ + "server": { + "default_host": "" + }, + "services": [ + { + "name": "isi_datamart", + "host": "datamart", + "port": 9000, + "path": "" + }, + { + "name": "wikidata", + "host": "dsbox02.isi.edu", + "port": 8888, + "path": "bigdata/namespace/wdq/sparql" + }, + { + "name": "wikifier_identifier", + "host": "isi_datamart", + "port": 9000, + "path": "get_identifiers" + }, + { + "name": "wikifier_knowledge_graph", + "host": "minds03.isi.edu", + "port": 8396, + "path": "wikify" + }, + { + "name": "general_search", + "host": "blazegraph_satellite", + "port": 8080, + "path": "bigdata/namespace/datamart3/sparql" + }, + { + "name": "general_search_test", + "host": "blazegraph_satellite", + "port": 8080, + "path": "blazegraph/namespace/datamart4/sparql" + }, + { + "name": "memcached", + "host": "memcached", + "port": 11211, + "path": "" + }, + { + "name": "redis", + "host": "redis", + "port": 6379, + "path": "" + }, + { + "name": "elasticsearch", + "host": "es01", + "port": 9200, + "path": "" + }, + { + "name": "elasticsearch_fb_embeddings", + "host": "kg2018a.isi.edu", + "port": 9200, + "path": "wiki_fb_embeddings_1/vectors" + } + ] +} diff --git a/datamart_isi/datamart-services-docker.json b/datamart_isi/datamart-services-docker.json index 748672d..1881d8a 100644 --- a/datamart_isi/datamart-services-docker.json +++ b/datamart_isi/datamart-services-docker.json @@ -1,67 +1,67 @@ { - "server" : { - "default_host": "" + "server": { + "default_host": "" + }, + "services": [ + { + "name": "isi_datamart", + "host": "datamart", + "port": 9000, + "path": 
"" + }, + { + "name": "wikidata", + "host": "blazegraph", + "port": 8888, + "path": "bigdata/namespace/wdq/sparql" + }, + { + "name": "wikifier_identifier", + "host": "wikifier", + "port": 4444, + "path": "get_identifiers" + }, + { + "name": "wikifier_knowledge_graph", + "host": "wikifier_kg", + "port": 8396, + "path": "wikify" + }, + { + "name": "general_search", + "host": "blazegraph", + "port": 9002, + "path": "blazegraph/namespace/datamart3/sparql" + }, + { + "name": "general_search_test", + "host": "blazegraph", + "port": 9002, + "path": "blazegraph/namespace/datamart4/sparql" + }, + { + "name": "memcached", + "host": "memcached", + "port": 11211, + "path": "" + }, + { + "name": "redis", + "host": "redis", + "port": 6379, + "path": "" + }, + { + "name": "elasticsearch", + "host": "es01", + "port": 9200, + "path": "" + }, + { + "name": "elasticsearch_fb_embeddings", + "host": "kg2018a.isi.edu", + "port": 9200, + "path": "wiki_fb_embeddings_1/vectors" } - "services": [ - { - "name": "isi_datamart", - "host": "datamart", - "port": 9000, - "path": "" - }, - { - "name": "wikidata", - "host": "blazegraph", - "port": 8888, - "path": "bigdata/namespace/wdq/sparql" - }, - { - "name": "wikifier_identifier", - "host": "wikifier", - "port": 4444, - "path": "get_identifiers" - }, - { - "name": "wikifier_knowledge_graph", - "host": "wikifier_kg", - "port": 8396, - "path": "wikify" - }, - { - "name": "general_search", - "host": "blazegraph", - "port": 9002, - "path": "blazegraph/namespace/datamart3/sparql" - }, - { - "name": "general_search_test", - "host": "blazegraph", - "port": 9002, - "path": "blazegraph/namespace/datamart4/sparql" - }, - { - "name": "memcached", - "host": "memcached", - "port": 11211, - "path": "" - }, - { - "name":"redis", - "host": "redis", - "port": 6379, - "path": "" - }, - { - "name": "elasticsearch", - "host": "es01", - "port": 9200, - "path": "" - }, - { - "name": "elasticsearch_fb_embeddings", - "host": "kg2018a.isi.edu", - "port": 9200, - "path": "wiki_fb_embeddings_1/vectors" - } - ] -} + ] +} \ No newline at end of file diff --git a/datamart_isi/datamart-services-dsbox01.json b/datamart_isi/datamart-services-dsbox01.json index f28c88e..b446f9b 100644 --- a/datamart_isi/datamart-services-dsbox01.json +++ b/datamart_isi/datamart-services-dsbox01.json @@ -17,7 +17,7 @@ }, { "name": "wikifier_identifier", - "host": "dsbox02.isi.edu", + "host": "dsbox01.isi.edu", "host-test": "dsbox01.isi.edu", "port": 9000, "path": "get_identifiers" @@ -30,31 +30,32 @@ }, { "name": "general_search", - "host": "blazegraph_satellite", - "port": 8080, + "host": "dsbox01.isi.edu", + "port": 9002, "port_external": 9002, "path": "bigdata/namespace/datamart3/sparql" }, { "name": "general_search_test", - "host": "blazegraph_satellite", - "port": 8080, + "host": "dsbox01.isi.edu", + "port": 9002, + "port_external": 9002, "path": "blazegraph/namespace/datamart4/sparql" }, { "name": "memcached", - "host": "memcached", + "host": "dsbox01.isi.edu", "port": 11211, "path": "" }, { - "name":"redis", - "host": "dsbox02.isi.edu", + "name": "redis", + "host": "dsbox01.isi.edu", "port": 6379, "path": "" }, { - "name": "elasticsearch", + "name": "elasticsearch_not_used", "host": "dsbox02.isi.edu", "port": 9200, "path": "" diff --git a/datamart_isi/datamart-services-server-dsbox01.json b/datamart_isi/datamart-services-server-dsbox01.json new file mode 100644 index 0000000..eda2093 --- /dev/null +++ b/datamart_isi/datamart-services-server-dsbox01.json @@ -0,0 +1,70 @@ +{ + "server" : { + "default_host": "" + 
}, + "services": [ + { + "name": "isi_datamart", + "host": "dsbox01.isi.edu", + "port": 9000, + "path": "" + }, + { + "name": "wikidata", + "host": "dsbox02.isi.edu", + "port": 8888, + "path": "bigdata/namespace/wdq/sparql" + }, + { + "name": "wikifier_identifier", + "host": "isi_datamart", + "host-test": "dsbox01.isi.edu", + "port": 9000, + "path": "get_identifiers" + }, + { + "name": "wikifier_knowledge_graph", + "host": "minds03.isi.edu", + "port": 8396, + "path": "wikify" + }, + { + "name": "general_search", + "host": "blazegraph_satellite", + "port": 8080, + "port_external": 9002, + "path": "bigdata/namespace/datamart3/sparql" + }, + { + "name": "general_search_test", + "host": "blazegraph_satellite", + "port": 8080, + "port_external": 9002, + "path": "blazegraph/namespace/datamart4/sparql" + }, + { + "name": "memcached", + "host": "memcached", + "port": 11211, + "path": "" + }, + { + "name": "redis", + "host": "redis", + "port": 6379, + "path": "" + }, + { + "name": "elasticsearch_not_used", + "host": "dsbox02.isi.edu", + "port": 9200, + "path": "" + }, + { + "name": "elasticsearch_fb_embeddings", + "host": "kg2018a.isi.edu", + "port": 9200, + "path": "wiki_fb_embeddings_1/vectors" + } + ] +} diff --git a/datamart_isi/datamart-services-server-dsbox02.json b/datamart_isi/datamart-services-server-dsbox02.json new file mode 100644 index 0000000..6bc097b --- /dev/null +++ b/datamart_isi/datamart-services-server-dsbox02.json @@ -0,0 +1,74 @@ +{ + "server" : { + "default_host": "" + }, + "services": [ + { + "name": "isi_datamart", + "host": "dsbox02.isi.edu", + "port": 9000, + "path": "" + }, + { + "name": "wikidata", + "host": "dsbox02.isi.edu", + "port": 8888, + "path": "bigdata/namespace/wdq/sparql" + }, + { + "name": "wikifier_identifier", + "host": "dsbox02.isi.edu", + "host_internal": "isi_datamart", + "port": 9000, + "path": "get_identifiers" + }, + { + "name": "wikifier_knowledge_graph", + "host": "minds03.isi.edu", + "port": 8396, + "path": "wikify" + }, + { + "name": "general_search", + "host": "dsbox02.isi.edu", + "host_internal": "blazegraph_satellite", + "port": 8080, + "port_external": 9002, + "path": "bigdata/namespace/datamart3/sparql" + }, + { + "name": "general_search_test", + "host": "dsbox02.isi.edu", + "host_internal": "blazegraph_satellite", + "port": 8080, + "port_external": 9002, + "path": "blazegraph/namespace/datamart4/sparql" + }, + { + "name": "memcached", + "host": "dsbox02.isi.edu", + "host_internal": "memcached", + "port": 11211, + "path": "" + }, + { + "name": "redis", + "host": "dsbox02.isi.edu", + "host_internal": "redis", + "port": 6379, + "path": "" + }, + { + "name": "elasticsearch_not_used", + "host": "dsbox02.isi.edu", + "port": 9200, + "path": "" + }, + { + "name": "elasticsearch_fb_embeddings", + "host": "kg2018a.isi.edu", + "port": 9200, + "path": "wiki_fb_embeddings_1/vectors" + } + ] +} diff --git a/datamart_isi/entries.py b/datamart_isi/entries.py index b48ebc6..42ff19a 100644 --- a/datamart_isi/entries.py +++ b/datamart_isi/entries.py @@ -10,6 +10,7 @@ import string import time from ast import literal_eval +from itertools import combinations from d3m import container from d3m import utils @@ -17,6 +18,7 @@ from d3m.container import Dataset as d3m_Dataset from d3m.base import utils as d3m_utils from d3m.metadata.base import DataMetadata, ALL_ELEMENTS +from collections import defaultdict from datamart import TabularVariable, ColumnRelationship, AugmentSpec from datamart_isi import config @@ -133,6 +135,8 @@ def get_next_page(self, *, limit: 
typing.Optional[int] = 20, timeout: int = None search_res = timeout_call(timeout, self._search_datamart, []) elif self.search_query[self.current_searching_query_index].search_type == "vector": search_res = timeout_call(timeout, self._search_vector, []) + elif self.search_query[self.current_searching_query_index].search_type == "geospatial": + search_res = timeout_call(timeout, self._search_geospatial_data, []) else: raise ValueError("Unknown search query type for " + self.search_query[self.current_searching_query_index].search_type) @@ -324,6 +328,7 @@ def _search_datamart(self) -> typing.List["DatamartSearchResult"]: search_result = [] variables_search = self.search_query[self.current_searching_query_index].variables_search keywords_search = self.search_query[self.current_searching_query_index].keywords_search + # COMMENT: title does not used, may delete later variables, title = dict(), dict() variables_temp = dict() # this temp is specially used to store variable for time query for each_variable in self.search_query[self.current_searching_query_index].variables: @@ -439,6 +444,123 @@ def _search_vector(self) -> typing.List["DatamartSearchResult"]: finally: return vector_results + def _search_geospatial_data(self) -> typing.List["DatamartSearchResult"]: + """ + function used for searching geospatial data + :return: List[DatamartSearchResult] + """ + self._logger.debug("Start searching geospatial data on wikidata and datamart...") + search_results = [] + + if type(self.supplied_data) is d3m_Dataset: + res_id, supplied_dataframe = d3m_utils.get_tabular_resource(dataset=self.supplied_data, resource_id=None) + else: + supplied_dataframe = self.supplied_data + + # try to find possible columns of latitude and longitude + possible_longitude_or_latitude = [] + for each in range(len(supplied_dataframe.columns)): + if type(self.supplied_data) is d3m_Dataset: + selector = (res_id, ALL_ELEMENTS, each) + else: + selector = (ALL_ELEMENTS, each) + each_column_meta = self.supplied_data.metadata.query(selector) + + if "https://metadata.datadrivendiscovery.org/types/Location" in each_column_meta["semantic_types"]: + try: + column_data = supplied_dataframe.iloc[:, each].astype(float).dropna() + if max(column_data) <= config.max_longitude_val and min(column_data) >= config.min_longitude_val: + possible_longitude_or_latitude.append(each) + elif max(column_data) <= config.max_latitude_val and min(column_data) >= config.min_latitude_val: + possible_longitude_or_latitude.append(each) + except: + pass + + if len(possible_longitude_or_latitude) < 2: + self._logger.debug("Supplied dataset does not have geospatial data!") + return search_results + else: + self._logger.debug("Finding columns:" + str(possible_longitude_or_latitude) + " which might be geospatial data columns...") + + possible_la_or_long_comb = list(combinations(possible_longitude_or_latitude, 2)) + for column_index_comb in possible_la_or_long_comb: + latitude_index, longitude_index = -1 , -1 + # try to get the correct latitude and longitude pairs + for each_column_index in column_index_comb: + try: + column_data = supplied_dataframe.iloc[:, each_column_index].astype(float).dropna() + column_name = supplied_dataframe.columns[each_column_index] + + # must be longitude when its min is in [-180, -90), or max is in (90, 180] + if config.max_latitude_val < max(column_data) <= config.max_longitude_val \ + or (config.min_latitude_val > min(column_data) >= config.min_longitude_val): + longitude_index = each_column_index + else: + # determine the type by header 
[latitude, longitude] + if any([True for i in column_name if i in ['a', 'A']]): + latitude_index = each_column_index + elif any([True for i in column_name if i in ['o', 'O', 'g', 'G']]): + longitude_index = each_column_index + + except Exception as e: + self._logger.debug(e, exc_info=True) + self._logger.error("Can't parse location information for column No." + str(each_column_index) + + " with column name " + column_name) + + # search on datamart and wikidata by city qnodes + if latitude_index != -1 and longitude_index != -1: + self._logger.info("Latitude column is: " + str(latitude_index) + " and longitude is: " + str(longitude_index) + "...") + granularity = {'city'} + radius = 100 + + for gran in granularity: + search_variables = {'metadata': { + 'search_result': { + 'latitude_index': latitude_index, + 'longitude_index': longitude_index, + 'radius': radius, + 'granularity': gran + }, + 'search_type': 'geospatial' + }} + # do wikidata query service to find city q-node columns + return_ds = DownloadManager.query_geospatial_wikidata(self.supplied_data, search_variables, self.connection_url) + _, return_df = d3m_utils.get_tabular_resource(dataset=return_ds, resource_id=None) + + if return_df.columns[-1].startswith('Geo_') and return_df.columns[-1].endswith('_wikidata'): + qnodes = return_df.iloc[:, -1] + qnodes_set = list(set(qnodes)) + coverage_score = len(qnodes_set)/len(qnodes) + + # search on datamart + qnodes_str = " ".join(qnodes_set) + variables = [VariableConstraint(key=return_df.columns[-1], values=qnodes_str)] + self.search_query[self.current_searching_query_index].variables = variables + search_res = timeout_call(1800, self._search_datamart, []) + search_results.extend(search_res) + + # search on wikidata + temp_q_nodes_columns = self.q_nodes_columns + self.q_nodes_columns = [-1] + search_res = timeout_call(1800, self._search_wikidata, [None, return_df]) + search_results.extend(search_res) + self.q_nodes_columns = temp_q_nodes_columns + + if search_results: + for each_result in search_results: + # change metadata's score + old_score = each_result.score() + new_score = old_score * coverage_score + each_result.metadata_manager.score = new_score + # change score in datamart_search_result + if "score" in each_result.search_result.keys(): + each_result.search_result["score"]["value"] = new_score + + search_results.sort(key=lambda x: x.score(), reverse=True) + + self._logger.debug("Running search on geospatial data finished.") + return search_results + class Datamart(object): """ @@ -478,13 +600,13 @@ def search(self, query: 'DatamartQuery') -> DatamartQueryCursor: """ return DatamartQueryCursor(augmenter=self.augmenter, search_query=[query], supplied_data=None, - connection_url=self.connection_url) + connection_url=self.connection_url, need_run_wikifier = False) def search_with_data(self, query: 'DatamartQuery', supplied_data: container.Dataset, need_wikidata=True) \ -> DatamartQueryCursor: """ Search using on a query and a supplied dataset. -X + This method is a "smart" search, which leaves the Datamart to determine how to evaluate the relevance of search result with regard to the supplied data. For example, a Datamart may try to identify named entities and date ranges in the supplied data and search for companion datasets which overlap. @@ -495,7 +617,7 @@ def search_with_data(self, query: 'DatamartQuery', supplied_data: container.Data Datamart implementations should return a DatamartQueryCursor immediately. 
Parameters - ------_--- + ---------- query : DatamartQuery Query specification supplied_data : container.Dataset @@ -515,7 +637,7 @@ def search_with_data(self, query: 'DatamartQuery', supplied_data: container.Data need_run_wikifier = False else: need_run_wikifier = None - search_queries = [DatamartQuery(search_type="wikidata"), DatamartQuery(search_type="vector")] + search_queries = [DatamartQuery(search_type="wikidata"), DatamartQuery(search_type="vector"), DatamartQuery(search_type="geospatial")] # try to update with more correct metadata if possible updated_result = MetadataCache.check_and_get_dataset_real_metadata(supplied_data) @@ -613,7 +735,6 @@ def generate_datamart_query_from_data(self, supplied_data: container.Dataset, all_query_variables = [] keywords = [] translator = str.maketrans(string.punctuation, ' ' * len(string.punctuation)) - possible_longitude_or_latitude = list() for each_constraint in data_constraints: for each_column in each_constraint.columns: @@ -657,22 +778,6 @@ def generate_datamart_query_from_data(self, supplied_data: container.Dataset, + " with column name " + supplied_data[each_column_res_id].columns[each_column_index]) treat_as_a_text_column = True - - # geospacial type data - elif "https://metadata.datadrivendiscovery.org/types/Location" in each_column_meta["semantic_types"]: - try: - column_data = supplied_data[each_column_res_id].iloc[:, each_column_index].astype(float).dropna() - if max(column_data) <= config.max_longitude_val and min(column_data) >= config.min_longitude_val: - possible_longitude_or_latitude.append(each_column_index) - elif max(column_data) <= config.max_latitude_val and min(column_data) >= config.min_latitude_val: - possible_longitude_or_latitude.append(each_column_index) - - except Exception as e: - self._logger.debug(e, exc_info=True) - self._logger.error("Can't parse location information for column No." 
+ str(each_column_index) - + " with column name " + supplied_data[each_column_res_id].columns[each_column_index]) - treat_as_a_text_column = True - # for some special condition (DA_medical_malpractice), a column could have a DateTime tag but unable to be parsed # in such condition, we should search and treat it as a Text column then if 'http://schema.org/Text' in each_column_meta["semantic_types"] or treat_as_a_text_column: @@ -917,6 +1022,35 @@ def _download_general(self, run_wikifier) -> pd.DataFrame: self._logger.debug("download_general function finished.") return return_result + def _dummy_download_wikidata(self) -> pd.DataFrame: + """ + This function only should be used when the wikidata column on the search result is not found on supplied data + This function will append same amount of blank columns to ensure the augmented data's column number and column names + are same as normal condition + :return: a DataFrame + """ + # TODO: check if this can help to prevent fail on some corner case + self._logger.warning("Adding empty wikidata columns!") + p_nodes_needed = self.search_result["p_nodes_needed"] + target_q_node_column_name = self.search_result["target_q_node_column_name"] + specific_p_nodes_record = MetadataCache.get_specific_p_nodes(self.supplied_dataframe) + columns_need_to_add = [] + # if specific_p_nodes_record is not None: + # for each_column in self.supplied_dataframe.columns: + # # if we find that this column should be wikified but not exist in supplied dataframe + # if each_column in specific_p_nodes_record and each_column + "_wikidata" not in self.supplied_dataframe.columns: + # columns_need_to_add.append(each_column + "_wikidata") + for each_p_node in p_nodes_needed: + each_p_node_name = Utils.get_node_name(each_p_node) + columns_need_to_add.append(target_q_node_column_name + "_" + each_p_node_name) + columns_need_to_add.append("joining_pairs") + + dummy_result = copy.copy(self.supplied_dataframe) + for each_column in columns_need_to_add: + dummy_result[each_column] = "" + + return dummy_result + def _download_wikidata(self) -> pd.DataFrame: """ :return: return_df: the materialized wikidata d3m_DataFrame, @@ -930,8 +1064,13 @@ def _download_wikidata(self) -> pd.DataFrame: try: q_node_column_number = self.supplied_dataframe.columns.tolist().index(target_q_node_column_name) except ValueError: - raise ValueError("Could not find corresponding q node column for " + target_q_node_column_name + - ". Maybe use the wrong search results?") + q_node_column_number = None + self._logger.error("Could not find corresponding q node column for " + target_q_node_column_name + + ". 
It is possible that using wrong supplied data or wikified wrong columns before") + + if not q_node_column_number: + return self._dummy_download_wikidata() + q_nodes_list = set(self.supplied_dataframe.iloc[:, q_node_column_number].tolist()) q_nodes_list = list(q_nodes_list) q_nodes_list.sort() @@ -1040,7 +1179,7 @@ def _run_wikifier(self, supplied_data) -> d3m_Dataset: """ self._logger.debug("Start running wikifier.") # here because this part's code if for augment, we already have cache for that - results = d3m_wikifier.run_wikifier(supplied_data=supplied_data, use_cache=False) + results = d3m_wikifier.run_wikifier(supplied_data=supplied_data, use_cache=True) self._logger.debug("Running wikifier finished.") return results @@ -1093,8 +1232,11 @@ def augment(self, supplied_data, augment_columns=None, connection_url=None, augm search_result_serialized=self.serialize()) cache_result = self.general_search_cache_manager.get_cache_results(cache_key) if cache_result is not None: + if type(cache_result) is string and cache_result == "failed": + self._logger.warning("This augment was failed last time!") self._logger.info("Using caching results") return cache_result + except Exception as e: cache_key = None self._logger.error("Some error happened when getting results from cache!") @@ -1102,24 +1244,37 @@ def augment(self, supplied_data, augment_columns=None, connection_url=None, augm self._logger.info("Cache not hit, start running augment.") - if self.search_type == "wikifier": - res = self._run_wikifier(supplied_data) + try: + if self.search_type == "wikifier": + res = timeout_call(1800, self._run_wikifier, [supplied_data]) + # res = self._run_wikifier(supplied_data) - else: - if type(supplied_data) is d3m_DataFrame: - res = self._augment(supplied_data=supplied_data, augment_columns=augment_columns, generate_metadata=True, - return_format="df", augment_resource_id=augment_resource_id) - elif type(supplied_data) is d3m_Dataset: - res = self._augment(supplied_data=supplied_data, augment_columns=augment_columns, generate_metadata=True, - return_format="ds", augment_resource_id=augment_resource_id) else: - raise ValueError("Unknown input type for supplied data as: " + str(type(supplied_data))) + if type(supplied_data) is d3m_DataFrame: + res = timeout_call(1800, self._augment, [supplied_data, augment_columns, True, "df", augment_resource_id]) + + # res = self._augment(supplied_data=supplied_data, augment_columns=augment_columns, generate_metadata=True, + # return_format="df", augment_resource_id=augment_resource_id) + elif type(supplied_data) is d3m_Dataset: + res = timeout_call(1800, self._augment, [supplied_data, augment_columns, True, "ds", augment_resource_id]) + # res = self._augment(supplied_data=supplied_data, augment_columns=augment_columns, generate_metadata=True, + # return_format="ds", augment_resource_id=augment_resource_id) + else: + raise ValueError("Unknown input type for supplied data as: " + str(type(supplied_data))) - # sometime the index will be not continuous after augment, need to reset to ensure the index is continuous - res[augment_resource_id].reset_index(drop=True) + if res is not None: + # sometime the index will be not continuous after augment, need to reset to ensure the index is continuous + res[augment_resource_id].reset_index(drop=True) - res[augment_resource_id].fillna('', inplace=True) - res[augment_resource_id] = res[augment_resource_id].astype(str) + res[augment_resource_id].fillna('', inplace=True) + res[augment_resource_id] = res[augment_resource_id].astype(str) + 
else: + res = "failed" + + except Exception as e: + self._logger.error("Augment failed!") + self._logger.debug(e, exc_info=True) + res = "failed" # should not cache wikifier results here, as we already cached it in wikifier part # and we don't know if the wikifier success or not here @@ -1130,7 +1285,8 @@ def augment(self, supplied_data, augment_columns=None, connection_url=None, augm hash_key=cache_key ) # save the augmented result's metadata if second augment is conducted - MetadataCache.save_metadata_from_dataset(res) + if type(res) is not string and res != "failed": + MetadataCache.save_metadata_from_dataset(res) if not response: self._logger.warning("Push augment results to results failed!") else: @@ -1167,60 +1323,90 @@ def _augment(self, supplied_data, augment_columns=None, generate_metadata=True, df_dict = dict() start = time.time() columns_new = None + left_pairs = defaultdict(list) + right_pairs = defaultdict(list) + for r1, r2 in self.pairs: - i += 1 - r1_int = int(r1) - if r1_int in r1_paired: - continue - r1_paired.add(r1_int) - left_res = supplied_data_df.loc[r1_int] - right_res = download_result.loc[int(r2)] - if column_names_to_join is None: - column_names_to_join = right_res.index.difference(left_res.index) - if self.search_type == "general": - # only for general search condition, we should remove the target join columns - right_join_column_name = self.search_result['variableName']['value'] - if right_join_column_name in column_names_to_join: - column_names_to_join = column_names_to_join.drop(right_join_column_name) - # if specified augment columns given, only append these columns - if augment_columns: - augment_columns_with_column_names = [] - max_length = self.d3m_metadata.query((ALL_ELEMENTS,))['dimension']['length'] - for each in augment_columns: - if each.column_index < max_length: - each_column_meta = self.d3m_metadata.query((ALL_ELEMENTS, each.column_index)) - augment_columns_with_column_names.append(each_column_meta["name"]) - else: - self._logger.error("Index out of range, will ignore: " + str(each.column_index)) - column_names_to_join = column_names_to_join.intersection(augment_columns_with_column_names) - - columns_new = left_res.index.tolist() - columns_new.extend(column_names_to_join.tolist()) - dcit_right = right_res[column_names_to_join].to_dict() - dict_left = left_res.to_dict() - dcit_right.update(dict_left) - df_dict[i] = dcit_right - - df_joined = pd.DataFrame.from_dict(df_dict, "index") - # add up the rows don't have pairs - unpaired_rows = set(range(supplied_data_df.shape[0])) - r1_paired - if len(unpaired_rows) > 0: - unpaired_rows_list = [i for i in unpaired_rows] - df_joined = df_joined.append(supplied_data_df.iloc[unpaired_rows_list, :], ignore_index=True) - - # ensure that the original dataframe columns are at the first left part - if columns_new is not None: - df_joined = df_joined[columns_new] + left_pairs[int(r1)].append(int(r2)) + right_pairs[int(r2)].append(int(r1)) + + max_v1 = 0 + max_v2 = 0 + for k, v in left_pairs.items(): + if len(v) > max_v1: + max_v1 = len(v) + + for k, v in right_pairs.items(): + if len(v) > max_v2: + max_v2 = len(v) + + maximum_accept_duplicate_amount = self.supplied_data['learningData'].shape[0] / 20 + self._logger.info("Maximum accept duplicate amount is: " + str(maximum_accept_duplicate_amount)) + self._logger.info("duplicate amount for left is: " + str(max_v1)) + self._logger.info("duplicate amount for right is: " + str(max_v2)) + + if max_v1 >= maximum_accept_duplicate_amount and max_v2 >= 
maximum_accept_duplicate_amount: + # if n_to_m_condition + self._logger.error("Could not augment for n-m relationship.") + df_joined = supplied_data_df + else: - self._logger.error("Attention! It seems augment do not add any extra columns!") + for r1, r2 in self.pairs: + i += 1 + r1_int = int(r1) + if r1_int in r1_paired: + continue + r1_paired.add(r1_int) + left_res = supplied_data_df.loc[r1_int] + right_res = download_result.loc[int(r2)] + if column_names_to_join is None: + column_names_to_join = right_res.index.difference(left_res.index) + if self.search_type == "general": + # only for general search condition, we should remove the target join columns + right_join_column_name = self.search_result['variableName']['value'] + if right_join_column_name in column_names_to_join: + column_names_to_join = column_names_to_join.drop(right_join_column_name) + # if specified augment columns given, only append these columns + if augment_columns: + augment_columns_with_column_names = [] + max_length = self.d3m_metadata.query((ALL_ELEMENTS,))['dimension']['length'] + for each in augment_columns: + if each.column_index < max_length: + each_column_meta = self.d3m_metadata.query((ALL_ELEMENTS, each.column_index)) + augment_columns_with_column_names.append(each_column_meta["name"]) + else: + self._logger.error("Index out of range, will ignore: " + str(each.column_index)) + column_names_to_join = column_names_to_join.intersection(augment_columns_with_column_names) + + columns_new = left_res.index.tolist() + columns_new.extend(column_names_to_join.tolist()) + dcit_right = right_res[column_names_to_join].to_dict() + dict_left = left_res.to_dict() + dcit_right.update(dict_left) + df_dict[i] = dcit_right + + df_joined = pd.DataFrame.from_dict(df_dict, "index") + # add up the rows don't have pairs + unpaired_rows = set(range(supplied_data_df.shape[0])) - r1_paired + if len(unpaired_rows) > 0: + unpaired_rows_list = [i for i in unpaired_rows] + df_joined = df_joined.append(supplied_data_df.iloc[unpaired_rows_list, :], ignore_index=True) + + # ensure that the original dataframe columns are at the first left part + if columns_new is not None: + df_joined = df_joined[columns_new] + else: + self._logger.error("Attention! 
It seems augment do not add any extra columns!") - # if search with wikidata, we should remove duplicate Q node column - self._logger.info("Join finished, totally take " + str(time.time() - start) + " seconds.") + # if search with wikidata, we should remove duplicate Q node column + self._logger.info("Join finished, totally take " + str(time.time() - start) + " seconds.") + # END augment part if 'q_node' in df_joined.columns: df_joined = df_joined.drop(columns=['q_node']) if 'id' in df_joined.columns: + df_joined = df_joined.sort_values(by=['id']) df_joined = df_joined.drop(columns=['id']) # start adding column metadata for dataset diff --git a/datamart_isi/profilers/basic_profiler.py b/datamart_isi/profilers/basic_profiler.py index 32813e3..aa3eacc 100644 --- a/datamart_isi/profilers/basic_profiler.py +++ b/datamart_isi/profilers/basic_profiler.py @@ -53,16 +53,40 @@ def profile_semantic_type(column: pd.Series) -> typing.List: # TODO: we need to check str is text or categorical here # when to use "https://metadata.datadrivendiscovery.org/types/CategoricalData" semantic_types = ["https://metadata.datadrivendiscovery.org/types/Attribute"] - if column.dtype.name == "object": - semantic_types.append("http://schema.org/Text") - elif "float" in column.dtype.name: - semantic_types.append("http://schema.org/Float") - elif "int" in column.dtype.name: + if BasicProfiler.is_int_type(column): semantic_types.append("http://schema.org/Integer") - elif "datetime" in column.dtype.name: + elif BasicProfiler.is_float_type(column): + semantic_types.append("http://schema.org/Float") + elif BasicProfiler.is_datetime_type(column): semantic_types.append("http://schema.org/DateTime") + else: + semantic_types.append("http://schema.org/Text") return semantic_types + @staticmethod + def is_int_type(column): + try: + column.astype(int) + return True + except Exception as e: + return False + + @staticmethod + def is_float_type(column): + try: + column.astype(float) + return True + except Exception as e: + return False + + @staticmethod + def is_datetime_type(column): + try: + pd.to_datetime(column) + return True + except Exception as e: + return False + @staticmethod def profile_temporal_coverage(column: pd.Series, coverage: dict = None) -> typing.Union[dict, bool]: """Profiling this temporal column . 
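
With this change the profiler decides a column's semantic type by attempting a cast instead of inspecting the dtype name, so numeric or datetime data stored as strings is typed correctly. A minimal illustration of the new helpers, with made-up example values:

    import pandas as pd
    from datamart_isi.profilers.basic_profiler import BasicProfiler

    BasicProfiler.is_int_type(pd.Series(["1", "2", "3"]))       # True: every value casts cleanly to int
    BasicProfiler.is_float_type(pd.Series(["1.5", "abc"]))      # False: "abc" cannot be cast to float
    BasicProfiler.is_datetime_type(pd.Series(["2019-01-01"]))   # True: parseable by pd.to_datetime
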
diff --git a/datamart_isi/utilities/connection.py b/datamart_isi/utilities/connection.py index 24b066e..eeaf362 100644 --- a/datamart_isi/utilities/connection.py +++ b/datamart_isi/utilities/connection.py @@ -1,3 +1,5 @@ +import typing + from datamart_isi import config_services @@ -27,3 +29,8 @@ def get_es_fb_embedding_server_url() -> str: def get_general_search_test_server_url() -> str: return config_services.get_service_url('general_search') + + +def get_redis_host_port() -> typing.Tuple[str, int]: + host, port, _ = config_services.get_host_port_path('redis') + return (host, port) diff --git a/datamart_isi/utilities/d3m_metadata.py b/datamart_isi/utilities/d3m_metadata.py index a908b26..78773a8 100644 --- a/datamart_isi/utilities/d3m_metadata.py +++ b/datamart_isi/utilities/d3m_metadata.py @@ -659,16 +659,23 @@ def generate_metadata_for_augment_result(self, df_joined, return_format, supplie "semantic_types": ("http://schema.org/Text", "https://metadata.datadrivendiscovery.org/types/Attribute",), } + if current_column_name.endswith("_wikidata"): - data = list(filter(None, df_joined.iloc[:, i].dropna().tolist())) - if all(re.match(r'^Q\d+$', x) for x in data): - new_metadata_i["semantic_types"] = ("http://schema.org/Text", + # add vector semantic type here + if current_column_name.startswith("vector_"): + new_metadata_i["semantic_types"] = ("http://schema.org/Float", "https://metadata.datadrivendiscovery.org/types/Attribute", - q_node_semantic_type ) + else: + data = list(filter(None, df_joined.iloc[:, i].astype(str).dropna())) + if all(re.match(r'^Q\d+$', x) for x in data): + new_metadata_i["semantic_types"] = ("http://schema.org/Text", + "https://metadata.datadrivendiscovery.org/types/Attribute", + q_node_semantic_type + ) else: - self._logger.error("Please check!") - self._logger.error("No metadata found for column No." + str(i) + "with name " + current_column_name) + self._logger.warning("Please check!") + self._logger.warning("No metadata found for column No." + str(i) + "with name " + current_column_name) metadata_new = metadata_new.update(each_selector, new_metadata_i) return_result = None diff --git a/datamart_isi/utilities/d3m_wikifier.py b/datamart_isi/utilities/d3m_wikifier.py index 41124aa..e943f98 100644 --- a/datamart_isi/utilities/d3m_wikifier.py +++ b/datamart_isi/utilities/d3m_wikifier.py @@ -12,8 +12,10 @@ from d3m.container import Dataset as d3m_Dataset from d3m.container import DataFrame as d3m_DataFrame from d3m.metadata.base import ALL_ELEMENTS +from datamart_isi.cache.metadata_cache import MetadataCache from datamart_isi import config from os import path +from pandas.util import hash_pandas_object Q_NODE_SEMANTIC_TYPE = config.q_node_semantic_type DEFAULT_TEMP_PATH = config.default_temp_path @@ -32,14 +34,23 @@ def run_wikifier(supplied_data: d3m_Dataset, use_cache=True): output_ds = copy.copy(supplied_data) need_column_type = config.need_wikifier_column_type_list res_id, supplied_dataframe = d3m_utils.get_tabular_resource(dataset=supplied_data, resource_id=None) - specific_p_nodes = get_specific_p_nodes(supplied_dataframe) - if specific_p_nodes: + specific_p_nodes = MetadataCache.get_specific_p_nodes(supplied_dataframe) + if use_cache and specific_p_nodes is not None: target_columns = list() _logger.info("Get specific column<->p_nodes relationship from previous TRAIN run. 
Will only wikifier those columns!") _logger.info(str(specific_p_nodes)) for i, each_column_name in enumerate(supplied_dataframe.columns.tolist()): if each_column_name in specific_p_nodes: - target_columns.append(i) + # double check whether this column should be wikified + each_column_semantic_type = supplied_data.metadata.query((res_id, ALL_ELEMENTS, i))['semantic_types'] + _logger.debug("column No." + str(i) + "'s semantic type is " + str(each_column_semantic_type)) + skip_column_type = config.skip_wikifier_column_type_list + if set(each_column_semantic_type).intersection(skip_column_type): + _logger.warning("Detect the column semantic type should not be wikified on column No." + str(i) + + ": " + each_column_name + "! Will remove this column from wikifier target columns") + continue + else: + target_columns.append(i) else: # if specific p nodes not given, try to find possible candidate p nodes columns @@ -67,13 +78,13 @@ def run_wikifier(supplied_data: d3m_Dataset, use_cache=True): temp.remove(each) target_columns = temp - if target_columns is None: + if len(target_columns) == 0: + _logger.info("No columns found need to be wikified!") return supplied_data _logger.debug("The target columns need to be wikified are: " + str(target_columns)) # here because this function is called from augment part, so this part - wikifier_res = wikifier.produce(inputs=pd.DataFrame(supplied_dataframe), target_columns=target_columns, - target_p_nodes=specific_p_nodes, use_cache=use_cache) + wikifier_res = wikifier.produce(inputs=pd.DataFrame(supplied_dataframe), target_columns=target_columns, target_p_nodes=specific_p_nodes, use_cache = use_cache) output_ds[res_id] = d3m_DataFrame(wikifier_res, generate_metadata=False) # update metadata on column length selector = (res_id, ALL_ELEMENTS) @@ -104,36 +115,6 @@ def run_wikifier(supplied_data: d3m_Dataset, use_cache=True): return supplied_data -def generate_specific_meta_path(supplied_dataframe): - columns_list = supplied_dataframe.columns.tolist() - columns_list.sort() - hash_generator = hashlib.md5() - hash_generator.update(str(columns_list).encode('utf-8')) - hash_key = str(hash_generator.hexdigest()) - temp_path = os.getenv('D3MLOCALDIR', DEFAULT_TEMP_PATH) - specific_q_nodes_file = os.path.join(temp_path, hash_key + "_column_to_P_nodes") - _logger.debug("Current dataset cache searching path is: " + temp_path) - _logger.debug("Current columns are: " + str(columns_list)) - _logger.debug("Current dataset's hash key is: " + hash_key) - return specific_q_nodes_file - - -def get_specific_p_nodes(supplied_dataframe) -> typing.Optional[list]: - specific_q_nodes_file = generate_specific_meta_path(supplied_dataframe) - if path.exists(specific_q_nodes_file): - with open(specific_q_nodes_file, 'r') as f: - res = json.load(f) - return res - else: - return None - - -def delete_specific_p_nodes_file(supplied_dataframe): - specific_q_nodes_file = generate_specific_meta_path(supplied_dataframe) - if path.exists(specific_q_nodes_file): - os.remove(specific_q_nodes_file) - - def check_and_correct_q_nodes_semantic_type(input): """ Function used to detect whether a dataset or a dataframe already contains q nodes columns or not @@ -160,7 +141,7 @@ def check_and_correct_q_nodes_semantic_type(input): each_metadata = input.metadata.query(selector) if Q_NODE_SEMANTIC_TYPE in each_metadata['semantic_types']: - _logger.info("Q nodes columns found in input data, will not run wikifier.") + _logger.debug("Q nodes semantic type found in column No.{}, will not run wikifier.".format(str(i))) 
find_q_node_columns = True elif 'http://schema.org/Text' in each_metadata["semantic_types"]: @@ -172,30 +153,50 @@ def check_and_correct_q_nodes_semantic_type(input): 'https://metadata.datadrivendiscovery.org/types/Attribute', Q_NODE_SEMANTIC_TYPE) }) - _logger.info("Q nodes columns found in input data, will not run wikifier.") + _logger.debug("Q nodes format data found in column No.{}, will not run wikifier.".format(str(i))) find_q_node_columns = True return find_q_node_columns, input -# def save_specific_p_nodes(original_dataframe, wikifiered_dataframe) -> bool: -# try: -# original_columns_list = set(original_dataframe.columns.tolist()) -# wikifiered_columns_list = set(wikifiered_dataframe.columns.tolist()) -# p_nodes_list = list(wikifiered_columns_list - original_columns_list) -# p_nodes_list.sort() -# p_nodes_str = ",".join(p_nodes_list) -# -# hash_generator = hashlib.md5() -# hash_generator.update(str(p_nodes_str).encode('utf-8')) -# hash_key = str(hash_generator.hexdigest()) -# temp_path = os.getenv('D3MLOCALDIR', DEFAULT_TEMP_PATH) -# specific_q_nodes_file = os.path.join(temp_path, hash_key) -# if path.exists(specific_q_nodes_file): -# _logger.warning("The specific p nodes file already exist! Will replace the old one!") -# -# with open(specific_q_nodes_file, 'w') as f: -# f.write(p_nodes_str) -# return True -# -# except Exception as e: -# _logger.debug(e, exc_info=True) -# return False + + +def save_wikifier_choice(input_dataframe: pd.DataFrame, choice: bool = None) -> bool: + """ + Function used to check whether a given dataframe need to run wikifier or not, if check failed, default not to do wikifier + :param choice: a optional param, if given, use user's setting, otherwise by checking the size of the input dataframe + :param input_dataframe: the supplied dataframe that need to be wikified + :return: a bool, True means it need to be wikifiered, False means not need + """ + try: + hash_input_data = str(hash_pandas_object(input_dataframe).sum()) + # if folder / file, create it + storage_loc = os.path.join(config.cache_file_storage_base_loc, "other_cache") + if not os.path.exists(storage_loc): + os.mkdir(storage_loc) + file_loc = os.path.join(storage_loc, "wikifier_choice.json") + if os.path.exists(file_loc): + with open(file_loc, 'r') as f: + wikifier_choices = json.load(f) + else: + wikifier_choices = dict() + + if choice is None: + input_size = input_dataframe.shape[0] * input_dataframe.shape[1] + if input_size >= config.maximum_accept_wikifier_size: + choice = False + else: + choice = True + + if hash_input_data in wikifier_choices.keys() and wikifier_choices[hash_input_data] != choice: + _logger.warning("Exist wikifier choice and the old choice is different!") + _logger.warning("Now change wikifier choice for dataset with hash tag " + hash_input_data + " to " + str(choice)) + + wikifier_choices[hash_input_data] = choice + + with open(file_loc, 'w') as f: + json.dump(wikifier_choices, f) + return choice + + except Exception as e: + _logger.error("Saving wikifier choice failed!") + _logger.debug(e, exc_info=True) + return False diff --git a/datamart_isi/utilities/download_manager.py b/datamart_isi/utilities/download_manager.py index b3f83e5..73f7733 100644 --- a/datamart_isi/utilities/download_manager.py +++ b/datamart_isi/utilities/download_manager.py @@ -1,11 +1,28 @@ -from datamart_isi import config -from . 
import connection - import requests import pandas +import logging +import os +import json +import copy +import frozendict +from multiprocessing import Pool + +from d3m.container import Dataset as d3m_Dataset +from d3m.container import DataFrame as d3m_DataFrame +from d3m.base import utils as d3m_utils +from datamart_isi.cache.general_search_cache import GeneralSearchCache +from datamart_isi.cache.metadata_cache import MetadataCache +from datamart_isi.cache.wikidata_cache import QueryCache +from datamart_isi import config +from datamart_isi.utilities import connection +from SPARQLWrapper import SPARQLWrapper, JSON, POST, URLENCODED +from d3m.metadata.base import ALL_ELEMENTS WIKIDATA_URI_TEMPLATE = config.wikidata_uri_template EM_ES_URL = connection.get_es_fb_embedding_server_url() +Q_NODE_SEMANTIC_TYPE = config.q_node_semantic_type +logger = logging.getLogger(__name__) + # EM_ES_URL = config.em_es_url # EM_ES_INDEX = config.em_es_index # EM_ES_TYPE = config.em_es_type @@ -18,7 +35,7 @@ def fetch_fb_embeddings(q_nodes_list, target_q_node_column_name): qnodes = list(filter(None, q_nodes_list)) qnode_uris = [WIKIDATA_URI_TEMPLATE.format(qnode) for qnode in qnodes] # do elastic search - num_of_try = int(len(qnode_uris)/1024) + 1 if len(qnode_uris)%1024 != 0 else int(len(qnode_uris)/1024) + num_of_try = int(len(qnode_uris)/1024) + 1 if len(qnode_uris) % 1024 != 0 else int(len(qnode_uris)/1024) res = dict() for i in range(num_of_try): query = { @@ -57,3 +74,195 @@ def fetch_fb_embeddings(q_nodes_list, target_q_node_column_name): return_df = return_df.append(each_result, ignore_index=True) return return_df + + @staticmethod + def parse_geospatial_query(geo_variable): + """ + Finding closest q-node for a (latitude, longitude) point + :param geo_variable: dict + :return: a qnode: str + """ + geo_gra_dict = {'country': 'Q6256', 'state': 'Q7275', 'city': 'Q515', 'county': 'Q28575', + 'postal_code': 'Q37447'} + + wikidata_server = connection.get_wikidata_server_url() + qm_wikidata = SPARQLWrapper(wikidata_server) + qm_wikidata.setReturnFormat(JSON) + qm_wikidata.setMethod(POST) + qm_wikidata.setRequestMethod(URLENCODED) + + results = None + if "latitude" in geo_variable.keys() and "longitude" in geo_variable.keys(): + granularity = geo_gra_dict[geo_variable["granularity"]] + radius = geo_variable["radius"] + x, y = geo_variable["longitude"], geo_variable["latitude"] + + if x and y: + # find closest Q nodes around a geospatial point from wikidata query + sparql_query = "select distinct ?place where \n{\n ?place wdt:P31/wdt:P279* wd:" + granularity + " .\n" \ + + "SERVICE wikibase:around {\n ?place wdt:P625 ?location .\n" \ + + "bd:serviceParam wikibase:center " + "\"Point(" + str(x) + " " + str(y) + ")\"^^geo:wktLiteral .\n" \ + + "bd:serviceParam wikibase:radius " + "\"" + str(radius) + "\" .\n" \ + + "bd:serviceParam wikibase:distance ?dist. 
\n}\n" \ + + "SERVICE wikibase:label { bd:serviceParam wikibase:language \"en\" }\n}\n" \ + + "ORDER BY ASC(?dist) \n Limit 1 \n" + try: + qm_wikidata.setQuery(sparql_query) + results = qm_wikidata.query().convert()['results']['bindings'] + except Exception as e: + logger.error("Query for " + str(geo_variable) + " failed!") + logger.debug(e, exc_info=True) + + qnode = '' + if results: + value = results[0]["place"]["value"] + qnode = value.split('/')[-1] + # logger.info("Qnode:" + qnode) + + return qnode + + @staticmethod + def query_geospatial_wikidata(supplied_dataset, search_result, endpoint) -> d3m_Dataset: + """ + Finding augment_geospatial_result in cache, if not exists, do multiprocessing to find qnodes for all geospatial points. + :param supplied_dataset: d3m dataset + :param search_result: dict + :param endpoint: connection url + :return: d3m dataset after augmentation + """ + general_search_cache_manager = GeneralSearchCache(connection_url=endpoint) + res_id, supplied_dataframe = d3m_utils.get_tabular_resource(dataset=supplied_dataset, + resource_id=None, + has_hyperparameter=False) + search_result_str = json.dumps(search_result) + # try cache first + try: + cache_key = general_search_cache_manager.get_hash_key(supplied_dataframe=supplied_dataframe, + search_result_serialized=search_result_str) + cache_result = general_search_cache_manager.get_cache_results(cache_key) + if cache_result is not None: + logger.info("Get augment results from memcache success!") + return cache_result + except Exception as e: + cache_key = None + + latitude_index = search_result['metadata']['search_result']['latitude_index'] + longitude_index = search_result['metadata']['search_result']['longitude_index'] + radius = search_result['metadata']['search_result']['radius'] + gran = search_result['metadata']['search_result']['granularity'] + + # set query information + geo_variables_list = [] + for latitude, longitude in zip(supplied_dataframe.iloc[:, latitude_index],supplied_dataframe.iloc[:, longitude_index]): + geo_variable = {"latitude": latitude, "longitude": longitude, "radius": radius, "granularity": gran} + geo_variables_list.append(geo_variable) + + # get qnodes from multiple processing + logger.debug("Start to query geospatial data") + with Pool(os.cpu_count()) as p: + qnodes = p.map(DownloadManager.parse_geospatial_query, geo_variables_list) + logger.debug("Finished querying geospatial data") + + # augment qnodes in dataframe + output_df = copy.copy(supplied_dataframe) + lat_name, long_name = supplied_dataframe.columns[latitude_index], supplied_dataframe.columns[longitude_index] + if qnodes and set(qnodes) != set(''): + output_df["Geo_" + lat_name + "_" + long_name + "_" + gran + "_wikidata"] = qnodes + else: + logger.debug("No geospatial Qnodes!") + + # generate dataset + output_ds = copy.copy(supplied_dataset) + output_ds[res_id] = d3m_DataFrame(output_df, generate_metadata=False) + + # update metadata on column length + selector = (res_id, ALL_ELEMENTS) + old_meta = dict(output_ds.metadata.query(selector)) + old_meta_dimension = dict(old_meta['dimension']) + old_meta_dimension['length'] = output_df.shape[1] + old_meta['dimension'] = frozendict.FrozenOrderedDict(old_meta_dimension) + new_meta = frozendict.FrozenOrderedDict(old_meta) + output_ds.metadata = output_ds.metadata.update(selector, new_meta) + + # update qnode column's metadata + selector = (res_id, ALL_ELEMENTS, output_df.shape[1] - 1) + metadata = {"name": output_df.columns[-1], + "structural_type": str, + 'semantic_types': ( + 
"http://schema.org/Text", + "https://metadata.datadrivendiscovery.org/types/Attribute", + Q_NODE_SEMANTIC_TYPE + )} + output_ds.metadata = output_ds.metadata.update(selector, metadata) + + # save to cache + if cache_key: + response = general_search_cache_manager.add_to_memcache(supplied_dataframe=supplied_dataframe, + search_result_serialized=search_result_str, + augment_results=output_ds, + hash_key=cache_key + ) + # save the augmented result's metadata if second augment is conducted + MetadataCache.save_metadata_from_dataset(output_ds) + if not response: + logger.warning("Push augment results to results failed!") + else: + logger.info("Push augment results to memcache success!") + return output_ds + + @staticmethod + def fetch_qnode_info(input_df, endpoint): + """ + + :param input_df: wikifier result + :return: output_df: for wikidata columns, add labels and descriptions + """ + wikidata_cache_manager = QueryCache(connection_url=endpoint) + col_name = input_df.columns.tolist() + new_col_name = [] + + for name in col_name: + new_col_name.append(name) + if "_wikidata_" in name: + qnodes = input_df[name].tolist() + unique_qnodes = list(set(qnodes)) + q_node_query_part = "" + + for each in unique_qnodes: + if len(each) > 0: + q_node_query_part += "(wd:" + each + ")" + sparql_query = "select distinct ?item ?itemLabel ?itemDescription where \n{\n VALUES (?item) {" + q_node_query_part \ + + " }\n SERVICE wikibase:label { bd:serviceParam wikibase:language \"[AUTO_LANGUAGE],en\". } \n}" + + results = wikidata_cache_manager.get_result(sparql_query) + + if results is None: + # if response none, it means get wikidata query results failed + logger.error("Can't get wikidata search results for column " + name) + continue + + # save label and description in input_df + qnodes_info = {} + for each in results: + key = each['item']['value'].split('/')[-1] + qnodes_info[key] = {} + qnodes_info[key]['qnode_description'] = each['itemDescription']['value'] if "itemDescription" in each else "" + qnodes_info[key]['qnode_label'] = each['itemLabel']['value'] if "itemLabel" in each else "" + + col_label, col_des = [], [] + for qnode in qnodes: + if len(qnode) > 0 and qnode in qnodes_info: + col_label.append(qnodes_info[qnode]["qnode_label"]) + col_des.append(qnodes_info[qnode]["qnode_description"]) + else: + col_label.append("") + col_des.append("") + input_df[name + "_label"] = col_label + input_df[name + "_description"] = col_des + new_col_name.append(name + "_label") + new_col_name.append(name + "_description") + + return input_df[new_col_name] + + + diff --git a/datamart_isi/utilities/geospatial_related.py b/datamart_isi/utilities/geospatial_related.py new file mode 100644 index 0000000..a646b36 --- /dev/null +++ b/datamart_isi/utilities/geospatial_related.py @@ -0,0 +1,36 @@ +class GeospatialRelated: + def __init__(self, x: float, y: float) -> None: + """ + Initial method of coordinate. + :param x: X-axis coordinate, usually is latitude + :param y: Y-axis coordinate, usually is longitude + :return: + """ + self.x = x + self.y = y + + def coordinate_transform(self): + """ + This function is used to do the axis transformation, in order to adapt to the Wikidata query service. + So, x-axis coordinate should be longitude, y-axis should be latitude + """ + temp = self.x + self.x = self.y + self.y = temp + + def distinguish_two_points(self, g: 'GeospatialRelated'): + """ + This function is used to distinguish top left point and right bottom point in a bounding box. 
+ :param g: an instance of class GeospatialRelated + :return: two tuples, one is top_left_point(x, y), the other is right_bottom_point(x, y) + """ + if self.x < g.x and self.y > g.y: + return (self.x, self.y), (g.x, g.y) + elif g.x < self.x and g.y > self.y: + return (g.x, g.y), (self.x, self.y) + else: + return None, None + + + def get_coordinate(self): + return self.x, self.y diff --git a/datamart_isi/utilities/utils.py b/datamart_isi/utilities/utils.py index e82649d..eafb341 100644 --- a/datamart_isi/utilities/utils.py +++ b/datamart_isi/utilities/utils.py @@ -5,12 +5,16 @@ import os import logging import json +import hashlib from SPARQLWrapper import SPARQLWrapper, JSON, POST, URLENCODED +from d3m.metadata.base import ALL_ELEMENTS from io import StringIO from ast import literal_eval from d3m.container import DataFrame as d3m_DataFrame from datamart_isi.config import cache_file_storage_base_loc from datamart_isi.utilities import connection +from dsbox.datapreprocessing.cleaner.data_profile import Profiler, Hyperparams as ProfilerHyperparams +from dsbox.datapreprocessing.cleaner.cleaning_featurizer import CleaningFeaturizer, CleaningFeaturizerHyperparameter WIKIDATA_SERVER = connection.get_wikidata_server_url() @@ -31,22 +35,39 @@ def materialize(metadata, run_wikifier) -> pd.DataFrame: # general type materializer if 'url' in metadata: dataset_url = metadata['url']['value'] - from dsbox.datapreprocessing.cleaner.data_profile import Profiler, Hyperparams as ProfilerHyperparams - from dsbox.datapreprocessing.cleaner.cleaning_featurizer import CleaningFeaturizer, CleaningFeaturizerHyperparameter - file_type = metadata.get("file_type") or "" - if file_type == "": - # no file type get, try to guess - file_type = dataset_url.split(".")[-1] + # updated v2019.10.14: add local storage cache file + hash_generator = hashlib.md5() + hash_generator.update(dataset_url.encode('utf-8')) + hash_url_key = hash_generator.hexdigest() + dataset_cache_loc = os.path.join(cache_file_storage_base_loc, "datasets_cache", hash_url_key + ".h5") + _logger.debug("Checking whether a cache file exists at " + dataset_cache_loc) + if os.path.exists(dataset_cache_loc): + _logger.info("Found existing cached dataset file") + loaded_data = pd.read_hdf(dataset_cache_loc) else: - file_type = file_type['value'] - - if file_type == "wikitable": - extra_information = literal_eval(metadata['extra_information']['value']) - loaded_data = Utils.materialize_for_wikitable(dataset_url, file_type, extra_information) - else: - loaded_data = Utils.materialize_for_general(dataset_url, file_type) - + _logger.info("Cached dataset file not found, will run materializer.") + file_type = metadata.get("file_type") or "" + if file_type == "": + # no file type given, try to guess + file_type = dataset_url.split(".")[-1] + else: + file_type = file_type['value'] + + if file_type == "wikitable": + extra_information = literal_eval(metadata['extra_information']['value']) + loaded_data = Utils.materialize_for_wikitable(dataset_url, file_type, extra_information) + else: + loaded_data = Utils.materialize_for_general(dataset_url, file_type) + try: + # save the loaded data + loaded_data.to_hdf(dataset_cache_loc, key='df', mode='w', format='fixed') + _logger.debug("Saving dataset cache success!") + except Exception as e: + _logger.warning("Saving dataset cache failed!") + _logger.debug(e, exc_info=True) # run dsbox's profiler and cleaner + # from dsbox.datapreprocessing.cleaner.data_profile import Profiler, Hyperparams as ProfilerHyperparams + # from
dsbox.datapreprocessing.cleaner.cleaning_featurizer import CleaningFeaturizer, CleaningFeaturizerHyperparameter # hyper1 = ProfilerHyperparams.defaults() # profiler = Profiler(hyperparams=hyper1) # profiled_df = profiler.produce(inputs=loaded_data).value @@ -190,15 +211,40 @@ def generate_metadata_from_dataframe(cls, data: pd.DataFrame, original_meta: dic from datamart_isi.profilers.basic_profiler import BasicProfiler, VariableMetadata, GlobalMetadata global_metadata = GlobalMetadata.construct_global(description=cls.DEFAULT_DESCRIPTION) - for col_offset in range(data.shape[1]): - variable_metadata = BasicProfiler.basic_profiling_column( - description={}, - variable_metadata=VariableMetadata.construct_variable(description={}), - column=data.iloc[:, col_offset] - ) - global_metadata.add_variable_metadata(variable_metadata) - global_metadata = BasicProfiler.basic_profiling_entire(global_metadata=global_metadata, - data=data) + global_metadata = BasicProfiler.basic_profiling_entire(global_metadata=global_metadata, data=data) + metadata_dict = global_metadata.value + + # for col_offset in range(data.shape[1]): + # variable_metadata = BasicProfiler.basic_profiling_column( + # description={}, + # variable_metadata=VariableMetadata.construct_variable(description={}), + # column=data.iloc[:, col_offset] + # ) + # global_metadata.add_variable_metadata(variable_metadata) + hyper1 = ProfilerHyperparams.defaults() + hyper2 = CleaningFeaturizerHyperparameter.defaults() + clean_f = CleaningFeaturizer(hyperparams=hyper2) + profiler = Profiler(hyperparams=hyper1) + profiled_df = profiler.produce(inputs=data).value + clean_f.set_training_data(inputs=profiled_df) + clean_f.fit() + cleaned_df = clean_f.produce(inputs=profiled_df).value + cleaned_df_metadata = cleaned_df.metadata + + for i in range(data.shape[1]): + each_column_metadata = cleaned_df_metadata.query((ALL_ELEMENTS, i)) + column_name = data.columns[i] + if "datetime" in data.iloc[:, i].dtype.name: + semantic_type = ("http://schema.org/DateTime", 'https://metadata.datadrivendiscovery.org/types/Attribute') + else: + semantic_type = each_column_metadata['semantic_types'] + variable_metadata = {'datamart_id': None, + 'semantic_type': semantic_type, + 'name': column_name, + 'description': 'column name: {}, dtype: {}'.format(column_name, cleaned_df.iloc[:, i].dtype.name) + } + metadata_dict['variables'].append(variable_metadata) + if original_meta: - global_metadata.value.update(original_meta) - return global_metadata.value + metadata_dict.update(original_meta) + return metadata_dict diff --git a/examples/sample-augment-pipeline-new-v2019.10.10.json b/examples/sample-augment-pipeline-new-v2019.10.10.json new file mode 100644 index 0000000..be3b8bb --- /dev/null +++ b/examples/sample-augment-pipeline-new-v2019.10.10.json @@ -0,0 +1,558 @@ +{ + "id":"60c24ddd-ffe7-4ef7-b041-794708968e75", + "schema":"https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", + "created":"2019-10-11T10:22:42.846240Z", + "inputs":[ + { + "name":"input dataset" + } + ], + "outputs":[ + { + "data":"steps.19.produce", + "name":"predictions of input dataset" + } + ], + "steps":[ + { + "type":"PRIMITIVE", + "primitive":{ + "id":"f31f8c1f-d1c5-43e5-a4b2-2ae4a761ef2e", + "version":"0.2.0", + "python_path":"d3m.primitives.data_transformation.denormalize.Common", + "name":"Denormalize datasets", + "digest":"6a80776d244347f0d29f4358df1cd0286c25f67e03a7e2ee517c6e853e6a9d1f" + }, + "arguments":{ + "inputs":{ + "type":"CONTAINER", + "data":"inputs.0" + } + }, + "outputs":[ + { 
+ "id":"produce" + } + ], + "hyperparams":{ + "starting_resource":{ + "type":"VALUE", + "data":null + }, + "recursive":{ + "type":"VALUE", + "data":true + }, + "many_to_many":{ + "type":"VALUE", + "data":false + }, + "discard_not_joined_tabular_resources":{ + "type":"VALUE", + "data":false + } + } + }, + { + "type":"PRIMITIVE", + "primitive":{ + "id":"dsbox-featurizer-do-nothing-dataset-version", + "version":"1.5.3", + "python_path":"d3m.primitives.data_preprocessing.do_nothing_for_dataset.DSBOX", + "name":"DSBox do-nothing primitive dataset version", + "digest":"689c59fc8ddddb763318f8945aabb1d6eab1c4d782a1fd5591585dd2d66ec23b" + }, + "arguments":{ + "inputs":{ + "type":"CONTAINER", + "data":"steps.0.produce" + } + }, + "outputs":[ + { + "id":"produce" + } + ] + }, + { + "type":"PRIMITIVE", + "primitive":{ + "id":"dsbox-featurizer-do-nothing-dataset-version", + "version":"1.5.3", + "python_path":"d3m.primitives.data_preprocessing.do_nothing_for_dataset.DSBOX", + "name":"DSBox do-nothing primitive dataset version", + "digest":"689c59fc8ddddb763318f8945aabb1d6eab1c4d782a1fd5591585dd2d66ec23b" + }, + "arguments":{ + "inputs":{ + "type":"CONTAINER", + "data":"steps.1.produce" + } + }, + "outputs":[ + { + "id":"produce" + } + ] + }, + { + "type":"PRIMITIVE", + "primitive":{ + "id":"dsbox-featurizer-do-nothing-dataset-version", + "version":"1.5.3", + "python_path":"d3m.primitives.data_preprocessing.do_nothing_for_dataset.DSBOX", + "name":"DSBox do-nothing primitive dataset version", + "digest":"689c59fc8ddddb763318f8945aabb1d6eab1c4d782a1fd5591585dd2d66ec23b" + }, + "arguments":{ + "inputs":{ + "type":"CONTAINER", + "data":"steps.2.produce" + } + }, + "outputs":[ + { + "id":"produce" + } + ] + }, + { + "type":"PRIMITIVE", + "primitive":{ + "id":"dsbox-featurizer-do-nothing-dataset-version", + "version":"1.5.3", + "python_path":"d3m.primitives.data_preprocessing.do_nothing_for_dataset.DSBOX", + "name":"DSBox do-nothing primitive dataset version", + "digest":"689c59fc8ddddb763318f8945aabb1d6eab1c4d782a1fd5591585dd2d66ec23b" + }, + "arguments":{ + "inputs":{ + "type":"CONTAINER", + "data":"steps.3.produce" + } + }, + "outputs":[ + { + "id":"produce" + } + ] + }, + { + "type":"PRIMITIVE", + "primitive":{ + "id":"fe0f1ac8-1d39-463a-b344-7bd498a31b91", + "version":"0.1", + "python_path":"d3m.primitives.data_augmentation.datamart_augmentation.Common", + "name":"Perform dataset augmentation using Datamart", + "digest":"498665b64f05ebcc14cd78f3000804fff366b833628462010d4eca931c086b81" + }, + "arguments":{ + "inputs":{ + "type":"CONTAINER", + "data":"steps.4.produce" + } + }, + "outputs":[ + { + "id":"produce" + } + ], + "hyperparams":{ + "system_identifier":{ + "type":"VALUE", + "data":"NYU" + }, + "search_result":{ + "type":"VALUE", + "data":"{\"augmentation\": {\"left_columns\": [[6]], \"right_columns\": [[47]], \"type\": \"join\"}, \"id\": \"Db48a61bd-2b9a-4aad-ac21-755598b3df4d\", \"materialize_info\": \"{\\\"id\\\": \\\"Db48a61bd-2b9a-4aad-ac21-755598b3df4d\\\", \\\"score\\\": 0.7053783337684788, \\\"metadata\\\": {\\\"connection_url\\\": \\\"http://dsbox01.isi.edu:9000\\\", \\\"search_result\\\": {\\\"variable\\\": {\\\"type\\\": \\\"uri\\\", \\\"value\\\": \\\"http://www.wikidata.org/entity/statement/Db48a61bd-2b9a-4aad-ac21-755598b3df4d-e47e89d8-8a0a-4347-ad1a-d294b7a550d3\\\"}, \\\"dataset\\\": {\\\"type\\\": \\\"uri\\\", \\\"value\\\": \\\"http://www.wikidata.org/entity/Db48a61bd-2b9a-4aad-ac21-755598b3df4d\\\"}, \\\"url\\\": {\\\"type\\\": \\\"uri\\\", \\\"value\\\": 
\\\"https://raw.githubusercontent.com/usc-isi-i2/datamart-userend/master/example_datasets/educate.csv\\\"}, \\\"file_type\\\": {\\\"datatype\\\": \\\"http://www.w3.org/2001/XMLSchema#string\\\", \\\"type\\\": \\\"literal\\\", \\\"value\\\": \\\"csv\\\"}, \\\"extra_information\\\": {\\\"datatype\\\": \\\"http://www.w3.org/2001/XMLSchema#string\\\", \\\"type\\\": \\\"literal\\\", \\\"value\\\": \\\"{\\\\\\\"column_meta_0\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Integer\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"FIPS Code\\\\\\\"}, \\\\\\\"column_meta_1\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Text\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"State\\\\\\\"}, \\\\\\\"column_meta_2\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Text\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"Area name\\\\\\\"}, \\\\\\\"column_meta_3\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Integer\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"2003 Rural-urban Continuum Code\\\\\\\"}, \\\\\\\"column_meta_4\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Integer\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"2003 Urban Influence Code\\\\\\\"}, \\\\\\\"column_meta_5\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Integer\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"2013 Rural-urban Continuum Code\\\\\\\"}, \\\\\\\"column_meta_6\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Integer\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"2013 Urban Influence Code\\\\\\\"}, \\\\\\\"column_meta_7\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Integer\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"Less than a high school diploma, 1970\\\\\\\"}, \\\\\\\"column_meta_8\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Integer\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"High school diploma only, 1970\\\\\\\"}, \\\\\\\"column_meta_9\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Integer\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"Some college (1-3 years), 1970\\\\\\\"}, \\\\\\\"column_meta_10\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Integer\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"Four years of college or higher, 1970\\\\\\\"}, \\\\\\\"column_meta_11\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"Percent of adults with less than a high school diploma, 1970\\\\\\\"}, \\\\\\\"column_meta_12\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", 
\\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"Percent of adults with a high school diploma only, 1970\\\\\\\"}, \\\\\\\"column_meta_13\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"Percent of adults completing some college (1-3 years), 1970\\\\\\\"}, \\\\\\\"column_meta_14\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"Percent of adults completing four years of college or higher, 1970\\\\\\\"}, \\\\\\\"column_meta_15\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Integer\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"Less than a high school diploma, 1980\\\\\\\"}, \\\\\\\"column_meta_16\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Integer\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"High school diploma only, 1980\\\\\\\"}, \\\\\\\"column_meta_17\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Integer\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"Some college (1-3 years), 1980\\\\\\\"}, \\\\\\\"column_meta_18\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Integer\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"Four years of college or higher, 1980\\\\\\\"}, \\\\\\\"column_meta_19\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"Percent of adults with less than a high school diploma, 1980\\\\\\\"}, \\\\\\\"column_meta_20\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"Percent of adults with a high school diploma only, 1980\\\\\\\"}, \\\\\\\"column_meta_21\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"Percent of adults completing some college (1-3 years), 1980\\\\\\\"}, \\\\\\\"column_meta_22\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"Percent of adults completing four years of college or higher, 1980\\\\\\\"}, \\\\\\\"column_meta_23\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Integer\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"Less than a high school diploma, 1990\\\\\\\"}, \\\\\\\"column_meta_24\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Integer\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"High school diploma only, 1990\\\\\\\"}, \\\\\\\"column_meta_25\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Integer\\\\\\\", 
\\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"Some college or associate's degree, 1990\\\\\\\"}, \\\\\\\"column_meta_26\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Integer\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"Bachelor's degree or higher, 1990\\\\\\\"}, \\\\\\\"column_meta_27\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"Percent of adults with less than a high school diploma, 1990\\\\\\\"}, \\\\\\\"column_meta_28\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"Percent of adults with a high school diploma only, 1990\\\\\\\"}, \\\\\\\"column_meta_29\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"Percent of adults completing some college or associate's degree, 1990\\\\\\\"}, \\\\\\\"column_meta_30\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"Percent of adults with a bachelor's degree or higher, 1990\\\\\\\"}, \\\\\\\"column_meta_31\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Integer\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"Less than a high school diploma, 2000\\\\\\\"}, \\\\\\\"column_meta_32\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Integer\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"High school diploma only, 2000\\\\\\\"}, \\\\\\\"column_meta_33\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Integer\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"Some college or associate's degree, 2000\\\\\\\"}, \\\\\\\"column_meta_34\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Integer\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"Bachelor's degree or higher, 2000\\\\\\\"}, \\\\\\\"column_meta_35\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"Percent of adults with less than a high school diploma, 2000\\\\\\\"}, \\\\\\\"column_meta_36\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"Percent of adults with a high school diploma only, 2000\\\\\\\"}, \\\\\\\"column_meta_37\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"Percent of adults completing some college or associate's degree, 2000\\\\\\\"}, \\\\\\\"column_meta_38\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", 
\\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"Percent of adults with a bachelor's degree or higher, 2000\\\\\\\"}, \\\\\\\"column_meta_39\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Integer\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"Less than a high school diploma, 2013-17\\\\\\\"}, \\\\\\\"column_meta_40\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Integer\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"High school diploma only, 2013-17\\\\\\\"}, \\\\\\\"column_meta_41\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Integer\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"Some college or associate's degree, 2013-17\\\\\\\"}, \\\\\\\"column_meta_42\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Integer\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"Bachelor's degree or higher, 2013-17\\\\\\\"}, \\\\\\\"column_meta_43\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"Percent of adults with less than a high school diploma, 2013-17\\\\\\\"}, \\\\\\\"column_meta_44\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"Percent of adults with a high school diploma only, 2013-17\\\\\\\"}, \\\\\\\"column_meta_45\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"Percent of adults completing some college or associate's degree, 2013-17\\\\\\\"}, \\\\\\\"column_meta_46\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"Percent of adults with a bachelor's degree or higher, 2013-17\\\\\\\"}, \\\\\\\"column_meta_47\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Text\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"FIPS Code_wikidata\\\\\\\"}, \\\\\\\"data_metadata\\\\\\\": {\\\\\\\"shape_0\\\\\\\": 3283, \\\\\\\"shape_1\\\\\\\": 48}}\\\"}, \\\"title\\\": {\\\"xml:lang\\\": \\\"en\\\", \\\"type\\\": \\\"literal\\\", \\\"value\\\": \\\"educate csv\\\"}, \\\"keywords\\\": {\\\"datatype\\\": \\\"http://www.w3.org/2001/XMLSchema#string\\\", \\\"type\\\": \\\"literal\\\", \\\"value\\\": \\\"fips code state area name 2003 rural urban continuum code 2003 urban influence code 2013 rural urban continuum code 2013 urban influence code less than a high school diploma 1970 high school diploma only 1970 some college 1 3 years 1970 four years of college or higher 1970 percent of adults with less than a high school diploma 1970 percent of adults with a high school diploma only 1970 percent of adults completing some college 1 3 years 1970 percent of adults completing four years of college or higher 1970 less than a high school diploma 1980 high school diploma only 1980 some college 1 3 years 
1980 four years of college or higher 1980 percent of adults with less than a high school diploma 1980 percent of adults with a high school diploma only 1980 percent of adults completing some college 1 3 years 1980 percent of adults completing four years of college or higher 1980 less than a high school diploma 1990 high school diploma only 1990 some college or associate s degree 1990 bachelor s degree or higher 1990 percent of adults with less than a high school diploma 1990 percent of adults with a high school diploma only 1990 percent of adults completing some college or associate s degree 1990 percent of adults with a bachelor s degree or higher 1990 less than a high school diploma 2000 high school diploma only 2000 some college or associate s degree 2000 bachelor s degree or higher 2000 percent of adults with less than a high school diploma 2000 percent of adults with a high school diploma only 2000 percent of adults completing some college or associate s degree 2000 percent of adults with a bachelor s degree or higher 2000 less than a high school diploma 2013 17 high school diploma only 2013 17 some college or associate s degree 2013 17 bachelor s degree or higher 2013 17 percent of adults with less than a high school diploma 2013 17 percent of adults with a high school diploma only 2013 17 percent of adults completing some college or associate s degree 2013 17 percent of adults with a bachelor s degree or higher 2013 17 fips code wikidata\\\"}, \\\"datasetLabel\\\": {\\\"xml:lang\\\": \\\"en\\\", \\\"type\\\": \\\"literal\\\", \\\"value\\\": \\\"Db48a61bd-2b9a-4aad-ac21-755598b3df4d\\\"}, \\\"variableName\\\": {\\\"datatype\\\": \\\"http://www.w3.org/2001/XMLSchema#string\\\", \\\"type\\\": \\\"literal\\\", \\\"value\\\": \\\"FIPS Code_wikidata\\\"}, \\\"score\\\": {\\\"datatype\\\": \\\"http://www.w3.org/2001/XMLSchema#double\\\", \\\"type\\\": \\\"literal\\\", \\\"value\\\": \\\"0.7053783337684788\\\"}}, \\\"query_json\\\": {\\\"keywords\\\": [\\\"FIPS_wikidata\\\"], \\\"variables\\\": {\\\"FIPS_wikidata\\\": \\\"q54236 q511069 q486621 q498341 q382993 q494552 q489652 q496771 q48933 q112061 q54073 q505499 q108962 q483857 q485420 q503023 q502380 q484159 q110187 q110262 q502984 q490874 q112673 q501130 q156163 q495409 q491988 q58711 q506337 q507981 q167583 q389551 q495105 q505841 q495175 q111385 q372648 q484391 q182644 q110555 q110482 q110978 q494556 q504294 q484791 q501270 q428298 q496693 q497845 q490088 q484501 q504391 q156374 q490181 q501172 q192650 q501163 q502737 q511120 q501029 q112137 q109631 q403310 q488194 q506697 q496292 q488840 q61458 q27037 q54446 q119372 q511908 q485539 q488224 q40881 q511806 q835104 q495344 q484325 q108871 q1139833 q485660 q486651 q491456 q504846 q374256 q384737 q486466 q486994 q112953 q156503 q503492 q450159 q156500 q496003 q386271 q485513 q502496 q498148 q738043 q374765 q26883 q496109 q373348 q27021 q174913 q495013 q501989 q486229 q514008 q485204 q1177705 q490980 q506951 q485229 q28321 q489668 q489062 q61352 q494958 q115551 q494764 q108837 q500861 q484538 q492057 q495280 q54443 q108499 q475301 q61032 q512993 q421963 q400753 q115272 q500698 q502240 q494990 q486614 q54064 q494943 q506385 q507308 q499547 q504355 q509798 q490671 q133857 q283657 q169952 q28274 q484450 q502743 q115043 q109616 q503451 q28316 q483895 q255943 q486043 q109279 q56145 q487283 q486099 q496980 q175799 q494616 q484991 q484603 q494822 q135330 q156273 q484590 q509895 q61176 q486920 q484294 q493709 q487581 q496169 q491702 q111266 q486064 q506506 q488672 q112603 q26614 q485038 q156496 
q162560 q484234 q490647 q484752 q489684 q514024 q113117 q507230 q487731 q108952 q108626 q505886 q52250 q491316 q502875 q491537 q26762 q112149 q497871 q496125 q484527 q490813 q494480 q495223 q311904 q109651 q510915 q376803 q511935 q515205 q507078 q491178 q490864 q61484 q494931 q114904 q54086 q495165 q501976 q507733 q489430 q490306 q494926 q336183 q508271 q374649 q54066 q486118 q495261 q491584 q110684 q502054 q108047 q506538 q491406 q495649 q486305 q506181 q496031 q114862 q489457 q110539 q502400 q490498 q487243 q508220 q488922 q490869 q486394 q488853 q484304 q117917 q58771 q507126 q49290 q491941 q26760 q115433 q505987 q490124 q501804 q491899 q484431 q260871 q156276 q128096 q500751 q502345 q510947 q501014 q493255 q370310 q489420 q505854 q513891 q486191 q496607 q491945 q61526 q336190 q113793 q271601 q376053 q27245 q495839 q489915 q61354 q404898 q854630 q113005 q494923 q505392 q109622 q374452 q48867 q156445 q495719 q505644 q926589 q500767 q486270 q502587 q374908 q504312 q421960 q17174784 q511723 q485408 q271915 q61084 q111254 q489886 q193230 q491068 q498116 q491911 q61167 q491889 q128080 q503505 q115316 q506987 q63953 q506525 q156448 q109437 q512816 q512832 q492310 q490134 q27024 q376505 q303491 q495682 q496077 q487415 q486179 q489553 q485650 q58694 q108821 q118127 q61348 q497642 q484639 q74704 q512911 q489079 q110755 q156463 q51733 q486513 q491267 q488879 q61010 q504415 q496886 q932966 q368766 q484273 q374334 q115349 q495677 q486348 q486435 q502707 q501789 q111729 q338939 q71136 q493443 q389573 q847673 q376527 q262708 q114843 q111831 q496708 q503093 q94663 q484420 q54438 q501968 q156455 q48905 q484745 q111744 q336229 q26738 q312563 q490522 q370654 q494771 q506772 q488912 q489610 q494134 q379654 q489477 q932951 q484401 q188204 q491327 q486160 q108111 q484685 q61502 q489864 q56154 q485463 q113748 q485656 q503461 q495195 q54089 q502031 q484282 q71190 q513950 q427732 q486362 q486261 q509826 q484263 q484748 q279452 q501096 q493033 q490494 q503883 q112768 q400747 q108143 q489620 q505861 q507587 q489092 q504441 q376576 q28286 q486626 q497817 q311894 q490896 q495171 q494755 q502285 q61012 q502348 q490322 q61315 q488537 q506779 q506547 q461562 q485047 q490937 q58759 q488488 q495356 q374979 q502235 q54254 q491626 q384754 q484482 q61200 q489327 q48927 q506187 q108418 q485641 q167565 q498395 q504445 q108106 q156629 q26601 q111098 q490903 q484786 q26697 q484124 q497695 q1139827 q27034 q487704 q163097 q280844 q501827 q502511 q110565 q108428 q61327 q340608 q504435 q484194 q493024 q494241 q485402 q379663 q188275 q61018 q54076 q115125 q502437 q490835 q56146 q113423 q115480 q114479 q485470 q490077 q490121 q74633 q486538 q496716 q109795 q488943 q507047 q500871 q486535 q376175 q486757 q504366 q498042 q507321 q108503 q110403 q115104 q495691 q497216 q489986 q153251 q489536 q619609 q156181 q502933 q491556 q112584 q114301 q374358 q28285 q108053 q156479 q376004 q485235 q285625 q114503 q26766 q491514 q26591 q113919 q491301 q5925 q202454 q487572 q428902 q257311 q502431 q486067 q487322 q493243 q495581 q115556 q485426 q110680 q54093 q486659 q16861 q115307 q486139 q128077 q496737 q509926 q400757 q389365 q483937 q108386 q490405 q505884 q26587 q114969 q515166 q115413 q375108 q487334 q110212 q111867 q507966 q489546 q495613 q488831 q491831 q484612 q113293 q498346 q502732 q54441 q167580 q485474 q509813 q501101 q490259 q201014 q512951 q376464 q495252 q109160 q506682 q498141 q497880 q488668 q489980 q502492 q493083 q402938 q502739 q487288 q495873 q484586 q312740 q494098 q1125008 q486874 q502200 q490652 q486091 q374410 q504350 
q61150 q494998 q511470 q502364 q205715 q156429 q61160 q501292 q502592 q112977 q48891 q511691 q489897 q506315 q108856 q490454 q503329 q109695 q489779 q502086 q109709 q502269 q511095 q156582 q484465 q485364 q486248 q502207 q496700 q485808 q198529 q312509 q489901 q108067 q495629 q496664 q489222 q485780 q491936 q490378 q111530 q61472 q489677 q494146 q112456 q487403 q495414 q48850 q54240 q312470 q111766 q507281 q495479 q485001 q374604 q351865 q495185 q111720 q486325 q220005 q493125 q26754 q113237 q495990 q492016 q280815 q124637 q504335 q486255 q496971 q498336 q109670 q167589 q489767 q82605 q111391 q312475 q495332 q175756 q500686 q110325 q488468 q494620 q494485 q375008 q115216 q484411 q486218 q113815 q109293 q27844 q82505 q486184 q311921 q486389 q156358 q501108 q483973 q484381 q112107 q489880 q26628 q111876 q495169 q943772 q493044 q112957 q311908 q497634 q501084 q248521 q495067 q61350 q484567 q487236 q502483 q376876 q111298 q510885 q486265 q114445 q115273 q506557 q484620 q375125 q498286 q483888 q375652 q494216 q484542 q111273 q109790 q497795 q494233 q490251 q495359 q511135 q488917 q261672 q376640 q490112 q110575 q501602 q491927 q61036 q110407 q511834 q376703 q284439 q503877 q115419 q113854 q498332 q112115 q378896 q490642 q492048 q156270 q156156 q496644 q484755 q503583 q489648 q111694 q500977 q501761 q490958 q511510 q502945 q484548 q376059 q130006 q142369 q426699 q501918 q61173 q120080 q350001 q500670 q376113 q486000 q376755 q312149 q508288 q488769 q511679 q498020 q485571 q501319 q2613700 q498163 q491121 q490383 q336322 q156566 q484133 q484740 q108087 q497737 q490884 q494254 q110389 q110606 q487599 q489377 q63958 q495570 q82510 q485920 q485746 q484474 q485058 q505299 q491170 q109656 q502722 q498372 q502442 q496629 q503081 q484426 q375608 q179954 q502473 q379665 q115148 q430113 q494564 q484582 q490716 q511478 q503442 q484681 q26807 q94669 q491762 q477951 q506015 q377171 q501959 q5092 q507028 q506215 q501345 q513775 q507870 q134080 q501036 q61026 q485896 q488528 q113783 q112069 q490925 q489481 q491894 q506512 q490649 q501256 q486143 q486011 q111168 q506676 q108618 q111968 q490390 q377023 q507907 q485434 q335017 q485715 q494192 q486040 q109303 q502244 q486087 q337915 q2613391 q488826 q271609 q302918 q501582 q509757 q486838 q43421 q498356 q484672 q484296 q156475 q511849 q496636 q505387 q282188 q492021 q496900 q494768 q127979 q487016 q504863 q113962 q501815 q383739 q265079 q486357 q503870 q506690 q493088 q489751 q497928 q501848 q497200 q48917 q497964 q490727 q490757 q377234 q751202 q390141 q156481 q495985 q502929 q111957 q491108 q484559 q211360 q487254 q490023 q497971 q377148 q126829 q108408 q848649 q502377 q825807 q26730 q109289 q488659 q484452 q108403 q181967 q495257 q24648 q501123 q26619 q376764 q113096 q376899 q485615 q489793 q109518 q485452 q113906 q506086 q61135 q110257 q511498 q61296 q240621 q513878 q109986 q487692 q500654 q502404 q490450 q489711 q505354 q61521 q497377 q512937 q490161 q495154 q502278 q494093 q311897 q28308 q501800 q491221 q486137 q156353 q502294 q111851 q506291 q487564 q108861 q342803 q408744 q495126 q127238 q61148 q500891 q494806 q111759 q509786 q507016 q495142 q510244 q489083 q489891 q204761 q506172 q27051 q29124 q109265 q485787 q490150 q489595 q507798 q63719 q485017 q491427 q376069 q489471 q506220 q295787 q156346 q492342 q500784 q493079 q58683 q484378 q109661 q594313 q484279 q507427 q56149 q488885 q280826 q503864 q495658 q115025 q497707 q490272 q111280 q506225 q504375 q505347 q485239 q485710 q312254 q491547 q485388 q421974 q496096 q490357 q498295 q511461 q108072 q496678 
q113201 q509092 q112271 q496139 q337204 q490505 q507918 q489616 q490512 q503486 q500854 q108846 q108832 q494751 q501248 q489056 q494228 q501568 q267164 q491208 q54065 q112737 q127973 q491190 q110412 q490116 q377131 q369211 q510900 q74661 q494129 q498377 q501858 q193167 q156647 q490999 q485024 q387978 q495045 q485370 q484570 q111235 q113017 q58688 q486765 q496583 q489642 q489088 q374342 q108803 q511084 q489159 q490369 q108093 q487716 q503438 q492355 q485276 q138141 q501092 q493049 q461204 q111753 q26735 q113834 q494810 q498362 q495340 q48863 q495349 q111304 q540907 q337067 q501559 q502373 q496104 q495090 q427315 q486207 q495072 q54439 q509848 q108122 q502411 q491850 q505868 q490283 q113763 q491982 q495620 q495031 q485623 q490436 q488219 q26610 q421970 q485491 q484574 q488512 q486654 q490108 q376024 q26584 q503455 q502784 q490266 q511747 q26502 q485628 q113756 q485245 q111913 q110739 q111782 q496406 q280838 q500813 q484478 q63771 q336167 q156570 q486848 q111575 q490409 q494716 q61381 q156342 q484142 q61379 q490009 q490014 q385365 q304065 q114026 q496475 q501964 q2613601 q109975 q485582 q502549 q26710 q503554 q484404 q111928 q488892 q115396 q110417 q26880 q491229 q503446 q496025 q156613 q495965 q486480 q494138 q61145 q507931 q27233 q506569 q501830 q28503 q490190 q114884 q339724 q486112 q502920 q494236 q495654 q495937 q485509 q503059 q280596 q507459 q484771 q494104 q497854 q695782 q200696 q512699 q494223 q501974 q506068 q506235 q110491 q490972 q495851 q502952 q108866 q501839 q491623 q488865 q488690 q820502 q491949 q168144 q109686 q115166 q115061 q374890 q501555 q490144 q512713 q489099 q110670 q491173 q327080 q494463 q484530 q496729 q376990 q127978 q112526 q48878 q505505 q496242 q506300 q485944 q500674 q94673 q26719 q485536 q426746 q493260 q507294 q506574 q377952 q111774 q49259 q111374 q269164 q506363 q497628 q489861 q61355 q509838 q495974 q341679 q505638 q507153 q495096 q176068 q346925 q112652 q492488 q490002 q133860 q498077 q501939 q490488 q486095 q490966 q28488 q502230 q489847 q115583 q490395 q403777 q486313 q26740 q113185 q26889 q485588 q156306 q421956 q493957 q486071 q489743 q341755 q156191 q156287 q109761 q495930 q504345 q501779 q61289 q488693 q430328 q486380 q502526 q192753 q511659 q177678 q488543 q496602 q487605 q501954 q484513 q280820 q512885 q180785 q372796 q498034 q486495 q491882 q288606 q494776 q484287 q377122 q503076 q110528 q490989 q487578 q109442 q128137 q502880 q490427 q484367 q379540 q108436 q48834 q485752 q376129 q61077 q501819 q484290 q494815 q485305 q108372 q110764 q507099 q110904 q496744 q115404 q502414 q502479 q492026 q377181 q113875 q491569 q506975 q110624 q156467 q511713 q485926 q484330 q115200 q490656 q338052 q488679 q494180 q484239 q54163 q504379 q503889 q110655 q500803 q483947 q500958 q495906 q114123 q368006 q513919 q494626 q109150 q504410 q511106 q108101 q500918 q502050 q156431 q487681 q113892 q485804 q501151 q494207 q503468 q586070 q501060 q496152 q492888 q501785 q38022 q489894 q337270 q304289 q74290 q485558 q56151 q115289 q112934 q514038 q385931 q490734 q110640 q61029 q48910 q489625 q494077 q110760 q108784 q501277 q156411 q501074 q54231 q337688 q500845 q501340 q61368 q502069 q503478 q500984 q513805 q312497 q485441 q18424 q156350 q375002 q485291 q505287 q376096 q497607 q156452 q544539 q263742 q488847 q346959 q529958 q70979 q112807 q495310 q26701 q108842 q111380 q108117 q115577 q495052 q431826 q61020 q156642 q491292 q489673 q486243 q490796 q490779 q509770 q27023 q26564 q490215 q503000 q495979 q488206 q490287 q337348 q488701 q494152 q49014 q156634 q54072 q490678 
q378438 q484153 q108612 q495022 q49231 q512901 q502273 q496176 q61048 q491436 q494081 q27018 q338424 q491837 q493605 q507169 q484729 q498155 q312455 q61478 q507472 q494919 q500663 q501323 q484556 q375803 q29445 q114948 q486526 q109851 q48937 q497984 q111709 q489783 q431440 q137562 q402945 q495672 q110340 q497992 q156213 q495191 q312748 q109969 q501055 q18432 q156380 q489919 q112442 q349995 q502081 q110384 q495116 q486167 q494174 q504385 q61330 q509910 q484664 q506191 q312746 q486344 q26631 q484659 q176307 q502210 q108827 q48831 q497424 q483990 q430938 q115266 q504360 q486517 q502213 q26605 q54082 q341639 q156628 q504838 q502397 q48874 q374589 q188376 q26885 q113354 q167656 q374527 q302165 q506752 q491470 q185533 q366871 q351808 q501823 q503071 q486210 q112869 q491243 q485729 q49149 q494799 q342428 q493951 q501587 q111291 q491991 q591156 q485726 q497702 q374382 q109308 q110779 q489869 q178341 q502288 q379657 q494630 q497810 q483942 q110130 q196014 q111584 q484015 q500897 q376822 q13188841 q278589 q485984 q47894 q491285 q502021 q497938 q108788 q376034 q377035 q515150 q54251 q54258 q107977 q109641 q378527 q115258 q112086 q26716 q502072 q490774 q503538 q490482 q490799 q376838 q506321 q495181 q110532 q484371 q495633 q502077 q491878 q109503 q108494 q498024 q112078 q505444 q512746 q111409 q490443 q505317 q504397 q485467 q28499 q490722 q495588 q495393 q501350 q497830 q504284 q512259 q494167 q507216 q61370 q485549 q488818 q111421 q491276 q426262 q306343 q54238 q491970 q502014 q502516 q504450 q489306 q496132 q492053 q1130480 q61181 q111593 q490386 q47944 q26526 q484224 q497284 q109526 q137509 q487744 q491184 q156387 q488499 q491529 q484760 q488572 q494774 q506552 q494818 q374626 q489069 q489855 q491035 q501948 q54445 q501306 q488859 q335121 q376947 q156468 q426398 q374200 q109298 q109756 q498675 q504830 q691614 q495017 q513905 q495209 q486231 q182845 q376616 q111622 q493599 q496115 q508618 q484354 q502521 q486281 q485912 q506351 q505305 q74628 q110570 q504422 q108127 q501843 q24645 q111844 q491017 q54160 q496568 q277728 q512767 q26598 q496758 q54079 q495132 q506157 q486133 q379660 q492040 q491201 q109165 q502351 q486453 q489468 q495198 q485361 q203049 q488582 q374404 q506151 q109846 q495564 q513078 q488698 q490360 q506727 q495891 q486060 q497482 q485460 q495285 q56153 q491256 q500569 q109665 q127184 q111400 q490255 q498312 q511788 q494518 q485532 q111886 q489682 q114992 q156577 q247715 q312731 q490949 q491590 q485910 q490769 q253538 q156377 q27025 q494569 q492819 q490019 q507757 q493074 q489613 q507353 q61365 q494536 q54440 q112915 q502576 q495201 q109317 q501043 q488805 q501796 q489599 q486386 q114923 q502250 q493134 q512965 q112499 q340605 q507881 q487534 q500312 q485044 q156580 q489999 q491447 q486198 q505276 q502447 q485030 q112686 q111549 q491288 q507770 q337402 q484470 q26699 q484156 q156278 q302852 q387216 q374968 q489702 q489447 q497908 q497172 q484551 q498123 q495687 q485035 q156575 q503366 q497652 q489463 q506326 q500880 q45441 q494893 q108367 q502451 q110983 q156295 q31091 q108131 q484247 q490690 q61007 q505975 q112846 q110336 q54161 q376042 q512732 q284806 q485502 q484457 q484668 q115463 q494514 q259095 q493610 q513933 q488821 q112220 q109270 q515177 q491160 q486048 q156632 q491553 q490029 q490784 q485922 q513833 q491959 q342043 q111663 q373953 q496511 q26676 q508254 q494598 q109457 q491272 q376086 q156623 q485577 q501242 q108397 q504850 q474999 q498014 q484616 q486288 q477870 q495151 q485592 q506064 q484521 q501067 q495273 q486431 q483916 q491126 q505834 q29118 q494899 q54260 q495918 
q26723 q485561 q491508 q28283 q490414 q494575 q490750 q490745 q376106 q505405 q488175 q498072 q341708 q484335 q26527 q109810 q487280 q376102 q488796 q58698 q110561 q490420 q312744 q156186 q484268 q510934 q376562 q113823 q61414 q781165 q58774 q26895 q489716 q504428 q514093 q488166 q312737 q515220 q28311 q281681 q504371 q340591 q56144 q507892 q27045 q156361 q498319 q370329 q156216 q111647 q501312 q491148 q489312\\\"}, \\\"keywords_search\\\": [], \\\"variables_search\\\": {}}, \\\"search_type\\\": \\\"general\\\"}, \\\"augmentation\\\": {\\\"properties\\\": \\\"join\\\", \\\"right_columns\\\": [47], \\\"left_columns\\\": [6]}, \\\"datamart_type\\\": \\\"isi\\\"}\", \"metadata\": [{\"metadata\": {\"dimension\": {\"length\": 3283, \"name\": \"rows\", \"semantic_types\": [\"https://metadata.datadrivendiscovery.org/types/TabularRow\"]}, \"schema\": \"https://metadata.datadrivendiscovery.org/schemas/v0/container.json\", \"semantic_types\": [\"https://metadata.datadrivendiscovery.org/types/Table\"], \"structural_type\": \"d3m.container.pandas.DataFrame\"}, \"selector\": []}, {\"metadata\": {\"dimension\": {\"length\": 48, \"name\": \"columns\", \"semantic_types\": [\"https://metadata.datadrivendiscovery.org/types/TabularColumn\"]}}, \"selector\": [\"__ALL_ELEMENTS__\"]}, {\"metadata\": {\"name\": \"FIPS Code\", \"semantic_types\": [\"http://schema.org/Integer\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 0]}, {\"metadata\": {\"name\": \"State\", \"semantic_types\": [\"http://schema.org/Text\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 1]}, {\"metadata\": {\"name\": \"Area name\", \"semantic_types\": [\"http://schema.org/Text\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 2]}, {\"metadata\": {\"name\": \"2003 Rural-urban Continuum Code\", \"semantic_types\": [\"http://schema.org/Integer\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 3]}, {\"metadata\": {\"name\": \"2003 Urban Influence Code\", \"semantic_types\": [\"http://schema.org/Integer\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 4]}, {\"metadata\": {\"name\": \"2013 Rural-urban Continuum Code\", \"semantic_types\": [\"http://schema.org/Integer\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 5]}, {\"metadata\": {\"name\": \"2013 Urban Influence Code\", \"semantic_types\": [\"http://schema.org/Integer\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 6]}, {\"metadata\": {\"name\": \"Less than a high school diploma, 1970\", \"semantic_types\": [\"http://schema.org/Integer\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 7]}, {\"metadata\": {\"name\": \"High school diploma only, 1970\", \"semantic_types\": [\"http://schema.org/Integer\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 8]}, {\"metadata\": {\"name\": \"Some college (1-3 years), 1970\", \"semantic_types\": 
[\"http://schema.org/Integer\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 9]}, {\"metadata\": {\"name\": \"Four years of college or higher, 1970\", \"semantic_types\": [\"http://schema.org/Integer\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 10]}, {\"metadata\": {\"name\": \"Percent of adults with less than a high school diploma, 1970\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 11]}, {\"metadata\": {\"name\": \"Percent of adults with a high school diploma only, 1970\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 12]}, {\"metadata\": {\"name\": \"Percent of adults completing some college (1-3 years), 1970\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 13]}, {\"metadata\": {\"name\": \"Percent of adults completing four years of college or higher, 1970\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 14]}, {\"metadata\": {\"name\": \"Less than a high school diploma, 1980\", \"semantic_types\": [\"http://schema.org/Integer\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 15]}, {\"metadata\": {\"name\": \"High school diploma only, 1980\", \"semantic_types\": [\"http://schema.org/Integer\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 16]}, {\"metadata\": {\"name\": \"Some college (1-3 years), 1980\", \"semantic_types\": [\"http://schema.org/Integer\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 17]}, {\"metadata\": {\"name\": \"Four years of college or higher, 1980\", \"semantic_types\": [\"http://schema.org/Integer\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 18]}, {\"metadata\": {\"name\": \"Percent of adults with less than a high school diploma, 1980\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 19]}, {\"metadata\": {\"name\": \"Percent of adults with a high school diploma only, 1980\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 20]}, {\"metadata\": {\"name\": \"Percent of adults completing some college (1-3 years), 1980\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 21]}, {\"metadata\": {\"name\": \"Percent of adults completing four years of college or higher, 1980\", \"semantic_types\": [\"http://schema.org/Float\", 
\"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 22]}, {\"metadata\": {\"name\": \"Less than a high school diploma, 1990\", \"semantic_types\": [\"http://schema.org/Integer\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 23]}, {\"metadata\": {\"name\": \"High school diploma only, 1990\", \"semantic_types\": [\"http://schema.org/Integer\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 24]}, {\"metadata\": {\"name\": \"Some college or associate's degree, 1990\", \"semantic_types\": [\"http://schema.org/Integer\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 25]}, {\"metadata\": {\"name\": \"Bachelor's degree or higher, 1990\", \"semantic_types\": [\"http://schema.org/Integer\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 26]}, {\"metadata\": {\"name\": \"Percent of adults with less than a high school diploma, 1990\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 27]}, {\"metadata\": {\"name\": \"Percent of adults with a high school diploma only, 1990\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 28]}, {\"metadata\": {\"name\": \"Percent of adults completing some college or associate's degree, 1990\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 29]}, {\"metadata\": {\"name\": \"Percent of adults with a bachelor's degree or higher, 1990\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 30]}, {\"metadata\": {\"name\": \"Less than a high school diploma, 2000\", \"semantic_types\": [\"http://schema.org/Integer\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 31]}, {\"metadata\": {\"name\": \"High school diploma only, 2000\", \"semantic_types\": [\"http://schema.org/Integer\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 32]}, {\"metadata\": {\"name\": \"Some college or associate's degree, 2000\", \"semantic_types\": [\"http://schema.org/Integer\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 33]}, {\"metadata\": {\"name\": \"Bachelor's degree or higher, 2000\", \"semantic_types\": [\"http://schema.org/Integer\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 34]}, {\"metadata\": {\"name\": \"Percent of adults with less than a high school diploma, 2000\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": 
[\"__ALL_ELEMENTS__\", 35]}, {\"metadata\": {\"name\": \"Percent of adults with a high school diploma only, 2000\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 36]}, {\"metadata\": {\"name\": \"Percent of adults completing some college or associate's degree, 2000\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 37]}, {\"metadata\": {\"name\": \"Percent of adults with a bachelor's degree or higher, 2000\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 38]}, {\"metadata\": {\"name\": \"Less than a high school diploma, 2013-17\", \"semantic_types\": [\"http://schema.org/Integer\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 39]}, {\"metadata\": {\"name\": \"High school diploma only, 2013-17\", \"semantic_types\": [\"http://schema.org/Integer\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 40]}, {\"metadata\": {\"name\": \"Some college or associate's degree, 2013-17\", \"semantic_types\": [\"http://schema.org/Integer\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 41]}, {\"metadata\": {\"name\": \"Bachelor's degree or higher, 2013-17\", \"semantic_types\": [\"http://schema.org/Integer\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 42]}, {\"metadata\": {\"name\": \"Percent of adults with less than a high school diploma, 2013-17\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 43]}, {\"metadata\": {\"name\": \"Percent of adults with a high school diploma only, 2013-17\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 44]}, {\"metadata\": {\"name\": \"Percent of adults completing some college or associate's degree, 2013-17\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 45]}, {\"metadata\": {\"name\": \"Percent of adults with a bachelor's degree or higher, 2013-17\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 46]}, {\"metadata\": {\"name\": \"FIPS Code_wikidata\", \"semantic_types\": [\"http://schema.org/Text\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 47]}], \"score\": 0.7053783337684788, \"summary\": {\"Columns\": [\"[0] FIPS Code\", \"[1] State\", \"[2] Area name\", \"[3] 2003 Rural-urban Continuum Code\", \"[4] 2003 Urban Influence Code\", \"[5] 2013 Rural-urban Continuum Code\", \"[6] 2013 Urban Influence Code\", \"[7] Less than a 
high school diploma\", \"[8] 1970\", \"[9] High school diploma only\", \"[10] 1970\", \"[11] Some college (1-3 years)\", \"[12] 1970\", \"[13] Four years of college or higher\", \"[14] 1970\", \"[15] Percent of adults with less than a high school diploma\", \"[16] 1970\", \"[17] Percent of adults with a high school diploma only\", \"[18] 1970\", \"[19] Percent of adults completing some college (1-3 years)\", \"[20] 1970\", \"[21] Percent of adults completing four years of college or higher\", \"[22] 1970\", \"[23] Less than a high school diploma\", \"[24] 1980\", \"[25] High school diploma only\", \"[26] 1980\", \"[27] Some college (1-3 years)\", \"[28] 1980\", \"[29] Four years of college or higher\", \"[30] 1980\", \"[31] Percent of adults with less than a high school diploma\", \"[32] 1980\", \"[33] Percent of adults with a high school diploma only\", \"[34] 1980\", \"[35] Percent of adults completing some college (1-3 years)\", \"[36] 1980\", \"[37] Percent of adults completing four years of college or higher\", \"[38] 1980\", \"[39] Less than a high school diploma\", \"[40] 1990\", \"[41] High school diploma only\", \"[42] 1990\", \"[43] Some college or associate's degree\", \"[44] 1990\", \"[45] Bachelor's degree or higher\", \"[46] 1990\", \"[47] Percent of adults with less than a high school diploma\", \"[48] 1990\", \"[49] Percent of adults with a high school diploma only\", \"[50] 1990\", \"[51] Percent of adults completing some college or associate's degree\", \"[52] 1990\", \"[53] Percent of adults with a bachelor's degree or higher\", \"[54] 1990\", \"[55] Less than a high school diploma\", \"[56] 2000\", \"[57] High school diploma only\", \"[58] 2000\", \"[59] Some college or associate's degree\", \"[60] 2000\", \"[61] Bachelor's degree or higher\", \"[62] 2000\", \"[63] Percent of adults with less than a high school diploma\", \"[64] 2000\", \"[65] Percent of adults with a high school diploma only\", \"[66] 2000\", \"[67] Percent of adults completing some college or associate's degree\", \"[68] 2000\", \"[69] Percent of adults with a bachelor's degree or higher\", \"[70] 2000\", \"[71] Less than a high school diploma\", \"[72] 2013-17\", \"[73] High school diploma only\", \"[74] 2013-17\", \"[75] Some college or associate's degree\", \"[76] 2013-17\", \"[77] Bachelor's degree or higher\", \"[78] 2013-17\", \"[79] Percent of adults with less than a high school diploma\", \"[80] 2013-17\", \"[81] Percent of adults with a high school diploma only\", \"[82] 2013-17\", \"[83] Percent of adults completing some college or associate's degree\", \"[84] 2013-17\", \"[85] Percent of adults with a bachelor's degree or higher\", \"[86] 2013-17\", \"[87] FIPS Code_wikidata\"], \"Datamart ID\": \"Db48a61bd-2b9a-4aad-ac21-755598b3df4d\", \"Recommend Join Columns\": \"FIPS Code_wikidata\", \"Score\": \"0.7053783337684788\", \"URL\": \"https://raw.githubusercontent.com/usc-isi-i2/datamart-userend/master/example_datasets/educate.csv\", \"title\": \"educate csv\"}, \"supplied_id\": \"DA_poverty_estimation_dataset_TRAIN\", \"supplied_resource_id\": \"learningData\"}" + } + } + }, + { + "type":"PRIMITIVE", + "primitive":{ + "id":"4b42ce1e-9b98-4a25-b68e-fad13311eb65", + "version":"0.3.0", + "python_path":"d3m.primitives.data_transformation.dataset_to_dataframe.Common", + "name":"Extract a DataFrame from a Dataset", + "digest":"a141e6821de7ae586968b0986237745a5510850e6940cf946db9d50d3828b030" + }, + "arguments":{ + "inputs":{ + "type":"CONTAINER", + "data":"steps.5.produce" + } + }, + "outputs":[ 
+ { + "id":"produce" + } + ] + }, + { + "type":"PRIMITIVE", + "primitive":{ + "id":"3002bc5b-fa47-4a3d-882e-a8b5f3d756aa", + "version":"0.1.0", + "python_path":"d3m.primitives.data_transformation.remove_semantic_types.DataFrameCommon", + "name":"Remove semantic types from columns", + "digest":"ff48930a123697994f8b606b8a353c7e60aaf21738f4fd1a2611d8d1eb4a349a" + }, + "arguments":{ + "inputs":{ + "type":"CONTAINER", + "data":"steps.6.produce" + } + }, + "outputs":[ + { + "id":"produce" + } + ], + "hyperparams":{ + "semantic_types":{ + "type":"VALUE", + "data":[ + "http://wikidata.org/qnode" + ] + } + } + }, + { + "type":"PRIMITIVE", + "primitive":{ + "id":"4503a4c6-42f7-45a1-a1d4-ed69699cf5e1", + "version":"0.3.0", + "python_path":"d3m.primitives.data_transformation.extract_columns_by_semantic_types.DataFrameCommon", + "name":"Extracts columns by semantic type", + "digest":"2dda31ef0452060bc55070204c5328927e2b2537a03afa6333607e3e4828ab26" + }, + "arguments":{ + "inputs":{ + "type":"CONTAINER", + "data":"steps.7.produce" + } + }, + "outputs":[ + { + "id":"produce" + } + ], + "hyperparams":{ + "semantic_types":{ + "type":"VALUE", + "data":[ + "https://metadata.datadrivendiscovery.org/types/PrimaryKey", + "https://metadata.datadrivendiscovery.org/types/Attribute" + ] + } + } + }, + { + "type":"PRIMITIVE", + "primitive":{ + "id":"b2612849-39e4-33ce-bfda-24f3e2cb1e93", + "version":"1.5.3", + "python_path":"d3m.primitives.schema_discovery.profiler.DSBOX", + "name":"DSBox Profiler", + "digest":"461c2dd5c69c0ca332c8e77b32c40811692a294618433bd12109c4d1449e8760" + }, + "arguments":{ + "inputs":{ + "type":"CONTAINER", + "data":"steps.8.produce" + } + }, + "outputs":[ + { + "id":"produce" + } + ] + }, + { + "type":"PRIMITIVE", + "primitive":{ + "id":"dsbox-cleaning-featurizer", + "version":"1.5.3", + "python_path":"d3m.primitives.data_cleaning.cleaning_featurizer.DSBOX", + "name":"DSBox Cleaning Featurizer", + "digest":"583c65c56ceabe8b3c354fe3723f1c8947c7432b6e4a5900745d515c2f3ffb19" + }, + "arguments":{ + "inputs":{ + "type":"CONTAINER", + "data":"steps.9.produce" + } + }, + "outputs":[ + { + "id":"produce" + } + ] + }, + { + "type":"PRIMITIVE", + "primitive":{ + "id":"18f0bb42-6350-3753-8f2d-d1c3da70f279", + "version":"1.5.3", + "python_path":"d3m.primitives.data_preprocessing.encoder.DSBOX", + "name":"ISI DSBox Data Encoder", + "digest":"bc30aa8ed97ea89b86fb48f49c205caa128ee39956f160ee4d8eef45904483fa" + }, + "arguments":{ + "inputs":{ + "type":"CONTAINER", + "data":"steps.10.produce" + } + }, + "outputs":[ + { + "id":"produce" + } + ] + }, + { + "type":"PRIMITIVE", + "primitive":{ + "id":"0c64ffd6-cb9e-49f0-b7cb-abd70a5a8261", + "version":"1.0.0", + "python_path":"d3m.primitives.feature_construction.corex_text.DSBOX", + "name":"CorexText", + "digest":"cffef017bfa07969f9d364b6d6007c950de486f57a7009a0a5bee8e5b96ee1bf" + }, + "arguments":{ + "inputs":{ + "type":"CONTAINER", + "data":"steps.11.produce" + } + }, + "outputs":[ + { + "id":"produce" + } + ] + }, + { + "type":"PRIMITIVE", + "primitive":{ + "id":"7ddf2fd8-2f7f-4e53-96a7-0d9f5aeecf93", + "version":"1.5.3", + "python_path":"d3m.primitives.data_transformation.to_numeric.DSBOX", + "name":"ISI DSBox To Numeric DataFrame", + "digest":"8bb5adc20e1d62d56b7a441ca26bf6c0ebce4b6e5f634c8ce27e6e005d7ab93e" + }, + "arguments":{ + "inputs":{ + "type":"CONTAINER", + "data":"steps.12.produce" + } + }, + "outputs":[ + { + "id":"produce" + } + ] + }, + { + "type":"PRIMITIVE", + "primitive":{ + "id":"7894b699-61e9-3a50-ac9f-9bc510466667", + "version":"1.5.3", + 
"python_path":"d3m.primitives.data_preprocessing.mean_imputation.DSBOX", + "name":"DSBox Mean Imputer", + "digest":"782ab60db245d2694808db4c90f45c7777a37f2ea0438e7772ea75fe3af5aa61" + }, + "arguments":{ + "inputs":{ + "type":"CONTAINER", + "data":"steps.13.produce" + } + }, + "outputs":[ + { + "id":"produce" + } + ] + }, + { + "type":"PRIMITIVE", + "primitive":{ + "id":"64d2ef5d-b221-3033-8342-76d0293fa99c", + "version":"2019.6.7", + "python_path":"d3m.primitives.data_preprocessing.max_abs_scaler.SKlearn", + "name":"sklearn.preprocessing.data.MaxAbsScaler", + "digest":"b5951147694e69ca1717fe6ea53063408b7db639f0c496f8148c0f3b5f5b25df" + }, + "arguments":{ + "inputs":{ + "type":"CONTAINER", + "data":"steps.14.produce" + } + }, + "outputs":[ + { + "id":"produce" + } + ], + "hyperparams":{ + "use_semantic_types":{ + "type":"VALUE", + "data":true + }, + "return_result":{ + "type":"VALUE", + "data":"new" + }, + "add_index_columns":{ + "type":"VALUE", + "data":true + } + } + }, + { + "type":"PRIMITIVE", + "primitive":{ + "id":"dsbox-featurizer-do-nothing", + "version":"1.5.3", + "python_path":"d3m.primitives.data_preprocessing.do_nothing.DSBOX", + "name":"DSBox do-nothing primitive", + "digest":"8552e4f3793e06a7f5d251a7a9e341d096d47f22e739bebf0227a52257d4c64c" + }, + "arguments":{ + "inputs":{ + "type":"CONTAINER", + "data":"steps.15.produce" + } + }, + "outputs":[ + { + "id":"produce" + } + ] + }, + { + "type":"PRIMITIVE", + "primitive":{ + "id":"4503a4c6-42f7-45a1-a1d4-ed69699cf5e1", + "version":"0.3.0", + "python_path":"d3m.primitives.data_transformation.extract_columns_by_semantic_types.DataFrameCommon", + "name":"Extracts columns by semantic type", + "digest":"2dda31ef0452060bc55070204c5328927e2b2537a03afa6333607e3e4828ab26" + }, + "arguments":{ + "inputs":{ + "type":"CONTAINER", + "data":"steps.6.produce" + } + }, + "outputs":[ + { + "id":"produce" + } + ], + "hyperparams":{ + "semantic_types":{ + "type":"VALUE", + "data":[ + "https://metadata.datadrivendiscovery.org/types/TrueTarget" + ] + } + } + }, + { + "type":"PRIMITIVE", + "primitive":{ + "id":"7ddf2fd8-2f7f-4e53-96a7-0d9f5aeecf93", + "version":"1.5.3", + "python_path":"d3m.primitives.data_transformation.to_numeric.DSBOX", + "name":"ISI DSBox To Numeric DataFrame", + "digest":"8bb5adc20e1d62d56b7a441ca26bf6c0ebce4b6e5f634c8ce27e6e005d7ab93e" + }, + "arguments":{ + "inputs":{ + "type":"CONTAINER", + "data":"steps.17.produce" + } + }, + "outputs":[ + { + "id":"produce" + } + ], + "hyperparams":{ + "drop_non_numeric_columns":{ + "type":"VALUE", + "data":false + } + } + }, + { + "type":"PRIMITIVE", + "primitive":{ + "id":"2a031907-6b2c-3390-b365-921f89c8816a", + "version":"2019.6.7", + "python_path":"d3m.primitives.regression.gradient_boosting.SKlearn", + "name":"sklearn.ensemble.gradient_boosting.GradientBoostingRegressor", + "digest":"4775a335b1b430daf9035b3582948d1fe034f35e6113d5dc5637545506c5f2d6" + }, + "arguments":{ + "inputs":{ + "type":"CONTAINER", + "data":"steps.16.produce" + }, + "outputs":{ + "type":"CONTAINER", + "data":"steps.18.produce" + } + }, + "outputs":[ + { + "id":"produce" + } + ], + "hyperparams":{ + "max_depth":{ + "type":"VALUE", + "data":5 + }, + "n_estimators":{ + "type":"VALUE", + "data":200 + }, + "learning_rate":{ + "type":"VALUE", + "data":0.3 + }, + "min_samples_split":{ + "type":"VALUE", + "data":{ + "case":"int", + "value":2 + } + }, + "min_samples_leaf":{ + "type":"VALUE", + "data":{ + "case":"absolute", + "value":2 + } + }, + "add_index_columns":{ + "type":"VALUE", + "data":true + }, + 
"use_semantic_types":{ + "type":"VALUE", + "data":true + } + } + } + ], + "source":{ + "name":"ISI", + "contact":"mailto:kyao@isi.edu" + }, + "name":"gradient_boosting_regression_template:140580240488488", + "description":"", + "digest":"692396c6739e666de6c30e4dbbdbb6586313d1027db79b72e1c5973b17cc3333" +} \ No newline at end of file diff --git a/examples/search-example-new-v2019.10.10.py b/examples/search-example-new-v2019.10.10.py new file mode 100644 index 0000000..f98954b --- /dev/null +++ b/examples/search-example-new-v2019.10.10.py @@ -0,0 +1,54 @@ +import datamart_nyu +import os +import datamart +import json +from d3m.container.dataset import Dataset, D3MDatasetLoader + +# the first time of the search on a dataset maybe slow, it will be much quicker afterwards + +# load poverty dataset for a example +loader = D3MDatasetLoader() +path = "/Users/minazuki/Desktop/studies/master/2018Summer/data/datasets/seed_datasets_data_augmentation/DA_poverty_estimation/TRAIN/dataset_TRAIN/datasetDoc.json" +json_file = os.path.abspath(path) +all_dataset_uri = 'file://{}'.format(json_file) +input_dataset = loader.load(dataset_uri=all_dataset_uri) + +# currently query_search only support setting up the specific columns that need to be wikified +# this can be useful when find that some columns should not be wikified (e.g.: some index columns) +# For example: +query_example = {'%^$#@wikifier@%^$#':{'FIPS': 'P882', 'State': 'Q35657'}} +meta_to_str = json.dumps(query_example) +query_search = datamart.DatamartQuery(keywords=[meta_to_str], variables=None) +datamart_nyu_url = "http://dsbox01.isi.edu:9000" +datamart_unit = datamart_nyu.RESTDatamart(connection_url=datamart_nyu_url) +search_unit = datamart_unit.search_with_data(query=query_search, supplied_data=input_dataset) +all_results1 = search_unit.get_next_page() + + +if all_results1 is None: + print("No search result returned!") +# print the brief information of the search results +else: + for i, each_search_result in enumerate(all_results1): + each_search_res_json = each_search_result.get_json_metadata() + print("------------ Search result No.{} ------------".format(str(i))) + print(each_search_res_json['augmentation']) + summary = each_search_res_json['summary'].copy() + if "Columns" in summary: + summary.pop("Columns") + print(summary) + print("-"*100) + +""" +# each search result in all_result1 can be treated as one augment candidate, +just need to run search_result.serialize() and use the output as the hyperparam as shown below + +{ + "primitive": "d3m.primitives.data_augmentation.datamart_augmentation.Common", + "hyperparameters": + { + 'system_identifier':["NYU"], + 'search_result':[each_search_result.serialize()], + } +} +""" \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 58301aa..d07cc75 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,7 +4,8 @@ psycopg2-binary>=2.7.5 # psycopg2>=2.7.5 python-dateutil>=2.7.3 rltk==2.0.0a13 -tablextract>=1.2.5 +date-extractor==3.9.1 +tablextract==1.2.5 termcolor>=1.1.0 urllib3>=1.23 wikipedia-API==0.4.4 diff --git a/wikifier/__init__.py b/wikifier/__init__.py index f9e448c..0aa524f 100644 --- a/wikifier/__init__.py +++ b/wikifier/__init__.py @@ -1,2 +1 @@ from wikifier.wikifier import produce -from wikifier.wikifier import save_specific_p_nodes diff --git a/wikifier/find_identity.py b/wikifier/find_identity.py index 0e6c64c..8c9fc9b 100644 --- a/wikifier/find_identity.py +++ b/wikifier/find_identity.py @@ -61,6 +61,7 @@ def get_identifier_3(strings: typing.List[str], 
column_name: str = None, target_ _logger.info("User-defined P node is " + P_predicts[0]) else: P_predicts = [x[0] for x in Counter(P_list).most_common(5)] # [('P932', 8), ('P1566', 6), ('P698', 2)] + # p_config.common_top_k + len(P_blacklist) + 1 for each in P_blacklist: try: @@ -84,12 +85,26 @@ def get_identifier_3(strings: typing.List[str], column_name: str = None, target_ # best_predicts = """ + best_predicts = [] + # for wikifier rest api service + # if p_config.is_top_k: + # if "P882" in P_predicts: + # best_predicts.append("P882") + # for i in range(0, len(P_predicts)): + # if len(best_predicts) >= p_config.common_top_k: + # break + # if P_predicts[i] != "P882": + # best_predicts.append(P_predicts[i]) + # else: if "P882" in P_predicts: best_predicts = ["P882"] else: best_predicts = [P_predicts[0]] _logger.info("The best matching P node is " + best_predicts[0]) + # change to default value + # p_config.common_top_k = 5 + # p_config.is_top_k = False # print('Top 3 possible properties:') # print(P_predicts) diff --git a/wikifier/utils.py b/wikifier/utils.py index a859162..2475831 100644 --- a/wikifier/utils.py +++ b/wikifier/utils.py @@ -1,242 +1,34 @@ -import sys, os -import warnings -from dsbox.datapreprocessing.cleaner.wikifier import WikifierHyperparams ,Wikifier -from d3m.metadata.base import Metadata, DataMetadata, ALL_ELEMENTS -from d3m.container import List -from d3m.base import utils as d3m_utils -from datamart import upload -from datamart.utilities.utils import SEARCH_URL, PRODUCTION_ES_INDEX, TEST_ES_INDEX -import numpy as np +import json +import typing +import os import logging import pandas as pd -import copy -import subprocess -from .metadata_wikipedia import metadata_wikipedia -from elasticsearch import Elasticsearch -import config - -warnings.filterwarnings("ignore") -logger = logging.getLogger(__name__) -level = logging.getLevelName('ERROR') -logger.setLevel(level) -Q_NODE_SEMANTIC_TYPE = "http://wikidata.org/qnode" -WIKIDATA_URL = "https://tools.wmflabs.org/sqid/#/view?id=" -COLOR_BANK = ['#FFB567', '#36DBFF', '#C1FE9B', '#B89E9E', '#F3FF6D'] -ALL_STATES = set(['alabama', 'alaska', 'arizona', 'arkansas', 'california', 'colorado', 'connecticut', 'delaware', 'florida', 'georgia', 'hawaii', 'idaho', 'illinois', 'indiana', 'iowa', 'kansas', 'kentucky', 'louisiana', 'maine', 'maryland', 'massachusetts', 'michigan', 'minnesota', 'mississippi', 'missouri', 'montana', 'nebraska', 'nevada', 'new-hampshire', 'new-jersey', 'new-mexico', 'new-york', 'north-carolina', 'north-dakota', 'ohio', 'oklahoma', 'oregon', 'pennsylvania', 'rhode-island', 'south-carolina', 'south-dakota', 'tennessee', 'texas', 'utah', 'vermont', 'virginia', 'washington', 'west-virginia', 'wisconsin', 'wyoming']) -URL = "http://dsbox02.isi.edu:9200/" # changing this for other queries -INDEX = "datamart_v2" - -class DATAMART_ID: - datamart_id = "" - -class Count: - count = 0 - -# Disable -def blockPrint(): - sys.stdout = open(os.devnull, 'w') - sys.stderr = open(os.devnull, 'w') -# Restore -def enablePrint(): - sys.stdout = sys.__stdout__ - sys.stderr = sys.__stderr__ - -def load_d3m_dataset(input_ds_loc): - # blockPrint() - from d3m.container.dataset import D3MDatasetLoader - loader = D3MDatasetLoader() - all_dataset_uri = 'file://{}'.format(input_ds_loc) - input_ds = loader.load(dataset_uri=all_dataset_uri) - enablePrint() - return input_ds - -def wikifier_for_d3m_all(input_ds): - # blockPrint() - wikifier_hyperparams = WikifierHyperparams.defaults() - - wikifier_hyperparams = 
wikifier_hyperparams.replace({"use_columns":()}) - wikifier_primitive = Wikifier(hyperparams = wikifier_hyperparams) - output_ds = wikifier_primitive.produce(inputs = input_ds) - enablePrint() - return output_ds - -def wikifier_for_d3m_fips(input_ds): - # blockPrint() - wikifier_hyperparams = WikifierHyperparams.defaults() - qnodes = List(["P882"]) - wikifier_hyperparams = wikifier_hyperparams.replace({"use_columns":(1,), "specific_q_nodes":qnodes}) - wikifier_primitive = Wikifier(hyperparams = wikifier_hyperparams) - output_ds = wikifier_primitive.produce(inputs = input_ds) - enablePrint() - return output_ds - -def print_search_results(search_result): - out_df = pd.DataFrame() - for each in search_result: - out_df = out_df.append(each.display(), ignore_index=True) - return out_df - -def make_clickable_both(val): - if '#' not in val: - return val - name, url = val.split('#', 1) - return f'{name}' - -def highlight_cols0(s): - color = COLOR_BANK[0] - return 'background-color: %s' % color - -def highlight_cols1(s): - color = COLOR_BANK[1] - return 'background-color: %s' % color - -def highlight_cols2(s): - color = COLOR_BANK[2] - return 'background-color: %s' % color - -def highlight_cols3(s): - color = COLOR_BANK[3] - return 'background-color: %s' % color - -def highlight_cols4(s): - color = COLOR_BANK[4] - return 'background-color: %s' % color - -def color_ending_with_wikidata(column_names): - color_column_list = [] - for each in column_names: - if "_wikidata" in each: - color_column_list.append(each) - return color_column_list - -def color_after_q_nodes(column_names): - color_column_list = [] - afterwards_columns = [] - previous_is_wikidata = False - for each in column_names: - if previous_is_wikidata and "_wikidata" not in each: - afterwards_columns.append(each) - - if "_wikidata" in each: - previous_is_wikidata = True - color_column_list.append(each) - return color_column_list, afterwards_columns - -def pretty_print(input_ds, ds_type="", display_length=10): - output_ds = copy.deepcopy(input_ds) - res_id, inputs_df = d3m_utils.get_tabular_resource(dataset=output_ds, resource_id=None) - - can_mark_dict = {} - can_mark_from = {} - can_mark_list = [] - each_column = list(inputs_df.columns) - if ds_type=="download": - inputs_df = inputs_df.sort_values(by=['joining_pairs'],ascending=False) - color_column_list, afterwards_columns = color_after_q_nodes(each_column) - if ds_type=="wikifier": - color_column_list = color_ending_with_wikidata(each_column) - elif ds_type=="wiki_augment": - color_column_list, afterwards_columns = color_after_q_nodes(each_column) - else: - color_column_list = [] - - for i in range(inputs_df.shape[1]): - selector = (res_id,ALL_ELEMENTS, i) - meta = output_ds.metadata.query(selector) - if "semantic_types" in meta and Q_NODE_SEMANTIC_TYPE in meta["semantic_types"]: - can_mark_dict[each_column[i]] = make_clickable_both - can_mark_list.append(meta["name"]) - temp = meta["name"].split("_wikidata")[0] - can_mark_from[each_column[i]] = temp - can_mark_dict[temp] = make_clickable_both - - for idx, rows in inputs_df.iterrows(): - for each in can_mark_list: - if rows[each] is not np.nan: - rows[can_mark_from[each]] = str(rows[can_mark_from[each]]) + '#'+ WIKIDATA_URL + str(rows[each]) - rows[each] = str(rows[each]) + '#' + WIKIDATA_URL + str(rows[each]) - inputs_df.at[idx,each] = rows[each] - else: - inputs_df.at[idx,each] = str(rows[each]) - - res = inputs_df.iloc[:display_length,:] - res = res.style.format(can_mark_dict).applymap(highlight_cols0, subset=pd.IndexSlice[:, 
color_column_list]) - if ds_type=="wiki_augment" or ds_type=="download": - res = res.applymap(highlight_cols1, subset=pd.IndexSlice[:, afterwards_columns]) - return res - -def download_FBI_data(states=None, python_path=config.python_path, pypath=config.pypath): - if states is None: - states = ALL_STATES - for each_state in states: - each_state = each_state.lower() - command_download = python_path + " " + config.pypath +"wikidata-wikifier/wikifier/wikidata/FBI_Crime_Model.py download " + each_state - p = subprocess.Popen(command_download, stdout=subprocess.PIPE, shell=True, stderr=subprocess.STDOUT) - while p.poll() == None: - out = p.stdout.readline().strip() - if out: - print (bytes.decode(out)) - -def generate_FBI_data(states=None, python_path=config.python_path, pypath=config.pypath): - if states is None: - states = ALL_STATES - for each_state in states: - each_state = each_state.lower() - command_generate = python_path + " " + pypath +"wikidata-wikifier/wikifier/wikidata/FBI_Crime_Model.py generate " + each_state - p = subprocess.Popen(command_generate, stdout=subprocess.PIPE, shell=True, stderr=subprocess.STDOUT) - while p.poll() == None: - out = p.stdout.readline().strip() - if out: - print (bytes.decode(out)) - -def upload_FBI_data(states=None, mode=""): - if states is None: - states = ALL_STATES - - if mode == "test": - endpoint = config.endpoint_upload_test - else: - endpoint = config.endpoint_upload_main - - for each_state in states: - each_state = each_state.lower() - command_add = config.python_path + config.config_upload + endpoint + "--user " + config.user + " --passwd " + config.password + " -f " + each_state + ".ttl" - command_update_truthy = config.python_path + config.config_update_truthy + endpoint + "--user " + config.user + " --passwd " + config.password + " -f changes_" + each_state + ".tsv" - - p = subprocess.Popen(command_add, stdout=subprocess.PIPE, shell=True, stderr=subprocess.STDOUT) - while p.poll() == None: - out = p.stdout.readline().strip() - if out: - print (bytes.decode(out)) - - p = subprocess.Popen(command_update_truthy, stdout=subprocess.PIPE, shell=True, stderr=subprocess.STDOUT) - while p.poll() == None: - out = p.stdout.readline().strip() - if out: - print (bytes.decode(out)) - -def clean_FBI_data(mode=""): - if mode == "test": - endpoint = config.endpoint_upload_test - else: - endpoint = config.endpoint_upload_main - command_clean = python_path + config_clean_up + endpoint - subprocess.call(command_clean, stdout=subprocess.PIPE, shell=True, stderr=subprocess.STDOUT) - -def upload_wikipedia_data(): - all_metadata = metadata_wikipedia - succeed = upload(meta_list=[all_metadata], es_index=PRODUCTION_ES_INDEX, deduplicate=False) - DATAMART_ID.datamart_id = succeed[0]['datamart_id'] - -def delete_by_IDS(selector, es, index=INDEX): - for v in selector: - try: - es.delete(index=index, doc_type = "_doc", id = v) - except: - print("failed to delete", v) - -def remove_wikipedia_data(ids): - if ids is None: - ids = [DATAMART_ID.datamart_id] - es = Elasticsearch([URL]) - delete_by_IDS(selector=ids,es = es, index=INDEX) \ No newline at end of file +from datamart_isi import config +from pandas.util import hash_pandas_object +_logger = logging.getLogger(__name__) + + +def check_wikifier_choice(input_dataframe: pd.DataFrame) -> typing.Union[bool, None]: + """ + Function used to find the recorded choice for a given dataset, if no record found, will return False + :param input_dataframe: the input supplied dataframe + :return: a bool, True means it need to be 
wikified, False means it does not + """ + try: + hash_input_data = str(hash_pandas_object(input_dataframe).sum()) + file_loc = os.path.join(config.cache_file_storage_base_loc, "other_cache", "wikifier_choice.json") + with open(file_loc, 'r') as f: + wikifier_choices = json.load(f) + + # if an existing choice is found, use it + if hash_input_data in wikifier_choices.keys(): + wikifier_choice = wikifier_choices[hash_input_data] + return wikifier_choice + else: + _logger.error("No choice record found for dataset " + hash_input_data) + return None + + except Exception as e: + _logger.error("Check wikifier choice failed!") + _logger.debug(e, exc_info=True) + return None diff --git a/wikifier/wikifier.py b/wikifier/wikifier.py index 89d7e12..25460ac 100644 --- a/wikifier/wikifier.py +++ b/wikifier/wikifier.py @@ -1,9 +1,7 @@ -import os import typing import numpy as np import copy import math -import hashlib import json import logging import requests @@ -12,28 +10,24 @@ import csv from .find_identity import FindIdentity from collections import Counter -# from datamart_isi import config +from datamart_isi.cache.metadata_cache import MetadataCache from datamart_isi.cache.general_search_cache import GeneralSearchCache from datamart_isi.utilities import connection +from .utils import check_wikifier_choice -# DEFAULT_DATAMART_URL = config.default_datamart_url -# CACHE_MANAGER = GeneralSearchCache(connection_url= os.getenv('DATAMART_URL_NYU', DEFAULT_DATAMART_URL)) CACHE_MANAGER = GeneralSearchCache() - -try: - from datamart_isi.config import default_temp_path - - DEFAULT_TEMP_PATH = default_temp_path -except: - DEFAULT_TEMP_PATH = "/tmp" - _logger = logging.getLogger(__name__) -# NEW_WIKIFIER_SERVER = config.new_wikifier_server NEW_WIKIFIER_SERVER = connection.get_wikifier_knowledge_graph_server_url() def produce(inputs, target_columns: typing.List[int] = None, target_p_nodes: typing.List[str] = None, input_type: str = "pandas", wikifier_choice: typing.List[str] = None, threshold: float = 0.7, use_cache=True): + if target_columns is None and target_p_nodes is None and wikifier_choice is None and use_cache: + do_wikifier = check_wikifier_choice(inputs) + if do_wikifier is False: + _logger.info("Detected a dataset which should not be wikified!") + return inputs + if input_type == "pandas": # use general search cache system to cache the wikifier results produce_config = {"target_columns": target_columns, "target_p_nodes": target_p_nodes, @@ -41,25 +35,36 @@ def produce(inputs, target_columns: typing.List[int] = None, target_p_nodes: typ if use_cache: cache_key = CACHE_MANAGER.get_hash_key(inputs, json.dumps(produce_config)) + _logger.debug("Current wikification's key is " + cache_key) cache_result = CACHE_MANAGER.get_cache_results(cache_key) + # get the cache for wikified results directly if cache_result is not None: _logger.info("Using caching results for wikifier") return cache_result # END cache part + else: + _logger.debug("Cache not hit.") + else: + _logger.debug("Not using cache for this wikification.") + + column_to_p_node_dict = dict() if wikifier_choice is None: - # FIXME: Currently the new wikifier maybe very slow for large datasets - # return_df = produce_for_pandas(inputs, target_columns, target_p_nodes, threshold) - return_df = produce_by_automatic(inputs, target_columns, target_p_nodes, threshold) + return_df, column_to_p_node_dict_all = produce_by_automatic(inputs, target_columns, target_p_nodes, threshold) + column_to_p_node_dict.update(column_to_p_node_dict_all) elif target_columns is None: if 
wikifier_choice[0] == "identifier": - return_df = produce_for_pandas(inputs, target_columns, target_p_nodes, threshold) + return_df, column_to_p_node_dict_identifier = produce_for_pandas(inputs, target_columns, target_p_nodes, + threshold) + column_to_p_node_dict.update(column_to_p_node_dict_identifier) elif wikifier_choice[0] == "new_wikifier": - return_df = produce_by_new_wikifier(inputs, target_columns, target_p_nodes, threshold) + return_df, column_to_p_node_dict_new_wikifier = produce_by_new_wikifier(inputs, target_columns, target_p_nodes, + threshold) + column_to_p_node_dict.update(column_to_p_node_dict_new_wikifier) else: - return_df = produce_by_automatic(inputs, target_columns, target_p_nodes, threshold) + return_df, column_to_p_node_dict_all = produce_by_automatic(inputs, target_columns, target_p_nodes, threshold) + column_to_p_node_dict.update(column_to_p_node_dict_all) else: - col_name = inputs.columns.tolist() col_identifier, col_new_wikifier, col_auto = [], [], [] for i in range(len(wikifier_choice)): if wikifier_choice[i] == "identifier": @@ -68,32 +73,32 @@ def produce(inputs, target_columns: typing.List[int] = None, target_p_nodes: typ col_new_wikifier.append(target_columns[i]) else: col_auto.append(target_columns[i]) - col_res = list(set([i for i in range(len(col_name))]).difference(set(col_new_wikifier + col_identifier + col_auto))) - return_df = copy.deepcopy(inputs.iloc[:, col_res]) + return_df = copy.deepcopy(inputs) if col_identifier: - return_df_identifier = produce_for_pandas(inputs.iloc[:, col_identifier], - [i for i in range(len(col_identifier))], - target_p_nodes, threshold) - # change the column name index - col_tmp = return_df_identifier.columns.tolist() - col_name.extend(list(set(col_tmp).difference(set(col_name).intersection(set(col_tmp))))) - return_df = pd.concat([return_df, return_df_identifier], axis=1) + return_df_identifier, column_to_p_node_dict_identifier = produce_for_pandas(inputs.iloc[:, col_identifier], + [i for i in + range(len(col_identifier))], + target_p_nodes, threshold) + column_to_p_node_dict.update(column_to_p_node_dict_identifier) + return_df = pd.concat([return_df, return_df_identifier.iloc[:, len(col_identifier):]], axis=1) if col_new_wikifier: - return_df_new = produce_by_new_wikifier(inputs.iloc[:, col_new_wikifier], - [i for i in range(len(col_new_wikifier))], - target_p_nodes, threshold) - col_tmp = return_df_new.columns.tolist() - col_name.extend(list(set(col_tmp).difference(set(col_name).intersection(set(col_tmp))))) - return_df = pd.concat([return_df, return_df_new], axis=1) + return_df_new, column_to_p_node_dict_new_wikifier = produce_by_new_wikifier(inputs.iloc[:, col_new_wikifier], + [i for i in + range(len(col_new_wikifier))], + target_p_nodes, threshold) + column_to_p_node_dict.update(column_to_p_node_dict_new_wikifier) + return_df = pd.concat([return_df, return_df_new.iloc[:, len(col_new_wikifier):]], axis=1) if col_auto: - return_df_auto = produce_by_automatic(inputs.iloc[:, col_auto], [i for i in range(len(col_auto))], - target_p_nodes, threshold) - col_tmp = return_df_auto.columns.tolist() - col_name.extend(list(set(col_tmp).difference(set(col_name).intersection(set(col_tmp))))) - return_df = pd.concat([return_df, return_df_auto], axis=1) + return_df_auto, column_to_p_node_dict_all = produce_by_automatic(inputs.iloc[:, col_auto], + [i for i in range(len(col_auto))], + target_p_nodes, threshold) + column_to_p_node_dict.update(column_to_p_node_dict_all) + return_df = pd.concat([return_df, return_df_auto.iloc[:, 
len(col_auto):]], axis=1) - return_df = return_df[col_name] + # record the full dataset's column-to-P-node mapping here + if column_to_p_node_dict: + MetadataCache.save_specific_wikifier_targets(inputs, column_to_p_node_dict) if use_cache: # push to cache system @@ -120,13 +125,13 @@ def produce(inputs, target_columns: typing.List[int] = None, target_p_nodes: typ def all_in_range_0_to_100(inputs): min_val = min(inputs) max_val = max(inputs) - if min_val <= 100 and min_val >= 0 and max_val >= 0 and max_val <= 100: + if 100 >= min_val >= 0 and 0 <= max_val <= 100: return True else: return False -def are_almost_continues_numbers(inputs, threshold=0.7): +def are_almost_continues_numbers(inputs, threshold): min_val = min(inputs) max_val = max(inputs) if (max_val - min_val) * threshold <= len(inputs): @@ -142,12 +147,12 @@ def one_character_alphabet(inputs): def produce_for_pandas(input_df, target_columns: typing.List[int] = None, target_p_nodes: dict = None, - threshold_for_coverage=0.7): + threshold_for_coverage=0.7) -> typing.Tuple[pd.DataFrame, dict]: """ function used to produce for input type is pandas.dataFrame :param input_df: input pd.dataFrame :param target_columns: target columns to find with wikidata - :param target_p_nodes: user-speicified P node want to get, can be None if want automatic search + :param target_p_nodes: user-specified P nodes to use; can be None for automatic search :param threshold_for_coverage: minimum coverage ratio for a wikidata columns to be appended :return: a pd.dataFrame with updated columns from wikidata """ @@ -164,8 +169,8 @@ def produce_for_pandas(input_df, target_columns: typing.List[int] = None, target _logger.debug('Current column: ' + current_column_name) try: temp = set() - for each in input_df.iloc[:, column].dropna().tolist(): - temp.add(int(each)) + for each in input_df.iloc[:, column].dropna(): + temp.add(float(each)) if all_in_range_0_to_100(temp) or are_almost_continues_numbers(temp, threshold_for_coverage): _logger.debug("Column with all numerical values and useless detected, skipped") continue @@ -218,30 +223,29 @@ def produce_for_pandas(input_df, target_columns: typing.List[int] = None, target " which is less than threshold " + str(threshold_for_coverage)) continue column_to_p_node_dict[current_column_name] = res[0] - col_name = current_column_name + '_wikidata' + col_name = current_column_name + '_wikidata_' + str(idx) return_df[col_name] = new_col break if column_to_p_node_dict: - save_specific_p_nodes(input_df, column_to_p_node_dict) - return return_df + MetadataCache.save_specific_wikifier_targets(input_df, column_to_p_node_dict) + + return return_df, column_to_p_node_dict def produce_by_new_wikifier(input_df, target_columns=None, target_p_nodes: dict = None, - threshold_for_coverage=0.7) -> pd.DataFrame: + threshold_for_coverage=0.7) -> typing.Tuple[pd.DataFrame, dict]: """ The function used to call new wikifier service :param input_df: a dataframe(both d3m or pandas are acceptable) - :param target_columns: typing.List[int] indicates the column numbers of the columns need to be wikified - :param threshold_for_coverage: the minimum coverage of Q nodes for the column, - if the appeared times are lower than threshold, we will not use it :return: a dataframe with wikifiered columns """ - _logger.debug("Start running new wikifier") + _logger.info("Start running new wikifier") if target_columns is None: target_columns = list(range(input_df.shape[1])) - col_names = [] + column_to_p_node_dict = dict() + for column in target_columns: 
current_column_name = input_df.columns[column] # for special case that if a column has only one character for each row, we should skip it @@ -253,7 +257,7 @@ def produce_by_new_wikifier(input_df, target_columns=None, target_p_nodes: dict col_names.append(current_column_name) if not col_names: - return input_df + return input_df, column_to_p_node_dict _logger.debug("Following {} columns will be send to new wikifier:".format(str(len(col_names)))) _logger.debug(str(col_names)) @@ -291,23 +295,24 @@ def produce_by_new_wikifier(input_df, target_columns=None, target_p_nodes: dict return_df = pd.DataFrame(data[1:], columns=data[0]) col_name = return_df.columns.tolist() for cn in col_name: - if "_WK" in cn: + if "_QNodes" in cn: new_name = cn.split('_')[0] + "_wikidata" return_df.rename(columns={cn: new_name}, inplace=True) _logger.debug("Get data from the new wikifier successfully.") if column_to_p_node_dict: - _logger.debug("For each column, the best matching class is:" + str(column_to_p_node_dict)) - save_specific_p_nodes(input_df, column_to_p_node_dict) + _logger.info("For each column, the best matching class is:" + str(column_to_p_node_dict)) + MetadataCache.save_specific_wikifier_targets(input_df, column_to_p_node_dict) else: _logger.error('Something wrong in new wikifier server with response code: ' + response.text) _logger.debug("Wikifier_choice will change to identifier") - return_df = produce_for_pandas(input_df=input_df, target_columns=target_columns, threshold_for_coverage=0.7) - - return return_df + return_df, column_to_p_node_dict_identifier = produce_for_pandas(input_df=input_df, target_columns=target_columns, + threshold_for_coverage=0.7) + column_to_p_node_dict.update(column_to_p_node_dict_identifier) + return return_df, column_to_p_node_dict -def produce_by_automatic(input_df, target_columns=None, target_p_nodes=None, - threshold_for_coverage=0.7) -> pd.DataFrame: +def produce_by_automatic(input_df, target_columns=None, target_p_nodes=None, threshold_for_coverage=0.7) \ + -> typing.Tuple[pd.DataFrame, dict]: """ The function used to call new wikifier service :param input_df: a dataframe(both d3m or pandas are acceptable) @@ -318,75 +323,59 @@ def produce_by_automatic(input_df, target_columns=None, target_p_nodes=None, :return: a dataframe with wikifiered columns """ _logger.debug("Start running automatic wikifier") + column_to_p_node_dict = dict() if target_columns is None: target_columns = list(range(input_df.shape[1])) - col_new_wikifier, col_identifier = [], [] for column in target_columns: current_column_name = input_df.columns[column] - _logger.debug('Current column: ' + current_column_name) + # _logger.debug('Current column: ' + current_column_name) if target_p_nodes is not None and current_column_name in target_p_nodes.keys(): if "Q" in target_p_nodes[current_column_name]: col_new_wikifier.append(column) - _logger.debug(current_column_name + ' is text column, will choose new wikifier') + # _logger.debug(current_column_name + ' is text column, will choose new wikifier') elif "P" in target_p_nodes[current_column_name]: col_identifier.append(column) - _logger.debug(current_column_name + ' is numeric column, will choose identifier') + # _logger.debug(current_column_name + ' is numeric column, will choose identifier') else: try: if input_df.iloc[:, column].astype(float).dtypes == "float64" or input_df.iloc[:, column].astype( int).dtypes == "int64": - _logger.debug(current_column_name + ' is numeric column, will choose identifier') + # _logger.debug(current_column_name + ' is 
numeric column, will choose identifier') col_identifier.append(column) except: - _logger.debug(current_column_name + ' is text column, will choose new wikifier') + # _logger.debug(current_column_name + ' is text column, will choose new wikifier') col_new_wikifier.append(column) - col_name = input_df.columns.tolist() + if col_identifier: + _logger.info("Following columns will be sent to identifier:") + _logger.info(str(col_identifier)) + if col_new_wikifier: + _logger.info("Following columns will be sent to new wikifier:") + _logger.info(str(col_new_wikifier)) + if not col_new_wikifier and not col_identifier: + _logger.info("No column will be wikified!") + return_df_identifier = input_df.iloc[:, col_identifier] return_df_new = input_df.iloc[:, col_new_wikifier] - col_res = list(set([i for i in range(len(col_name))]).difference(set(col_new_wikifier + col_identifier))) - return_df = copy.deepcopy(input_df.iloc[:, col_res]) + return_df = copy.deepcopy(input_df) if col_identifier: - return_df_identifier = produce_for_pandas(return_df_identifier, [i for i in range(len(col_identifier))], - target_p_nodes, threshold_for_coverage) - col_tmp = return_df_identifier.columns.tolist() - # change the column name index - col_name.extend(list(set(col_tmp).difference(set(col_name).intersection(set(col_tmp))))) + return_df_identifier, column_to_p_node_dict_identifier = produce_for_pandas(return_df_identifier, + [i for i in range(len(col_identifier))], + target_p_nodes, threshold_for_coverage) + column_to_p_node_dict.update(column_to_p_node_dict_identifier) + if col_new_wikifier: - return_df_new = produce_by_new_wikifier(return_df_new, [i for i in range(len(col_new_wikifier))], - target_p_nodes, threshold_for_coverage) - col_tmp = return_df_new.columns.tolist() - col_name.extend(list(set(col_tmp).difference(set(col_name).intersection(set(col_tmp))))) - return_df = pd.concat([return_df, return_df_identifier, return_df_new], axis=1) + return_df_new, column_to_p_node_dict_new_wikifier = produce_by_new_wikifier(return_df_new, + [i for i in range(len(col_new_wikifier))], + target_p_nodes, threshold_for_coverage) + column_to_p_node_dict.update(column_to_p_node_dict_new_wikifier) + return_df = pd.concat([return_df, return_df_identifier.iloc[:, len(col_identifier):], return_df_new.iloc[:, len(col_new_wikifier):]], axis=1) - return return_df[col_name] + return return_df, column_to_p_node_dict def coverage(column): count_stats = Counter(column) return (len(column) - count_stats['']) / len(column) - - -def save_specific_p_nodes(original_dataframe, column_to_p_node_dict) -> bool: - try: - original_columns_list = original_dataframe.columns.tolist() - original_columns_list.sort() - hash_generator = hashlib.md5() - - hash_generator.update(str(original_columns_list).encode('utf-8')) - hash_key = str(hash_generator.hexdigest()) - temp_path = os.getenv('D3MLOCALDIR', DEFAULT_TEMP_PATH) - specific_q_nodes_file = os.path.join(temp_path, hash_key + "_column_to_P_nodes") - if os.path.exists(specific_q_nodes_file): - _logger.warning("The specific p nodes file already exist! Will replace the old one!") - - with open(specific_q_nodes_file, 'w') as f: - json.dump(column_to_p_node_dict, f) - - return True - - except Exception as e: - _logger.debug(e, exc_info=True) - return False
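For reference, here is a minimal sketch of how the wikifier-choice cache consulted by check_wikifier_choice() above could be seeded. It assumes only what this diff shows: choices live in <cache_file_storage_base_loc>/other_cache/wikifier_choice.json as a {dataframe_hash: bool} mapping keyed by str(hash_pandas_object(df).sum()). The helper name record_wikifier_choice is hypothetical and not part of this changeset.

import json
import os

import pandas as pd
from pandas.util import hash_pandas_object

from datamart_isi import config


def record_wikifier_choice(input_dataframe: pd.DataFrame, do_wikifier: bool) -> None:
    # same hashing scheme used by check_wikifier_choice()
    hash_input_data = str(hash_pandas_object(input_dataframe).sum())
    storage_dir = os.path.join(config.cache_file_storage_base_loc, "other_cache")
    os.makedirs(storage_dir, exist_ok=True)
    file_loc = os.path.join(storage_dir, "wikifier_choice.json")

    # load existing records, if any, then add or overwrite the entry for this dataset
    wikifier_choices = {}
    if os.path.exists(file_loc):
        with open(file_loc, 'r') as f:
            wikifier_choices = json.load(f)
    wikifier_choices[hash_input_data] = do_wikifier

    with open(file_loc, 'w') as f:
        json.dump(wikifier_choices, f)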
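Similarly, a hedged usage sketch of the updated produce() entry point in wikifier/wikifier.py, based only on the signature and the wikifier_choice values ("identifier", "new_wikifier", automatic fallback) visible in this diff; the DataFrame below is made up for illustration, and the calls require the wikifier and Datamart services to be reachable.

import pandas as pd

from wikifier import produce

# hypothetical input; any pandas DataFrame works with the default input_type="pandas"
df = pd.DataFrame({"FIPS": ["1001", "1003"], "State": ["AL", "AL"]})

# default call: consults the recorded wikifier choice and the general search cache first
wikified_df = produce(df)

# force a fresh run and pin which wikifier handles each target column
wikified_fresh = produce(df,
                         target_columns=[0, 1],
                         wikifier_choice=["identifier", "new_wikifier"],
                         use_cache=False)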