Skip to content

Commit

Permalink
add trait data
Browse files Browse the repository at this point in the history
  • Loading branch information
tarcisio_adm committed Aug 22, 2024
1 parent 3f3e70b commit 0d6f88e
Show file tree
Hide file tree
Showing 4 changed files with 272 additions and 3 deletions.
2 changes: 1 addition & 1 deletion scripts/sql_insert_emi_data/Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ csvkit = "1.3.0"
pandas = "*"
sqlite3-to-mysql = "*"
networkx = "*"
matchms = "*"
matchms = "0.26.4"
spec2vec = "*"

[dev-packages]
Expand Down
13 changes: 11 additions & 2 deletions scripts/sql_insert_emi_data/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,12 @@
raw_data_root_folder = "data/output/individual_analysis"
# If None or an empty string, this setting is ignored.
open_tree_of_life_dir = None
# If None or an empty string, the trait data is not loaded.
trait_dir = None
ionization_mode = 'pos'
structure_metadata_sqlite_file = "data/structures_metadata.db"
# MySQL table mapping where values are mysql table names corresponding to the dictionary keys
# For trait_data: table names must be exactly the same as the input file names without the file extension
table_canonical_names = {'compound_summary': 'canopus_compound_summary',
'sample_metadata': 'sample_metadata',
'taxon_metadata': 'taxon_metadata',
Expand All @@ -16,12 +19,18 @@
'spec2vec_doc': 'spec2vec_doc',
'structures_metadata': 'structures_metadata',
'opentreeoflife': 'open_tree_life',
'opentreeoflife_synonym': 'open_tree_life_synonym'
'opentreeoflife_synonym': 'open_tree_life_synonym',
'trait_data': { 'traits': 'traits',
'trydbAll': 'trydbAll',
'taxonomy': 'taxonomy',
'enpkg': 'enpkg',
'lotus': 'lotus',
'interactions': 'interactions'}
}

# MySQL db credentials
host = "localhost"
user = "root"
password = "root_mysql"
password = "root"
database = "emi_db"
port = "3306"
13 changes: 13 additions & 0 deletions scripts/sql_insert_emi_data/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,4 +68,17 @@
)
conversion.transfer()

if not (trait_dir is None or trait_dir == ""):
    # Load every configured trait table from trait_dir. The table name doubles
    # as the file-name prefix, so each table is read from "<table_name><ext>".
    for table_name in table_canonical_names['trait_data'].values():
        # These three tables are passed with the ".tsv" extension and a tab
        # delimiter; every other trait table uses ".csv" and a comma.
        if table_name in ["enpkg","trydbAll","interactions"]:
            file_extension, field_delimiter = ".tsv", '\t'
        else:
            file_extension, field_delimiter = ".csv", ','
        SQLDataInsertion.sql_insert_emi_data(
            trait_dir,
            table_name,
            file_extension,
            mydb,
            name_prefix=table_name,
            is_sample_folder=False,
            enclosed_by='"',
            terminated_by=field_delimiter,
        )

# See PyCharm help at https://www.jetbrains.com/help/pycharm/
247 changes: 247 additions & 0 deletions scripts/sql_insert_emi_data/raw_mysql_schema.sql
Original file line number Diff line number Diff line change
Expand Up @@ -251,4 +251,251 @@ CREATE TABLE `open_tree_life_synonym` (
uniqname TEXT,
sourceinfo TEXT );

-- traits: one row per trait, keyed by TraitID.
-- The table is dropped and recreated each time this script runs.
-- NOTE(review): column order presumably has to match the column order of the
-- input file loaded into this table -- confirm against the loader.
DROP TABLE IF EXISTS traits;
CREATE TABLE traits (
    TraitID   VARCHAR(300) PRIMARY KEY,
    TraitName VARCHAR(300),
    TOP_ID    VARCHAR(300)
);

-- trydbAll: trait observation records. No key or index is declared; every
-- value is stored as text (VARCHAR(300) or TEXT).
-- The table is dropped and recreated each time this script runs.
-- NOTE(review): `OriglName` looks like a typo but presumably mirrors the
-- header of the input file -- confirm before renaming.
-- NOTE(review): column order presumably has to match the input file -- confirm.
DROP TABLE IF EXISTS trydbAll;
CREATE TABLE trydbAll (
    -- contributor and dataset
    LastName              VARCHAR(300),
    FirstName             VARCHAR(300),
    DatasetID             VARCHAR(300),
    Dataset               VARCHAR(300),
    -- species
    SpeciesName           VARCHAR(300),
    AccSpeciesID          VARCHAR(300),
    AccSpeciesName        VARCHAR(300),
    -- observation / trait / data identifiers
    ObservationID         VARCHAR(300),
    ObsDataID             VARCHAR(300),
    TraitID               VARCHAR(300),
    TraitName             VARCHAR(300),
    DataID                VARCHAR(300),
    DataName              VARCHAR(300),
    -- original (Orig*) values
    OriglName             TEXT,
    OrigValueStr          VARCHAR(300),
    OrigUnitStr           VARCHAR(300),
    ValueKindName         VARCHAR(300),
    OrigUncertaintyStr    VARCHAR(300),
    UncertaintyName       VARCHAR(300),
    Replicates            VARCHAR(300),
    -- standardised (Std*) values
    StdValue              VARCHAR(300),
    UnitName              VARCHAR(300),
    RelUncertaintyPercent VARCHAR(300),
    OrigObsDataID         VARCHAR(300),
    ErrorRisk             VARCHAR(300),
    -- back-ticked because these names collide with SQL keywords
    `Reference`           TEXT,
    `Comment`             TEXT,
    StdValueStr           TEXT,
    DataType              VARCHAR(300)
);

-- taxonomy: one column per taxonomic identifier/name source, all stored as
-- VARCHAR(300). No primary key or index is declared on this table.
-- The table is dropped and recreated each time this script runs.
-- NOTE(review): column order presumably has to match the input file -- confirm.
DROP TABLE IF EXISTS taxonomy;
CREATE TABLE taxonomy (
    WdID               VARCHAR(300),
    eol                VARCHAR(300),
    gbif_wd            VARCHAR(300),
    ncbi_wd            VARCHAR(300),
    ott                VARCHAR(300),
    inat               VARCHAR(300),
    itis               VARCHAR(300),
    irmng              VARCHAR(300),
    col                VARCHAR(300),
    nbn                VARCHAR(300),
    worms              VARCHAR(300),
    bold               VARCHAR(300),
    plazi              VARCHAR(300),
    apni               VARCHAR(300),
    WdName             VARCHAR(300),
    ncbi_otol          VARCHAR(300),
    gbif_otol          VARCHAR(300),
    TRY_AccSpeciesName VARCHAR(300)
);

-- enpkg: nine VARCHAR(300) columns, no key or index declared.
-- The table is dropped and recreated each time this script runs.
-- NOTE(review): column order presumably has to match the input file -- confirm.
DROP TABLE IF EXISTS enpkg;
CREATE TABLE enpkg (
    raw_material    VARCHAR(300),
    wd_taxon_id     VARCHAR(300),
    submitted_taxon VARCHAR(300),
    inch            VARCHAR(300),
    wd_chem         VARCHAR(300),
    organ           VARCHAR(300),
    broad_organ     VARCHAR(300),
    tissue          VARCHAR(300),
    subsystem       VARCHAR(300)
);

-- lotus: structure / organism / reference records, no key or index declared.
-- The table is dropped and recreated each time this script runs.
--
-- The free-form structure strings (InChI, SMILES, IUPAC and traditional names)
-- are declared TEXT rather than VARCHAR(300): they have no fixed upper length
-- and exceed 300 characters for large molecules. With a VARCHAR(300) column
-- MySQL rejects such a row in strict SQL mode and silently truncates the value
-- otherwise.
-- NOTE(review): column order presumably has to match the input file -- confirm.
DROP TABLE IF EXISTS lotus;
CREATE TABLE lotus (
    -- structure_* columns
    structure_wikidata                           VARCHAR(300),
    structure_inchikey                           VARCHAR(300),
    structure_inchi                              TEXT,
    structure_smiles                             TEXT,
    structure_molecular_formula                  VARCHAR(300),
    structure_exact_mass                         VARCHAR(300),
    structure_xlogp                              VARCHAR(300),
    structure_smiles_2D                          TEXT,
    structure_cid                                VARCHAR(300),
    structure_nameIupac                          TEXT,
    structure_nameTraditional                    TEXT,
    structure_stereocenters_total                VARCHAR(300),
    structure_stereocenters_unspecified          VARCHAR(300),
    structure_taxonomy_npclassifier_01pathway    VARCHAR(300),
    structure_taxonomy_npclassifier_02superclass VARCHAR(300),
    structure_taxonomy_npclassifier_03class      VARCHAR(300),
    structure_taxonomy_classyfire_chemontid      VARCHAR(300),
    structure_taxonomy_classyfire_01kingdom      VARCHAR(300),
    structure_taxonomy_classyfire_02superclass   VARCHAR(300),
    structure_taxonomy_classyfire_03class        VARCHAR(300),
    structure_taxonomy_classyfire_04directparent VARCHAR(300),
    -- organism_* columns
    organism_wikidata                            VARCHAR(300),
    organism_name                                VARCHAR(300),
    organism_taxonomy_gbifid                     VARCHAR(300),
    organism_taxonomy_ncbiid                     VARCHAR(300),
    organism_taxonomy_ottid                      VARCHAR(300),
    organism_taxonomy_01domain                   VARCHAR(300),
    organism_taxonomy_02kingdom                  VARCHAR(300),
    organism_taxonomy_03phylum                   VARCHAR(300),
    organism_taxonomy_04class                    VARCHAR(300),
    organism_taxonomy_05order                    VARCHAR(300),
    organism_taxonomy_06family                   VARCHAR(300),
    organism_taxonomy_07tribe                    VARCHAR(300),
    organism_taxonomy_08genus                    VARCHAR(300),
    organism_taxonomy_09species                  VARCHAR(300),
    organism_taxonomy_10varietas                 VARCHAR(300),
    -- reference_* columns
    reference_wikidata                           VARCHAR(300),
    reference_doi                                VARCHAR(300),
    manual_validation                            VARCHAR(300)
);

-- interactions: interaction records between a "source" and a "target" side.
-- Every column is declared TEXT; no key or index is declared.
-- The table is dropped and recreated each time this script runs.
-- NOTE(review): column order presumably has to match the input file -- confirm.
DROP TABLE IF EXISTS interactions;
CREATE TABLE interactions (
    -- sourceTaxon* columns
    sourceTaxonId                TEXT,
    sourceTaxonName              TEXT,
    sourceTaxonRank              TEXT,
    sourceTaxonPathNames         TEXT,
    sourceTaxonPathIds           TEXT,
    sourceTaxonPathRankNames     TEXT,
    sourceTaxonSpeciesName       TEXT,
    sourceTaxonSpeciesId         TEXT,
    sourceTaxonSubgenusName      TEXT,
    sourceTaxonSubgenusId        TEXT,
    sourceTaxonGenusName         TEXT,
    sourceTaxonGenusId           TEXT,
    sourceTaxonFamilyName        TEXT,
    sourceTaxonFamilyId          TEXT,
    sourceTaxonOrderName         TEXT,
    sourceTaxonOrderId           TEXT,
    sourceTaxonClassName         TEXT,
    sourceTaxonClassId           TEXT,
    sourceTaxonPhylumName        TEXT,
    sourceTaxonPhylumId          TEXT,
    sourceTaxonKingdomName       TEXT,
    sourceTaxonKingdomId         TEXT,
    -- remaining source* record columns
    sourceId                     TEXT,
    sourceOccurrenceId           TEXT,
    sourceInstitutionCode        TEXT,
    sourceCollectionCode         TEXT,
    sourceCatalogNumber          TEXT,
    sourceBasisOfRecordId        TEXT,
    sourceBasisOfRecordName      TEXT,
    sourceLifeStageId            TEXT,
    sourceLifeStageName          TEXT,
    sourceBodyPartId             TEXT,
    sourceBodyPartName           TEXT,
    sourcePhysiologicalStateId   TEXT,
    sourcePhysiologicalStateName TEXT,
    sourceSexId                  TEXT,
    sourceSexName                TEXT,
    -- interactionType* columns
    interactionTypeName          TEXT,
    interactionTypeId            TEXT,
    -- targetTaxon* columns
    targetTaxonId                TEXT,
    targetTaxonName              TEXT,
    targetTaxonRank              TEXT,
    targetTaxonPathNames         TEXT,
    targetTaxonPathIds           TEXT,
    targetTaxonPathRankNames     TEXT,
    targetTaxonSpeciesName       TEXT,
    targetTaxonSpeciesId         TEXT,
    targetTaxonSubgenusName      TEXT,
    targetTaxonSubgenusId        TEXT,
    targetTaxonGenusName         TEXT,
    targetTaxonGenusId           TEXT,
    targetTaxonFamilyName        TEXT,
    targetTaxonFamilyId          TEXT,
    targetTaxonOrderName         TEXT,
    targetTaxonOrderId           TEXT,
    targetTaxonClassName         TEXT,
    targetTaxonClassId           TEXT,
    targetTaxonPhylumName        TEXT,
    targetTaxonPhylumId          TEXT,
    targetTaxonKingdomName       TEXT,
    targetTaxonKingdomId         TEXT,
    -- remaining target* record columns
    targetId                     TEXT,
    targetOccurrenceId           TEXT,
    targetInstitutionCode        TEXT,
    targetCollectionCode         TEXT,
    targetCatalogNumber          TEXT,
    targetBasisOfRecordId        TEXT,
    targetBasisOfRecordName      TEXT,
    targetLifeStageId            TEXT,
    targetLifeStageName          TEXT,
    targetBodyPartId             TEXT,
    targetBodyPartName           TEXT,
    targetPhysiologicalStateId   TEXT,
    targetPhysiologicalStateName TEXT,
    targetSexId                  TEXT,
    targetSexName                TEXT,
    -- location, date and argument type
    decimalLatitude              TEXT,
    decimalLongitude             TEXT,
    localityId                   TEXT,
    localityName                 TEXT,
    eventDate                    TEXT,
    argumentTypeId               TEXT,
    -- reference* columns
    referenceCitation            TEXT,
    referenceDoi                 TEXT,
    referenceUrl                 TEXT,
    -- source citation / archive columns
    sourceCitation               TEXT,
    sourceNamespace              TEXT,
    sourceArchiveURI             TEXT,
    sourceDOI                    TEXT,
    sourceLastSeenAtUnixEpoch    TEXT,
    -- source_* columns
    source_BOLD                  TEXT,
    source_COL                   TEXT,
    source_ENVO                  TEXT,
    source_EOL                   TEXT,
    source_FB                    TEXT,
    source_FBC                   TEXT,
    source_GBIF                  TEXT,
    source_IF                    TEXT,
    source_IRMNG                 TEXT,
    source_ITIS                  TEXT,
    source_NBN                   TEXT,
    source_NCBI                  TEXT,
    source_PBDB                  TEXT,
    source_SLB                   TEXT,
    source_SPECCODE              TEXT,
    source_TAXON                 TEXT,
    source_W                     TEXT,
    source_WD                    TEXT,
    source_WORMS                 TEXT,
    -- target_* columns
    target_BOLD                  TEXT,
    target_COL                   TEXT,
    target_ENVO                  TEXT,
    target_EOL                   TEXT,
    target_FB                    TEXT,
    target_FBC                   TEXT,
    target_GBIF                  TEXT,
    target_IF                    TEXT,
    target_IRMNG                 TEXT,
    target_ITIS                  TEXT,
    target_NBN                   TEXT,
    target_NCBI                  TEXT,
    target_PBDB                  TEXT,
    target_SLB                   TEXT,
    target_SPECCODE              TEXT,
    target_TAXON                 TEXT,
    target_W                     TEXT,
    target_WD                    TEXT,
    target_WORMS                 TEXT
);

-- Turn foreign-key enforcement on for the rest of the session.
-- NOTE(review): presumably pairs with a `SET FOREIGN_KEY_CHECKS = 0;` earlier
-- in this file, outside the visible section -- confirm.
SET FOREIGN_KEY_CHECKS = 1;

0 comments on commit 0d6f88e

Please sign in to comment.