Merge branch 'gh-pages'

* gh-pages: NCBI importer for recently published whole genomes
giffordlabcvr · Apr 27, 2024 · 35e7471 · 35e7471
2 parents fc39516 + 1c5a8d7
commit 35e7471
Showing 1 changed file with 21 additions and 0 deletions.
diff --git a/modules/build/core/parvoNcbiImporterNewGenomes2020-23.xml b/modules/build/core/parvoNcbiImporterNewGenomes2020-23.xml
@@ -0,0 +1,21 @@
+<!-- Import parvovirus genome sequences from NCBI -->
+<ncbiImporter>
+  <!-- NOTE(review): presumably the name of the sequence field that holds the GenBank GI number;
+       confirm against the project schema -->
+  <giNumberFieldName>gb_gi_number</giNumberFieldName>
+  <!-- Sequence format for the imported records: GenBank XML -->
+  <sequenceFormat>GENBANK_XML</sequenceFormat>
+  <!-- NOTE(review): presumably the GLUE source that receives the imported sequences; confirm -->
+  <sourceName>ncbi-new-genomes</sourceName>
+
+  <!-- The eSearchTerm is a standard NCBI Entrez text query.
+       Alternatively, a list of GI numbers or primary accessions to retrieve can be given instead
+       of eSearchTerm; see ncbiImporterGiNumbers.xml and ncbiImporterPrimaryAccessions.xml for
+       syntax examples.
+       The query below combines three criteria with AND:
+         - organism is "Parvovirus"
+         - sequence length (SLEN) is in the range 2000 to 6000
+         - publication date is in the range 2020/1/1 to 2024/1/1
+       NOTE(review): the file name says 2020-23 but the upper date bound is 2024/1/1; confirm
+       whether records dated 1 January 2024 are meant to be included. -->
+  <eSearchTerm>"Parvovirus"[Organism] AND 2000:6000[SLEN] AND 2020/1/1:2024/1/1[Publication Date]</eSearchTerm>
+  <!-- Field of the GenBank document that is used as the GLUE sequence ID. The options are:
+       GI_NUMBER (default if omitted)
+       PRIMARY_ACCESSION -->
+  <sequenceIdField>PRIMARY_ACCESSION</sequenceIdField>
+  <!-- Maximum number of NCBI records to return. If omitted, defaults to 4000.
+       The value here is far above that default, presumably so the search result is never truncated. -->
+  <eSearchRetMax>1000000</eSearchRetMax>
+  <!-- Retrieval operates in batches to avoid overloading NCBI.
+       This is the number of sequences to retrieve per batch. -->
+  <eFetchBatchSize>800</eFetchBatchSize>
+
+</ncbiImporter>