From 97378283a55404b39426515033948fc9ff37547f Mon Sep 17 00:00:00 2001 From: Geo Pertea Date: Fri, 5 Feb 2021 17:33:52 -0500 Subject: [PATCH] GRangeParser updated to support single coordinate --- GBase.cpp | 76 ++++++++++++++++++++++++++++++++----------------------- GBase.h | 3 +-- gff.cpp | 7 +++-- 3 files changed, 51 insertions(+), 35 deletions(-) diff --git a/GBase.cpp b/GBase.cpp index 94ca5a9..f5148e7 100644 --- a/GBase.cpp +++ b/GBase.cpp @@ -254,14 +254,15 @@ FILE* Gfopen(const char *path, char *mode) { GMessage("Error opening file '%s': %s\n", path, strerror(errno)); return f; } - +#define IS_CHR_DELIM(c) ( c == ' ' || c == '\t' || c == ':' ) void GRangeParser::parse(char* s) { - //parses general range format: [[+/-]refID[+/-][ ]:]start[-/..]end[ :][+/-] - // if ref ID has ':' characters a space delimited format is accepted to separate - // the ref ID from the coordinate range: [[+/-]refstart[[-/..][end]] - //the safest way is to parse from the end in case the ref ID has ':' characters + //parses general range format: [+\-|.]refID[+\-\.][ |:][start][-|.\s]end[\s:][+\-\.] 
+ // if ref ID has ':' a space delimited format is preferred + // or just separate the ref ID from the coordinate range: [[+/-]refstart[[-..][end]] + //the safest way would be to parse from the end in case the ref ID has ':' characters + //if the whole chromosome is intended (no coordinates to specify) this->start=0; - this->end=MAX_UINT; + this->end=0; this->strand=0; int slen=strlen(s); if (slen==0) return; @@ -272,49 +273,62 @@ void GRangeParser::parse(char* s) { strand=c; ++s;slen--; } - if (*s==':' || *s==' ') //ignore + if (strand && (*s==':' || *s==' ')) //ignore { s++;slen--; } char* p=s; //parsing position for coordinate string char* isep=strpbrk(s, " \t"); if (isep==NULL) isep=strchr(s, ':'); - if (isep) { //chr (ref) ID was given + if (isep) { //chr (ref) ID ending found p=isep+1; *isep=0; - char c=*(isep-1); - if (strand==0 && (c=='+' || c=='-')) { - //strand given after the ref ID - isep--; - strand=c; - *isep=0; //ref is now parsable + //character after the delimiter can only be a strand if it's followed by another delimiter + //e.g. chr1 + 134551-204326 or chr1:+:134551-204326 + c=*(isep+1); + if (strand==0 && (c=='+' || c=='-' || c=='.') && IS_CHR_DELIM(*(isep+2))) { + strand=c; + p=isep+3; + } + if (strand==0) { + c=*(isep-1); //character before the delimiter could be the strand + if (c=='+' || c=='-') { //not '.', sorry + isep--; + strand=c; + *isep=0; //ref is now parsable + } } this->refName=Gstrdup(s,isep-1); } - c=s[slen-1]; - if (c=='+' || c=='-' || c=='.') { - strand=c; - slen--;s[slen]=0; - } + //here we are after ref ID (and possibly strand) delimiter char* pend=p; - while (isdigit(*pend)) pend++; - c=*pend; - *pend=0; - this->start=atoi(p); - p=pend; - *p=c; - while (*p=='-' || *p=='.' || isspace(*p)) ++p; + if (isdigit(*pend)) { + //parse the start coordinate then + do { pend++; } while (isdigit(*pend)); + c=*pend; + *pend=0; + this->start=atoi(p); + p=pend; + *p=c; + } + while (*p=='-' || *p=='.' 
|| *p==' ' || *p=='\t') ++p; pend=p; while (isdigit(*pend)) pend++; - if (pend>=p) { //parse the 2nd coordinate + if (pend>p) { //parse the 2nd coordinate + c=*pend; *pend=0; this->end=atoi(p); - if (this->end==0) this->end=MAX_UINT; + *pend=c; + } + if (start && end && end<start) Gswap(this->start, this->end); + //if (strand==0) { ? + c=s[slen-1]; //peek at the end of the string for strand + if (c=='+' || c=='-' || c=='.') { + if (end || IS_CHR_DELIM(s[slen-2])) + strand=c; + //slen--;s[slen]=0; } - if (this->end<this->start) Gswap(this->start, this->end); } - - bool GstrEq(const char* a, const char* b) { if (a==NULL || b==NULL) return false; return (strcmp(a, b)==0); diff --git a/GBase.h b/GBase.h index ea399c3..1ea4307 100644 --- a/GBase.h +++ b/GBase.h @@ -431,10 +431,9 @@ struct GSeg { struct GRangeParser: GSeg { char* refName=NULL; - int gseq_id=-1; char strand=0; void parse(char* s); - GRangeParser(char* s=NULL):GSeg(0, INT_MAX) { + GRangeParser(char* s=NULL):GSeg(0, 0) { if (s) parse(s); } ~GRangeParser() { diff --git a/gff.cpp b/gff.cpp index 48d4f7e..df167d2 100644 --- a/gff.cpp +++ b/gff.cpp @@ -32,7 +32,7 @@ void gffnames_unref(GffNames* &n) { } const byte CLASSCODE_OVL_RANK = 14; //rank value just above 'o' class code -//rank value < this means exon overlap +//rank value < this means exon overlap const byte CLASSCODE_J_RANK = 6; // all junctional based overlaps @@ -415,13 +415,16 @@ GffLine::GffLine(GffReader* reader, const char* l): _parents(NULL), _parents_len line[i]=0; t[tidx]=line+i+1; tidx++; - if (tidx>8) break; + //if (tidx>8) break; } i++; } if (tidx<8) { // ignore non-GFF lines return; } + if (tidx>9) { + GMessage("Warning: unexpected tab character in last column, line truncated:\n\%s\n",l); + } gffWarnings=reader->gff_warns; gseqname=t[0]; track=t[1];