-
Notifications
You must be signed in to change notification settings - Fork 0
/
regex.c
125 lines (113 loc) · 3.31 KB
/
regex.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
/* The Computer Language Benchmarks Game
* http://benchmarksgame.alioth.debian.org/
** contributed by Mike Pall
**
** regex-dna benchmark using PCRE
**
** compile with:
** gcc -O3 -fomit-frame-pointer -o regexdna regexdna.c -lpcre
*/
#define __USE_STRING_INLINES
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <pcre.h>
typedef struct fbuf {
char *buf;
size_t size, len;
} fbuf_t;
static void fb_init(fbuf_t *b)
{
b->buf = NULL;
b->len = b->size = 0;
}
static char *fb_need(fbuf_t *b, size_t need)
{
need += b->len;
if (need > b->size) {
if (b->size == 0) b->size = need;
else while (need > b->size) b->size += b->size;
if (!(b->buf = realloc(b->buf, b->size))) exit(1);
}
return b->buf+b->len;
}
#define FB_MINREAD (3<<16)
/* Read all of a stdio stream into dst buffer. */
static size_t fb_readall(fbuf_t *dst, FILE *fp)
{
char *dp;
int n;
for (dp = fb_need(dst, FB_MINREAD);
(n = fread(dp, 1, dst->size-dst->len, fp)) > 0;
dp = fb_need(dst, FB_MINREAD)) dst->len += n;
if (ferror(fp)) exit(1);
return dst->len;
}
/* Substitute pattern p with replacement r, copying from src to dst buffer. */
static size_t fb_subst(fbuf_t *dst, fbuf_t *src, const char *p, const char *r)
{
pcre *re;
pcre_extra *re_ex;
const char *re_e;
char *dp;
int re_eo, m[3], pos, rlen, clen;
if (!(re = pcre_compile(p, PCRE_CASELESS, &re_e, &re_eo, NULL))) exit(1);
re_ex = pcre_study(re, 0, &re_e);
for (dst->len = 0, rlen = strlen(r), pos = 0;
pcre_exec(re, re_ex, src->buf, src->len, pos, 0, m, 3) >= 0;
pos = m[1]) {
clen = m[0]-pos;
dp = fb_need(dst, clen+rlen);
dst->len += clen+rlen;
memcpy(dp, src->buf+pos, clen);
memcpy(dp+clen, r, rlen);
}
clen = src->len-pos;
dp = fb_need(dst, clen);
dst->len += clen;
memcpy(dp, src->buf+pos, clen);
return dst->len;
}
/* Count all matches with pattern p in src buffer. */
static int fb_countmatches(fbuf_t *src, const char *p)
{
pcre *re;
pcre_extra *re_ex;
const char *re_e;
int re_eo, m[3], pos, count;
if (!(re = pcre_compile(p, PCRE_CASELESS, &re_e, &re_eo, NULL))) exit(1);
re_ex = pcre_study(re, 0, &re_e);
for (count = 0, pos = 0;
pcre_exec(re, re_ex, src->buf, src->len, pos, 0, m, 3) >= 0;
pos = m[1]) count++;
return count;
}
static const char *variants[] = {
"agggtaaa|tttaccct", "[cgt]gggtaaa|tttaccc[acg]",
"a[act]ggtaaa|tttacc[agt]t", "ag[act]gtaaa|tttac[agt]ct",
"agg[act]taaa|ttta[agt]cct", "aggg[acg]aaa|ttt[cgt]ccct",
"agggt[cgt]aa|tt[acg]accct", "agggta[cgt]a|t[acg]taccct",
"agggtaa[cgt]|[acg]ttaccct", NULL
};
static const char *subst[] = {
"B", "(c|g|t)", "D", "(a|g|t)", "H", "(a|c|t)", "K", "(g|t)",
"M", "(a|c)", "N", "(a|c|g|t)", "R", "(a|g)", "S", "(c|g)",
"V", "(a|c|g)", "W", "(a|t)", "Y", "(c|t)", NULL
};
int main(int argc, char **argv)
{
fbuf_t seq[2];
const char **pp;
size_t ilen, clen, slen;
int flip;
fb_init(&seq[0]);
fb_init(&seq[1]);
ilen = fb_readall(&seq[0], stdin);
clen = fb_subst(&seq[1], &seq[0], ">.*|\n", "");
for (pp = variants; *pp; pp++)
printf("%s %d\n", *pp, fb_countmatches(&seq[1], *pp));
for (slen = 0, flip = 1, pp = subst; *pp; pp += 2, flip = 1-flip)
slen = fb_subst(&seq[1-flip], &seq[flip], *pp, pp[1]);
printf("\n%zu\n%zu\n%zu\n", ilen, clen, slen);
return 0;
}