2 * Copyright (c) 2004-2005 Sergey Lyubka <valenok@gmail.com>
5 * "THE BEER-WARE LICENSE" (Revision 42):
6 * Sergey Lyubka wrote this file. As long as you retain this notice you
7 * can do whatever you want with this stuff. If we meet some day, and you think
8 * this stuff is worth it, you can buy me a beer in return.
20 enum {END, BRANCH, ANY, EXACT, ANYOF, ANYBUT, OPEN, CLOSE, BOL, EOL,
21 STAR, PLUS, STARQ, PLUSQ, QUEST, SPACE, NONSPACE, DIGIT};
28 {"END", 0, ""}, /* End of code block or program */
29 {"BRANCH", 2, "oo"}, /* Alternative operator, "|" */
30 {"ANY", 0, ""}, /* Match any character, "." */
31 {"EXACT", 2, "d"}, /* Match exact string */
32 {"ANYOF", 2, "D"}, /* Match any from set, "[]" */
33 {"ANYBUT", 2, "D"}, /* Match any but from set, "[^]"*/
34 {"OPEN ", 1, "i"}, /* Capture start, "(" */
35 {"CLOSE", 1, "i"}, /* Capture end, ")" */
36 {"BOL", 0, ""}, /* Beginning of string, "^" */
37 {"EOL", 0, ""}, /* End of string, "$" */
38 {"STAR", 1, "o"}, /* Match zero or more times "*" */
39 {"PLUS", 1, "o"}, /* Match one or more times, "+" */
40 {"STARQ", 1, "o"}, /* Non-greedy STAR, "*?" */
41 {"PLUSQ", 1, "o"}, /* Non-greedy PLUS, "+?" */
42 {"QUEST", 1, "o"}, /* Match zero or one time, "?" */
43 {"SPACE", 0, ""}, /* Match whitespace, "\s" */
44 {"NONSPACE", 0, ""}, /* Match non-space, "\S" */
45 {"DIGIT", 0, ""} /* Match digit, "\d" */
49 * Commands and operands are all unsigned char (1 byte long). All code offsets
50 * are relative to current address, and positive (always point forward). Data
51 * offsets are absolute. Commands with operands:
53 * BRANCH offset1 offset2
54 * Try to match the code block that follows the BRANCH instruction
55 * (code block ends with END). If no match, try to match code block that
56 * starts at offset1. If either of these match, jump to offset2.
58 * EXACT data_offset data_length
59 * Try to match exact string. String is recorded in data section from
60 * data_offset, and has length data_length.
63 * CLOSE capture_number
64 * If the user have passed 'struct cap' array for captures, OPEN
65 * records the beginning of the matched substring (cap->ptr), CLOSE
66 * sets the length (cap->len) for respective capture_number.
71 * *, +, ?, respectively. Try to gobble as much as possible from the
72 * matched buffer, until code block that follows these instructions
73 * matches. When the longest possible string is matched,
76 * STARQ, PLUSQ are non-greedy versions of STAR and PLUS.
79 static const char *meta_chars = "|.^$*+?()[\\";
82 print_character_set(FILE *fp, const unsigned char *p, int len)
86 for (i = 0; i < len; i++) {
88 (void) fputc(',', fp);
92 (void) fprintf(fp, "\\x%02x", p[i]);
94 (void) fprintf(fp, "%s", opcodes[p[i]].name);
95 } else if (isprint(p[i])) {
96 (void) fputc(p[i], fp);
98 (void) fprintf(fp,"\\x%02x", p[i]);
104 slre_dump(const struct slre *r, FILE *fp)
106 int i, j, ch, op, pc;
108 for (pc = 0; pc < r->code_size; pc++) {
111 (void) fprintf(fp, "%3d %s ", pc, opcodes[op].name);
113 for (i = 0; opcodes[op].flags[i] != '\0'; i++)
114 switch (opcodes[op].flags[i]) {
116 (void) fprintf(fp, "%d ", r->code[pc + 1]);
120 (void) fprintf(fp, "%d ",
121 pc + r->code[pc + 1] - i);
125 print_character_set(fp, r->data +
126 r->code[pc + 1], r->code[pc + 2]);
130 (void) fputc('"', fp);
131 for (j = 0; j < r->code[pc + 2]; j++) {
132 ch = r->data[r->code[pc + 1] + j];
134 (void) fputc(ch, fp);
136 (void) fprintf(fp,"\\x%02x",ch);
138 (void) fputc('"', fp);
143 (void) fputc('\n', fp);
148 set_jump_offset(struct slre *r, int pc, int offset)
150 assert(offset < r->code_size);
152 if (r->code_size - offset > 0xff) {
153 r->err_str = "Jump offset is too big";
155 r->code[pc] = (unsigned char) (r->code_size - offset);
160 emit(struct slre *r, int code)
162 if (r->code_size >= (int) (sizeof(r->code) / sizeof(r->code[0])))
163 r->err_str = "RE is too long (code overflow)";
165 r->code[r->code_size++] = (unsigned char) code;
169 store_char_in_data(struct slre *r, int ch)
171 if (r->data_size >= (int) sizeof(r->data))
172 r->err_str = "RE is too long (data overflow)";
174 r->data[r->data_size++] = (unsigned char)ch;
178 exact(struct slre *r, const char **re)
180 int old_data_size = r->data_size;
182 while (**re != '\0' && (strchr(meta_chars, **re)) == NULL)
183 store_char_in_data(r, *(*re)++);
186 emit(r, old_data_size);
187 emit(r, r->data_size - old_data_size);
191 get_escape_char(const char **re)
196 case 'n': res = '\n'; break;
197 case 'r': res = '\r'; break;
198 case 't': res = '\t'; break;
199 case '0': res = 0; break;
200 case 'S': res = NONSPACE << 8; break;
201 case 's': res = SPACE << 8; break;
202 case 'd': res = DIGIT << 8; break;
203 default: res = (*re)[-1]; break;
210 anyof(struct slre *r, const char **re)
212 int esc, old_data_size = r->data_size, op = ANYOF;
224 emit(r, old_data_size);
225 emit(r, r->data_size - old_data_size);
230 esc = get_escape_char(re);
231 if ((esc & 0xff) == 0) {
232 store_char_in_data(r, 0);
233 store_char_in_data(r, esc >> 8);
235 store_char_in_data(r, esc);
239 store_char_in_data(r, (*re)[-1]);
243 r->err_str = "No closing ']' bracket";
247 relocate(struct slre *r, int begin, int shift)
250 memmove(r->code + begin + shift, r->code + begin, r->code_size - begin);
251 r->code_size += shift;
255 quantifier(struct slre *r, int prev, int op)
257 if (r->code[prev] == EXACT && r->code[prev + 2] > 1) {
260 emit(r, r->code[prev + 1] + r->code[prev + 2]);
262 prev = r->code_size - 3;
264 relocate(r, prev, 2);
265 r->code[prev] = (unsigned char)op;
266 set_jump_offset(r, prev + 1, prev);
270 exact_one_char(struct slre *r, int ch)
273 emit(r, r->data_size);
275 store_char_in_data(r, ch);
279 fixup_branch(struct slre *r, int fixup)
283 set_jump_offset(r, fixup, fixup - 2);
288 compile(struct slre *r, const char **re)
290 int op, esc, branch_start, last_op, fixup, cap_no, level;
294 branch_start = last_op = r->code_size;
310 last_op = r->code_size;
314 last_op = r->code_size;
318 last_op = r->code_size;
319 esc = get_escape_char(re);
323 exact_one_char(r, esc);
327 last_op = r->code_size;
328 cap_no = ++r->num_caps;
333 if (*(*re)++ != ')') {
334 r->err_str = "No closing bracket";
343 fixup_branch(r, fixup);
345 r->err_str = "Unbalanced brackets";
353 op = (*re)[-1] == '*' ? STAR: PLUS;
356 op = op == STAR ? STARQ : PLUSQ;
358 quantifier(r, last_op, op);
361 quantifier(r, last_op, QUEST);
364 fixup_branch(r, fixup);
365 relocate(r, branch_start, 3);
366 r->code[branch_start] = BRANCH;
367 set_jump_offset(r, branch_start + 1, branch_start);
368 fixup = branch_start + 2;
369 r->code[fixup] = 0xff;
373 last_op = r->code_size;
380 slre_compile(struct slre *r, const char *re)
383 r->code_size = r->data_size = r->num_caps = r->anchored = 0;
388 emit(r, OPEN); /* This will capture what matches full RE */
394 if (r->code[2] == BRANCH)
401 return (r->err_str == NULL ? 1 : 0);
404 static int match(const struct slre *, int,
405 const char *, int, int *, struct cap *);
408 loop_greedy(const struct slre *r, int pc, const char *s, int len, int *ofs)
410 int saved_offset, matched_offset;
412 saved_offset = matched_offset = *ofs;
414 while (match(r, pc + 2, s, len, ofs, NULL)) {
416 if (match(r, pc + r->code[pc + 1], s, len, ofs, NULL))
417 matched_offset = saved_offset;
421 *ofs = matched_offset;
425 loop_non_greedy(const struct slre *r, int pc, const char *s,int len, int *ofs)
427 int saved_offset = *ofs;
429 while (match(r, pc + 2, s, len, ofs, NULL)) {
431 if (match(r, pc + r->code[pc + 1], s, len, ofs, NULL))
439 is_any_of(const unsigned char *p, int len, const char *s, int *ofs)
445 for (i = 0; i < len; i++)
455 is_any_but(const unsigned char *p, int len, const char *s, int *ofs)
461 for (i = 0; i < len; i++)
470 match(const struct slre *r, int pc, const char *s, int len,
471 int *ofs, struct cap *caps)
473 int n, saved_offset, res = 1;
475 while (res && r->code[pc] != END) {
477 assert(pc < r->code_size);
478 assert(pc < (int) (sizeof(r->code) / sizeof(r->code[0])));
480 switch (r->code[pc]) {
483 res = match(r, pc + 3, s, len, ofs, caps);
486 res = match(r, pc + r->code[pc + 1],
489 pc += r->code[pc + 2];
493 n = r->code[pc + 2]; /* String length */
494 if (n <= len - *ofs && !memcmp(s + *ofs, r->data +
495 r->code[pc + 1], n)) {
504 if (!match(r, pc + 2, s, len, ofs, caps))
506 pc += r->code[pc + 1];
510 loop_greedy(r, pc, s, len, ofs);
511 pc += r->code[pc + 1];
515 loop_non_greedy(r, pc, s, len, ofs);
516 pc += r->code[pc + 1];
519 if ((res = match(r, pc + 2, s, len, ofs, caps)) == 0)
522 loop_greedy(r, pc, s, len, ofs);
523 pc += r->code[pc + 1];
526 if ((res = match(r, pc + 2, s, len, ofs, caps)) == 0)
529 loop_non_greedy(r, pc, s, len, ofs);
530 pc += r->code[pc + 1];
534 if (*ofs < len && isspace(((unsigned char *)s)[*ofs])) {
542 if (*ofs <len && !isspace(((unsigned char *)s)[*ofs])) {
550 if (*ofs < len && isdigit(((unsigned char *)s)[*ofs])) {
567 res = is_any_of(r->data + r->code[pc + 1],
568 r->code[pc + 2], s, ofs);
574 res = is_any_but(r->data + r->code[pc + 1],
575 r->code[pc + 2], s, ofs);
579 res = *ofs == 0 ? 1 : 0;
583 res = *ofs == len ? 1 : 0;
588 caps[r->code[pc + 1]].ptr = s + *ofs;
593 caps[r->code[pc + 1]].len = (s + *ofs) -
594 caps[r->code[pc + 1]].ptr;
601 printf("unknown cmd (%d) at %d\n", r->code[pc], pc);
611 slre_match(const struct slre *r, const char *buf, int len,
614 int i, ofs = 0, res = 0;
617 res = match(r, 0, buf, len, &ofs, caps);
619 for (i = 0; i < len && res == 0; i++) {
621 res = match(r, 0, buf, len, &ofs, caps);
629 int main(int argc, char *argv[])
633 char data[1 * 1024 * 1024];
635 int i, count, res, len;
638 printf("Usage: %s 'slre' <file> [count]\n", argv[0]);
639 } else if ((fp = fopen(argv[2], "rb")) == NULL) {
640 printf("Error: cannot open %s:%s\n", argv[2], strerror(errno));
641 } else if (!slre_compile(&slre, argv[1])) {
642 printf("Error compiling slre: %s\n", slre.err_str);
644 slre_dump(&slre, stderr);
646 (void) memset(caps, 0, sizeof(caps));
648 /* Read first 128K of file */
649 len = fread(data, 1, sizeof(data), fp);
653 count = argc > 3 ? atoi(argv[3]) : 1;
654 for (i = 0; i < count; i++)
655 res = slre_match(&slre, data, len, caps);
657 printf("Result: %d\n", res);
659 for (i = 0; i < 20; i++)
661 printf("Substring %d: [%.*s]\n", i,
662 caps[i].len, caps[i].ptr);