blob: e818a007d12370c8a4c05493ffdaebee43e71791 [file] [log] [blame]
/*
Copyright 2002-2003 John Plevyak, All Rights Reserved
*/
#include "d.h"
static char *action_types[] = { "ACCEPT", "SHIFT", "REDUCE" };
static void print_state(State *s);
Production *
new_production(Grammar *g, char *name) {
Production *p;
if ((p = lookup_production(g, name, strlen(name))))
return p;
p = MALLOC(sizeof(Production));
memset(p, 0, sizeof(Production));
vec_add(&g->productions, p);
p->name = name;
p->name_len = strlen(name);
return p;
}
static Elem *
new_elem() {
Elem *e = MALLOC(sizeof(Elem));
memset(e, 0, sizeof(Elem));
return e;
}
Rule *
new_rule(Grammar *g, Production *p) {
Rule *r = MALLOC(sizeof(Rule));
memset(r, 0, sizeof(Rule));
r->prod = p;
r->end = new_elem();
r->end->kind = ELEM_END;
r->end->rule = r;
r->action_index = g->action_index;
return r;
}
static Term *
new_term() {
Term *term = MALLOC(sizeof(Term));
memset(term, 0, sizeof(Term));
return term;
}
static Elem *
new_elem_term(Term *t, Rule *r) {
Elem *e = new_elem();
e->kind = ELEM_TERM;
e->e.term = t;
e->rule = r;
vec_add(&r->elems, e);
return e;
}
Elem *
new_elem_nterm(Production *p, Rule *r) {
Elem *e = new_elem();
e->kind = ELEM_NTERM;
e->e.nterm = p;
e->rule = r;
return e;
}
static Elem *
new_term_string(Grammar *g, char *s, char *e, Rule *r) {
Term *t = new_term();
Elem *elem;
t->string = MALLOC(e - s + 1);
memcpy(t->string, s, e - s);
t->string[e - s] = 0;
t->string_len = e - s;
vec_add(&g->terminals, t);
elem = new_elem_term(t, r);
return elem;
}
#define ESC(_c) *ss++ = '\\'; *ss++ = _c; break;
char *
escape_string_for_regex(char *s) {
char *ss = (char*)MALLOC((strlen(s) + 1) * 2), *sss = ss;
for (; *s; s++) {
switch (*s) {
case '(':
case ')':
case '[':
case ']':
case '-':
case '^':
case '*':
case '?':
case '+':
*ss++ = '\\';
/* fall through */
default: *ss++ = *s; break;
}
}
*ss = 0;
return sss;
}
static void
unescape_term_string(Term *t) {
char *s, *start = 0, *ss;
int length, base = 0;
for (ss = s = t->string; *s; s++) {
if (*s == '\\') {
switch (s[1]) {
case 'b': *ss = '\b'; s++; break;
case 'f': *ss = '\f'; s++; break;
case 'n': *ss = '\n'; s++; break;
case 'r': *ss = '\r'; s++; break;
case 't': *ss = '\t'; s++; break;
case 'v': *ss = '\v'; s++; break;
case 'a': *ss = '\a'; s++; break;
case 'c': *ss = 0; return;
case '\"':
if (t->kind == TERM_REGEX)
{ *ss = '\"'; s++; break; }
else
goto Ldefault;
case '\'':
if (t->kind == TERM_STRING)
{ *ss = '\''; s++; break; }
else
goto Ldefault;
case 'x':
length = 0;
if (isxdigit(s[2])) {
base = 16;
start = s + 2;
length++;
if (isxdigit(s[3]))
length++;
}
s += length + 1;
goto Lncont;
case 'd':
length = 0;
if (isdigit(s[2])) {
base = 10;
start = s + 2;
length++;
if (isdigit(s[3])) {
length++;
if (isdigit(s[4]) && ((s[2] < '2') || ((s[2] == '2') && ((s[3] < '5') ||
((s[3] == '5') && (s[4] < '6'))))))
length++;
}
}
s += length + 1;
goto Lncont;
case '0': case '1': case '2': case '3':
case '4': case '5': case '6': case '7':
length = 1;
base = 8;
start = s + 1;
if (isdigit(s[2]) && (s[2] != '8') && (s[2] != '9')) {
length++;
if (isdigit(s[3]) && (s[3] != '8') && (s[3] != '9')) {
length++;
}
}
s += length;
/* fall through */
Lncont:
if (length > 0) {
char saved_c = start[length];
start[length] = '\0';
*ss = (char) strtol(start, NULL, base);
start[length] = saved_c;
if (*s > 0)
break;
d_fail("encountered an escaped NULL while processing '%s'", t->string);
} else
goto next;
Ldefault:
default:
*ss++ = *s;
*ss = s[1];
s++;
break;
}
} else
*ss = *s;
ss++;
next:;
}
*ss = 0;
t->string_len = strlen(t->string);
if (!t->string_len)
d_fail("empty string after unescape '%s'", t->string);
}
Elem *
new_string(Grammar *g, char *s, char *e, Rule *r) {
Elem *x = new_term_string(g, s + 1, e - 1, r);
x->e.term->kind = (*s == '"') ? TERM_REGEX : TERM_STRING;
unescape_term_string(x->e.term);
return x;
}
Elem *
new_ident(char *s, char *e, Rule *r) {
Elem *x = new_elem();
x->kind = ELEM_UNRESOLVED;
x->e.unresolved.string = dup_str(s, e);
x->e.unresolved.len = strlen(x->e.unresolved.string);
x->rule = r;
if (r)
vec_add(&r->elems, x);
return x;
}
void
new_token(Grammar *g, char *s, char *e) {
Term *t = new_term();
t->string = MALLOC(e - s + 1);
memcpy(t->string, s, e - s);
t->string[e - s] = 0;
t->string_len = e - s;
vec_add(&g->terminals, t);
t->kind = TERM_TOKEN;
}
Elem *
new_code(Grammar *g, char *s, char *e, Rule *r) {
Elem *x = new_term_string(g, s, e, r);
x->e.term->kind = TERM_CODE;
return x;
}
Elem *
dup_elem(Elem *e, Rule *r) {
Elem *ee = MALLOC(sizeof(Elem));
memcpy(ee, e, sizeof(Elem));
ee->rule = r;
return ee;
}
void
add_global_code(Grammar *g, char *start, char *end, int line) {
if (!g->code) g->code = MALLOC(sizeof(Code) * 4);
else if (!((g->ncode + 1) & 4))
g->code = REALLOC(g->code, sizeof(Code) * (g->ncode + 4));
g->code[g->ncode].code = dup_str(start, end);
g->code[g->ncode].line = line;
g->ncode++;
}
void
new_declaration(Grammar *g, Elem *e, uint kind) {
Declaration *d = MALLOC(sizeof(*d));
d->elem = e;
d->kind = kind;
d->index = g->declarations.n;
vec_add(&g->declarations, d);
}
void
add_declaration(Grammar *g, char *start, char *end, uint kind, uint line) {
if (start == end) {
switch (kind) {
case DECLARE_SET_OP_PRIORITY: g->set_op_priority_from_rule = 1; return;
case DECLARE_STATES_FOR_ALL_NTERMS: g->states_for_all_nterms = 1; return;
case DECLARE_LONGEST_MATCH: g->longest_match = 1; return;
case DECLARE_ALL_MATCHES: g->longest_match = 0; return;
case DECLARE_TOKENIZE: g->tokenizer = 1; return;
case DECLARE_SAVE_PARSE_TREE: g->save_parse_tree = 1; return;
default: d_fail("declare expects argument, line %d", line);
}
}
switch (kind) {
case DECLARE_WHITESPACE: g->default_white_space = dup_str(start, end); return;
case DECLARE_SET_OP_PRIORITY:
d_fail("declare does not expect argument, line %d", line);
default:
new_declaration(g, new_ident(start, end, NULL), kind);
break;
}
}
D_Pass *
find_pass(Grammar *g, char *start, char *end) {
int i, l;
while (*start && isspace(*start)) start++;
l = end - start;
for (i = 0; i < g->passes.n; i++)
if (l == g->passes.v[i]->name_len &&
!strncmp(g->passes.v[i]->name, start, l))
return g->passes.v[i];
return NULL;
}
void
add_pass(Grammar *g, char *start, char *end, uint kind, uint line) {
if (find_pass(g, start, end))
d_fail("duplicate pass '%s' line %d", dup_str(start, end), line);
else {
D_Pass *p = MALLOC(sizeof(*p));
p->name = dup_str(start, end);
p->name_len = end - start;
p->kind = kind;
p->index = g->pass_index++;
vec_add(&g->passes, p);
}
}
void
add_pass_code(Grammar *g, Rule *r, char *pass_start, char *pass_end,
char *code_start, char *code_end, uint pass_line, uint code_line)
{
D_Pass *p = find_pass(g, pass_start, pass_end);
if (!p)
d_fail("unknown pass '%s' line %d", dup_str(pass_start, pass_end), pass_line);
while (r->pass_code.n <= p->index) vec_add(&r->pass_code, NULL);
r->pass_code.v[p->index] = MALLOC(sizeof(Code));
r->pass_code.v[p->index]->code = dup_str(code_start, code_end);
r->pass_code.v[p->index]->line = code_line;
}
Production *
new_internal_production(Grammar *g, Production *p) {
char *n = p ? p->name : " _synthetic";
char *name = MALLOC(strlen(n) + 20);
Production *pp = NULL, *tp = NULL, *ttp;
int i, found = 0;
sprintf(name, "%s.%d", n, g->productions.n);
pp = new_production(g, name);
pp->internal = INTERNAL_HIDDEN;
pp->regex = p ? p->regex : 0;
if (p) {
for (i = 0; i < g->productions.n; i++) {
if (found) {
ttp = g->productions.v[i];
g->productions.v[i] = tp;
tp = ttp;
} else if (p == g->productions.v[i]) {
found = 1;
tp = g->productions.v[i+1];
g->productions.v[i+1] = pp;
i++;
}
}
}
return pp;
}
void
conditional_EBNF(Grammar *g) {
Production *pp;
Rule *rr;
pp = new_internal_production(g, g->p);
pp->internal = INTERNAL_CONDITIONAL;
rr = new_rule(g, pp);
vec_add(&rr->elems, last_elem(g->r));
last_elem(g->r)->rule = rr;
rr->elems.v[rr->elems.n - 1]->rule = rr;
vec_add(&pp->rules, rr);
vec_add(&pp->rules, new_rule(g, pp));
last_elem(g->r) = new_elem_nterm(pp, g->r);
}
void
star_EBNF(Grammar *g) {
Production *pp;
Rule *rr;
pp = new_internal_production(g, g->p);
pp->internal = INTERNAL_STAR;
rr = new_rule(g, pp);
if (!g->right_recursive_BNF) {
vec_add(&rr->elems, new_elem_nterm(pp, rr));
vec_add(&rr->elems, last_elem(g->r));
last_elem(g->r) = new_elem_nterm(pp, g->r);
last_elem(rr)->rule = rr;
} else {
vec_add(&rr->elems, last_elem(g->r));
last_elem(g->r) = new_elem_nterm(pp, g->r);
last_elem(rr)->rule = rr;
vec_add(&rr->elems, new_elem_nterm(pp, rr));
}
vec_add(&pp->rules, rr);
vec_add(&pp->rules, new_rule(g, pp));
}
void
plus_EBNF(Grammar *g) {
Production *pp;
Rule *rr;
Elem *elem;
pp = new_internal_production(g, g->p);
pp->internal = INTERNAL_PLUS;
rr = new_rule(g, pp);
if (!g->right_recursive_BNF) {
elem = last_elem(g->r);
vec_add(&rr->elems, new_elem_nterm(pp, rr));
vec_add(&rr->elems, dup_elem(elem, rr));
last_elem(g->r) = new_elem_nterm(pp, g->r);
} else {
elem = last_elem(g->r);
vec_add(&rr->elems, dup_elem(elem, rr));
last_elem(g->r) = new_elem_nterm(pp, g->r);
vec_add(&rr->elems, new_elem_nterm(pp, rr));
}
vec_add(&pp->rules, rr);
rr = new_rule(g, pp);
vec_add(&rr->elems, elem);
elem->rule = rr;
vec_add(&pp->rules, rr);
}
void
initialize_productions(Grammar *g) {
Production *pp, *ppp;
Rule *rrr;
ppp = new_production(g, strdup("0 Start"));
ppp->internal = INTERNAL_HIDDEN;
rrr = new_rule(g, ppp);
vec_add(&rrr->elems, new_elem_nterm(NULL, rrr));
vec_add(&ppp->rules, rrr);
pp = new_production(g, strdup("1 Start"));
pp->internal = INTERNAL_HIDDEN;
rrr->elems.v[0]->e.nterm = pp;
}
void
finish_productions(Grammar *g) {
Production *pp = g->productions.v[1];
Rule *rr = new_rule(g, g->productions.v[1]);
vec_add(&rr->elems, new_elem_nterm(NULL, rr));
vec_add(&pp->rules, rr);
rr->elems.v[0]->e.nterm = g->productions.v[2];
}
Production *
lookup_production(Grammar *g, char *name, int l) {
int i;
for (i = 0; i < g->productions.n; i++) {
Production *pp = g->productions.v[i];
if (pp->name_len != l || strncmp(pp->name, name, l))
continue;
return pp;
}
return NULL;
}
static Term *
lookup_token(Grammar *g, char *name, int l) {
int i;
for (i = 0; i < g->terminals.n; i++) {
Term *t = g->terminals.v[i];
if (t->kind != TERM_TOKEN || t->string_len != l ||
strncmp(t->string, name, l))
continue;
return t;
}
return NULL;
}
static Term *
unique_term(Grammar *g, Term *t) {
int i;
for (i = 0; i < g->terminals.n; i++)
if (t->kind == g->terminals.v[i]->kind &&
t->string_len == g->terminals.v[i]->string_len &&
t->term_priority == g->terminals.v[i]->term_priority &&
(!g->set_op_priority_from_rule ||
(t->op_assoc == g->terminals.v[i]->op_assoc &&
t->op_priority == g->terminals.v[i]->op_priority)) &&
!strncmp(t->string, g->terminals.v[i]->string, t->string_len))
return g->terminals.v[i];
return t;
}
static void
compute_nullable(Grammar *g) {
int i, j, k, changed = 1;
Elem *e;
/* ensure that the trivial case is the first cause */
for (i = 0; i < g->productions.n; i++) {
for (j = 0; j < g->productions.v[i]->rules.n; j++)
if (!g->productions.v[i]->rules.v[j]->elems.n) {
g->productions.v[i]->nullable = g->productions.v[i]->rules.v[j];
break;
}
}
/* transitive closure */
while (changed) {
changed = 0;
for (i = 0; i < g->productions.n; i++) {
if (!g->productions.v[i]->nullable)
for (j = 0; j < g->productions.v[i]->rules.n; j++) {
for (k = 0; k < g->productions.v[i]->rules.v[j]->elems.n; k++) {
e = g->productions.v[i]->rules.v[j]->elems.v[k];
if (e->kind != ELEM_NTERM || !e->e.nterm->nullable)
goto Lnot_nullable;
}
changed = 1;
g->productions.v[i]->nullable = g->productions.v[i]->rules.v[j];
break;
}
Lnot_nullable:;
}
}
}
/*
verify and cleanup the grammar datastructures
- resolve non-terminals
- set element indexes
*/
static void
resolve_grammar(Grammar *g) {
int i, j, k, l;
Production *p, *pp;
Rule *r;
Elem *e;
Term *last_term, *t;
g->rule_index = 0;
for (i = 0; i < g->productions.n; i++) {
p = g->productions.v[i];
if (p != lookup_production(g, p->name, p->name_len))
d_fail("duplicate production '%s'", p->name);
p->index = i;
for (j = 0; j < p->rules.n; j++) {
r = p->rules.v[j];
r->index = g->rule_index++;
last_term = NULL;
for (k = 0; k < r->elems.n; k++) {
e = r->elems.v[k];
e->index = k;
if (e->kind == ELEM_UNRESOLVED) {
l = e->e.unresolved.len;
if ((pp = lookup_production(g, e->e.unresolved.string, l))) {
e->kind = ELEM_NTERM;
e->e.nterm = pp;
} else if ((t = lookup_token(g, e->e.unresolved.string, l))) {
e->kind = ELEM_TERM;
e->e.term = t;
} else {
char str[256];
strncpy(str, e->e.unresolved.string, l);
str[l < 255 ? l : 255] = 0;
d_fail("unresolved identifier: '%s'", str);
}
}
if (e->kind == ELEM_TERM)
last_term = e->e.term;
}
r->end->index = r->elems.n;
if (g->set_op_priority_from_rule) {
if (last_term && r->rule_assoc) {
last_term->op_assoc = r->rule_assoc;
last_term->op_priority = r->rule_priority;
}
}
}
}
for (i = 0; i < g->terminals.n; i++)
g->terminals.v[i]->index = i;
compute_nullable(g);
}
static void
merge_identical_terminals(Grammar *g) {
int i, j, k;
Production *p;
Rule *r;
Elem *e;
for (i = 0; i < g->productions.n; i++) {
p = g->productions.v[i];
for (j = 0; j < p->rules.n; j++) {
r = p->rules.v[j];
for (k = 0; k < r->elems.n; k++) {
e = r->elems.v[k];
if (e->kind == ELEM_TERM)
e->e.term = unique_term(g, e->e.term);
}
}
}
}
void
print_term(Term *t) {
char *s = t->string ? escape_string(t->string) : NULL;
if (t->kind == TERM_STRING) {
if (!t->string || !*t->string)
printf("<EOF> ");
else
printf("string(\"%s\") ", s);
} else if (t->kind == TERM_REGEX)
printf("regex(\"%s\") ", s);
else if (t->kind == TERM_CODE)
printf("code(\"%s\") ", s);
else if (t->kind == TERM_TOKEN)
printf("token(\"%s\") ", s);
else
d_fail("unknown token kind");
if (s)
FREE(s);
}
void
print_elem(Elem *ee) {
if (ee->kind == ELEM_TERM)
print_term(ee->e.term);
else if (ee->kind == ELEM_UNRESOLVED)
printf("%s ", ee->e.unresolved.string);
else
printf("%s ", ee->e.nterm->name);
}
struct EnumStr {
uint e;
char *s;
} assoc_strings[] = {
{ ASSOC_NONE, "$none" },
{ ASSOC_NARY_LEFT, "$left" },
{ ASSOC_NARY_RIGHT, "$right" },
{ ASSOC_UNARY_LEFT, "$unary_left" },
{ ASSOC_UNARY_RIGHT, "$unary_right" },
{ ASSOC_BINARY_LEFT, "$binary_left" },
{ ASSOC_BINARY_RIGHT, "$binary_right" },
{ ASSOC_NO, "$noassoc" }
};
static char *
assoc_str(uint e) {
int i;
for (i = 0; i < sizeof(assoc_strings) / sizeof(assoc_strings[0]); i++)
if (e == assoc_strings[i].e)
return assoc_strings[i].s;
return assoc_strings[0].s;
}
void
print_rule(Rule *r) {
int k;
printf("%s: ", r->prod->name);
for (k = 0; k < r->elems.n; k++)
print_elem(r->elems.v[k]);
if (r->speculative_code.code)
printf("SPECULATIVE_CODE\n%s\nEND CODE\n", r->speculative_code.code);
if (r->final_code.code)
printf("FINAL_CODE\n%s\nEND CODE\n", r->final_code.code);
}
void
print_grammar(Grammar *g) {
uint i, j, k;
Production *pp;
Rule *rr;
if (!g->productions.n)
return;
printf("PRODUCTIONS\n\n");
for (i = 0; i < g->productions.n; i++) {
pp = g->productions.v[i];
printf("%s (%d)\n", pp->name, i);
for (j = 0; j < pp->rules.n; j++) {
rr = pp->rules.v[j];
if (!j)
printf("\t: ");
else
printf("\t| ");
for (k = 0; k < rr->elems.n; k++)
print_elem(rr->elems.v[k]);
if (rr->op_priority)
printf("op %d ", rr->op_priority);
if (rr->op_assoc)
printf("%s ", assoc_str(rr->op_assoc));
if (rr->rule_priority)
printf("rule %d ", rr->rule_priority);
if (rr->rule_assoc)
printf("%s ", assoc_str(rr->rule_assoc));
if (rr->speculative_code.code)
printf("%s ", rr->speculative_code.code);
if (rr->final_code.code)
printf("%s ", rr->final_code.code);
printf("\n");
}
printf("\t;\n");
printf("\n");
}
printf("TERMINALS\n\n");
for (i = 0; i < g->terminals.n; i++) {
printf("\t");
print_term(g->terminals.v[i]);
printf("(%d)\n", i + g->productions.n);
}
printf("\n");
}
static void
print_item(Item *i) {
int j, end = 1;
printf("\t%s: ", i->rule->prod->name);
for (j = 0; j < i->rule->elems.n; j++) {
Elem *e = i->rule->elems.v[j];
if (i == e) {
printf(". ");
end = 0;
}
print_elem(e);
}
if (end)
printf(". ");
printf("\n");
}
static void
print_conflict(char *kind, int *conflict) {
if (!*conflict) {
printf(" CONFLICT (before precedence and associativity)\n");
*conflict = 1;
}
printf("\t%s conflict ", kind);
printf("\n");
}
static void
print_state(State *s) {
int j, conflict = 0;
printf("STATE %d (%d ITEMS)%s\n", s->index, s->items.n,
s->accept ? " ACCEPT" : "");
for (j = 0; j < s->items.n; j++)
print_item(s->items.v[j]);
if (s->gotos.n)
printf(" GOTO\n");
for (j = 0; j < s->gotos.n; j++) {
printf("\t");
print_elem(s->gotos.v[j]->elem);
printf(" : %d\n", s->gotos.v[j]->state->index);
}
printf(" ACTION\n");
for (j = 0; j < s->reduce_actions.n; j++) {
Action *a = s->reduce_actions.v[j];
printf("\t%s\t", action_types[a->kind]);
print_rule(a->rule);
printf("\n");
}
for (j = 0; j < s->shift_actions.n; j++) {
Action *a = s->shift_actions.v[j];
printf("\t%s\t", action_types[a->kind]);
if (a->kind == ACTION_SHIFT) {
print_term(a->term);
printf("%d", a->state->index);
}
printf("\n");
}
if (s->reduce_actions.n > 1)
print_conflict("reduce/reduce", &conflict);
if (s->reduce_actions.n && s->shift_actions.n)
print_conflict("shift/reduce", &conflict);
printf("\n");
}
void
print_states(Grammar *g) {
int i;
for (i = 0; i < g->states.n; i++)
print_state(g->states.v[i]);
}
int
state_for_declaration(Grammar *g, int iproduction) {
int i;
for (i = 0; i < g->declarations.n; i++)
if (g->declarations.v[i]->kind == DECLARE_STATE_FOR &&
g->declarations.v[i]->elem->e.nterm->index == iproduction)
return 1;
return 0;
}
static void
make_elems_for_productions(Grammar *g) {
int i, j, k, l;
Rule *rr;
Production *pp, *ppp;
pp = g->productions.v[0];
for (i = 0; i < g->productions.n; i++)
if (!g->productions.v[i]->internal) {
if (g->states_for_all_nterms ||
state_for_declaration(g, i)) {
/* try to find an existing elem */
for (j = 0; j < g->productions.n; j++)
for (k = 0; k < g->productions.v[j]->rules.n; k++) {
rr = g->productions.v[j]->rules.v[k];
for (l = 0; l < rr->elems.n; l++)
if (rr->elems.v[l]->e.term_or_nterm == g->productions.v[i]) {
g->productions.v[i]->elem = rr->elems.v[l];
break;
}
}
if (j >= g->productions.n) { /* not found */
g->productions.v[i]->elem =
new_elem_nterm(g->productions.v[i], new_rule(g, pp));
g->productions.v[i]->elem->rule->index = g->rule_index++; /* fake */
}
}
}
if (!g->states_for_all_nterms &&
g->states_for_whitespace &&
(ppp = lookup_production(g, "whitespace", sizeof("whitespace")-1)))
{
ppp->elem = new_elem_nterm(ppp, new_rule(g, pp));
ppp->elem->rule->index = g->rule_index++; /* fake */
}
}
static void
convert_regex_production_one(Grammar *g, Production *p) {
int j, k, l;
Production *pp;
Rule *r, *rr;
Elem *e;
Term *t;
int circular = 0;
char *buf = 0, *b, *s;
int buf_len = 0;
if (p->regex_term) /* already done */
return;
if (p->in_regex)
d_fail("circular regex production '%s'", p->name);
p->in_regex = 1;
for (j = 0; j < p->rules.n; j++) {
r = p->rules.v[j];
if (r->final_code.code || (r->speculative_code.code && p->rules.n > 1))
d_fail("final and/or multi-rule code not permitted in regex productions '%s'", p->name);
for (k = 0; k < r->elems.n; k++) {
e = r->elems.v[k];
if (e->kind == ELEM_NTERM) {
if (!e->e.nterm->regex)
d_fail("regex production '%s' cannot invoke non-regex production '%s'",
p->name, e->e.nterm->name);
pp = e->e.nterm;
for (l = 0; l < pp->rules.n; l++)
if (pp->rules.v[l]->speculative_code.code || pp->rules.v[l]->final_code.code)
d_fail("code not permitted in rule %d of regex productions '%s'", l, p->name);
if (p != pp) {
convert_regex_production_one(g, pp);
buf_len += pp->regex_term->string_len + 5;
} else {
circular = 1;
buf_len += 5;
}
} else { /* e->kind == ELEM_TERM */
if (e->e.term->kind == TERM_CODE || e->e.term->kind == TERM_TOKEN)
d_fail("regex production '%s' cannot include scanners or tokens");
buf_len += e->e.term->string_len + 5;
}
}
}
b = buf = (char*)MALLOC(buf_len + 1);
t = new_term();
t->kind = TERM_REGEX;
t->string = buf;
t->index = g->terminals.n;
t->regex_production = p;
vec_add(&g->terminals, t);
p->regex_term = t;
if (circular) { /* attempt to match to regex operators */
if (p->rules.n != 2)
Lfail: d_fail("unable to resolve circular regex production: '%s'", p->name);
l = p->rules.v[0]->elems.n + p->rules.v[1]->elems.n;
if (l == 2 || l == 3) {
if (p->rules.v[0]->elems.n != 2 && p->rules.v[1]->elems.n != 2)
goto Lfail;
r = p->rules.v[0]->elems.n == 2 ? p->rules.v[0] : p->rules.v[1];
rr = p->rules.v[0] == r ? p->rules.v[1] : p->rules.v[0];
if (r->elems.v[0]->e.nterm != p && r->elems.v[1]->e.nterm != p)
goto Lfail;
e = r->elems.v[0]->e.nterm == p ? r->elems.v[1] : r->elems.v[1];
if (rr->elems.n && e->e.term_or_nterm != rr->elems.v[0]->e.term_or_nterm)
goto Lfail;
t = e->kind == ELEM_TERM ? e->e.term : e->e.nterm->regex_term;
*b++ = '(';
if (t->kind == TERM_STRING)
s = escape_string_for_regex(t->string);
else
s = t->string;
memcpy(b, s, strlen(s)); b += strlen(s);
if (t->kind == TERM_STRING)
FREE(s);
*b++ = ')';
if (l == 2)
*b++ = '*';
else
*b++ = '+';
*b = 0;
p->regex_term->string_len = strlen(p->regex_term->string);
} else
goto Lfail;
} else { /* handle the base case, p = (r | r'), r = (e e') */
if (p->rules.n > 1)
*b++ = '(';
for (j = 0; j < p->rules.n; j++) {
r = p->rules.v[j];
if (r->elems.n > 1)
*b++ = '(';
for (k = 0; k < r->elems.n; k++) {
e = r->elems.v[k];
t = e->kind == ELEM_TERM ? e->e.term : e->e.nterm->regex_term;
if (t->kind == TERM_STRING)
s = escape_string_for_regex(t->string);
else
s = t->string;
memcpy(b, s, strlen(s)); b += strlen(s);
if (t->kind == TERM_STRING)
FREE(s);
}
if (r->elems.n > 1)
*b++ = ')';
if (j != p->rules.n - 1)
*b++ = '|';
}
if (p->rules.n > 1)
*b++ = ')';
*b = 0;
p->regex_term->string_len = strlen(p->regex_term->string);
}
p->in_regex = 0;
}
static void
convert_regex_productions(Grammar *g) {
int i, j, k;
Production *p;
Rule *r;
for (i = 0; i < g->productions.n; i++) {
p = g->productions.v[i];
if (!p->regex)
continue;
convert_regex_production_one(g, p);
}
for (i = 0; i < g->productions.n; i++) {
p = g->productions.v[i];
for (j = 0; j < p->rules.n; j++) {
r = p->rules.v[j];
for (k = 0; k < r->elems.n; k++) {
if (r->elems.v[k]->kind == ELEM_NTERM && r->elems.v[k]->e.nterm->regex_term) {
r->elems.v[k]->e.term = r->elems.v[k]->e.nterm->regex_term;
r->elems.v[k]->kind = ELEM_TERM;
}
}
}
}
}
static void
check_default_actions(Grammar *g) {
Production *pdefault;
pdefault = lookup_production(g, "_", 1);
if (pdefault && pdefault->rules.n > 1)
d_fail("number of rules in default action != 1");
}
typedef struct {
struct State *eq;
struct Rule *diff_rule;
struct State *diff_state;
} EqState;
void
build_eq(Grammar *g) {
int i, j, k, changed = 1, x, xx;
State *s, *ss;
EqState *eq, *e, *ee;
eq = (EqState*)MALLOC(sizeof(EqState)*g->states.n);
memset(eq, 0, sizeof(EqState)*g->states.n);
while (changed) {
changed = 0;
for (i = 0; i < g->states.n; i++) {
s = g->states.v[i];
e = &eq[s->index];
for (j = i + 1; j < g->states.n; j++) {
ss = g->states.v[j];
ee = &eq[ss->index];
if (e->eq || ee->eq)
continue;
if (s->same_shifts != ss->same_shifts && ss->same_shifts != s)
continue;
/* check gotos */
if (s->gotos.n != ss->gotos.n)
continue;
for (k = 0; k < s->gotos.n; k++) {
if (elem_symbol(g, s->gotos.v[k]->elem) != elem_symbol(g, ss->gotos.v[k]->elem))
goto Lcontinue;
if (s->gotos.v[k]->state != ss->gotos.v[k]->state) {
EqState *ge = &eq[s->gotos.v[k]->state->index];
EqState *gee = &eq[ss->gotos.v[k]->state->index];
if (ge->eq != ss->gotos.v[k]->state && gee->eq != s->gotos.v[k]->state)
goto Lcontinue;
if ((ee->diff_state && ee->diff_state != eq[ss->gotos.v[k]->state->index].eq) ||
(e->diff_state && e->diff_state != eq[s->gotos.v[k]->state->index].eq))
goto Lcontinue;
/* allow one different state */
ee->diff_state = ss->gotos.v[k]->state;
e->diff_state = s->gotos.v[k]->state;
}
}
/* check reductions */
if (s->reduce_actions.n != ss->reduce_actions.n)
continue;
for (k = 0; k < s->reduce_actions.n; k++) {
if (s->reduce_actions.v[k]->rule == ss->reduce_actions.v[k]->rule)
continue;
if (s->reduce_actions.v[k]->rule->prod !=
ss->reduce_actions.v[k]->rule->prod)
goto Lcontinue;
if ((x = s->reduce_actions.v[k]->rule->elems.n) !=
(xx = ss->reduce_actions.v[k]->rule->elems.n)) {
if ((ee->diff_rule && ee->diff_rule != ss->reduce_actions.v[k]->rule) ||
(e->diff_rule && e->diff_rule != s->reduce_actions.v[k]->rule))
goto Lcontinue;
/* allow one different rule */
ee->diff_rule = ss->reduce_actions.v[k]->rule;
e->diff_rule = s->reduce_actions.v[k]->rule;
}
}
ee->eq = s;
changed = 1;
Lcontinue:;
}
}
}
for (i = 0; i < g->states.n; i++) {
s = g->states.v[i];
e = &eq[s->index];
if (e->eq) {
if (verbose_level > 2) {
printf("eq %d %d ", s->index, e->eq->index);
if (e->diff_state)
printf("diff state (%d %d) ",
e->diff_state->index,
eq[e->eq->index].diff_state->index);
if (e->diff_rule) {
printf("diff rule ");
printf("[ ");
print_rule(e->diff_rule);
printf("][ ");
print_rule(eq[e->eq->index].diff_rule);
printf("]");
}
printf("\n");
}
}
}
for (i = 0; i < g->states.n; i++) {
s = g->states.v[i];
e = &eq[s->index];
if (e->eq && e->diff_state) {
if (eq[e->diff_state->index].diff_rule &&
eq[e->diff_state->index].diff_rule->elems.n == 2)
{
s->reduces_to = e->eq;
s->reduces_with = eq[e->eq->index].diff_rule;
s->reduces_to_then_with = e->diff_rule;
} else if (eq[eq[e->eq->index].diff_state->index].diff_rule &&
eq[eq[e->eq->index].diff_state->index].diff_rule->elems.n == 2)
{
e->eq->reduces_to = s;
s->reduces_with = e->diff_rule;
s->reduces_to_then_with = eq[e->eq->index].diff_rule;
}
}
}
for (i = 0; i < g->states.n; i++) {
s = g->states.v[i];
if (s->reduces_to)
if (verbose_level)
printf("reduces_to %d %d\n", s->index, s->reduces_to->index);
}
FREE(eq);
}
Grammar *
new_D_Grammar(char *pathname) {
Grammar *g = (Grammar *)MALLOC(sizeof(Grammar));
memset(g, 0, sizeof(Grammar));
g->pathname = dup_str(pathname, pathname + strlen(pathname));
return g;
}
void
free_D_Grammar(Grammar *g) {
FREE(g->pathname);
}
int
parse_grammar(Grammar *g, D_ParserTables *t, int sizeof_ParseNode_User) {
FILE *fp = fopen(g->pathname, "r");
char *s;
D_Parser *p;
if (!fp)
return -1;
if (!(s = sbuf_read(g->pathname)))
return -1;
initialize_productions(g);
p = new_D_Parser(t, sizeof_ParseNode_User);
p->initial_globals = g;
p->loc.pathname = g->pathname;
if (dparse(p, s, strlen(s))) {
if (g->productions.n > 2)
finish_productions(g);
return 0;
} else
return -1;
}
static int
scanner_declaration(Declaration *d) {
switch (d->kind) {
case DECLARE_TOKENIZE:
case DECLARE_LONGEST_MATCH:
case DECLARE_ALL_MATCHES:
return 1;
default:
return 0;
}
}
static void
set_declaration_group(Production *p, Production *root, Declaration *d) {
int i, j;
if (p->declaration_group[d->kind] == root)
return;
if (d->kind == DECLARE_TOKENIZE && p->declaration_group[d->kind]) {
d_fail("shared tokenize subtrees");
return;
}
p->declaration_group[d->kind] = root;
p->last_declaration[d->kind] = d;
for (i = 0; i < p->rules.n; i++) {
for (j = 0; j < p->rules.v[i]->elems.n; j++)
if (p->rules.v[i]->elems.v[j]->kind == ELEM_NTERM)
set_declaration_group(p->rules.v[i]->elems.v[j]->e.nterm, root, d);
}
}
static void
propogate_declarations(Grammar *g) {
int i, j, k;
Production *p;
Rule *r;
Elem *e;
/* global defaults */
if (g->tokenizer)
new_declaration(g, new_elem_nterm(g->productions.v[0], NULL), DECLARE_TOKENIZE);
if (g->longest_match)
new_declaration(g, new_elem_nterm(g->productions.v[0], NULL), DECLARE_LONGEST_MATCH);
/* resolve declarations */
for (i = 0; i < g->declarations.n; i++) {
e = g->declarations.v[i]->elem;
if (e->kind == ELEM_UNRESOLVED) {
if (!(p = lookup_production(g, e->e.unresolved.string, e->e.unresolved.len)))
d_fail("unresolved declaration '%s'", e->e.unresolved.string);
e->kind = ELEM_NTERM;
e->e.nterm = p;
}
}
/* build declaration groups (covering a production subtrees) */
for (i = 0; i < g->declarations.n; i++) {
if (scanner_declaration(g->declarations.v[i])) {
p = g->declarations.v[i]->elem->e.nterm;
set_declaration_group(p, p, g->declarations.v[i]);
}
}
/* set terminal scan_kind */
for (i = 0; i < g->productions.n; i++) {
p = g->productions.v[i];
for (j = 0; j < p->rules.n; j++) {
r = p->rules.v[j];
for (k = 0; k < r->elems.n; k++) {
e = r->elems.v[k];
if (e->kind == ELEM_TERM) {
if (!p->declaration_group[DECLARE_LONGEST_MATCH] &&
!p->declaration_group[DECLARE_ALL_MATCHES])
e->e.term->scan_kind = D_SCAN_DEFAULT;
else if (p->declaration_group[DECLARE_LONGEST_MATCH] &&
!p->declaration_group[DECLARE_ALL_MATCHES])
e->e.term->scan_kind = D_SCAN_LONGEST;
else if (!p->declaration_group[DECLARE_LONGEST_MATCH] &&
p->declaration_group[DECLARE_ALL_MATCHES])
e->e.term->scan_kind = D_SCAN_ALL;
else {
if (p->last_declaration[DECLARE_LONGEST_MATCH]->index >
p->last_declaration[DECLARE_ALL_MATCHES]->index)
e->e.term->scan_kind = D_SCAN_LONGEST;
else
e->e.term->scan_kind = D_SCAN_ALL;
}
}
}
}
}
}
static void
merge_shift_actions(State *to, State *from) {
int i, j;
for (i = 0; i < from->shift_actions.n; i++) {
for (j = 0; j < to->shift_actions.n; j++)
if (from->shift_actions.v[i]->term == to->shift_actions.v[j]->term)
goto Lnext;
vec_add(&to->shift_actions, from->shift_actions.v[i]);
Lnext:;
}
}
static void
compute_declaration_states(Grammar *g, Production *p, Declaration *d) {
State *s, *base_s = NULL;
int j, k, scanner = scanner_declaration(d);
for (j = 0; j < g->states.n; j++) {
s = g->states.v[j];
if (d->kind == DECLARE_TOKENIZE) {
if (!base_s)
base_s = s;
else {
s->same_shifts = base_s;
merge_shift_actions(base_s, s);
}
}
if (scanner) {
for (k = 0; k < s->items.n; k++)
if (s->items.v[k]->kind == ELEM_TERM)
switch (s->items.v[k]->e.term->scan_kind) {
case D_SCAN_LONGEST:
if (s->scan_kind == D_SCAN_RESERVED ||
s->scan_kind == D_SCAN_LONGEST)
s->scan_kind = D_SCAN_LONGEST;
else
s->scan_kind = D_SCAN_MIXED;
break;
case D_SCAN_ALL:
if (s->scan_kind == D_SCAN_RESERVED ||
s->scan_kind == D_SCAN_ALL)
s->scan_kind = D_SCAN_ALL;
else
s->scan_kind = D_SCAN_MIXED;
break;
default:
break;
}
}
}
}
static void
map_declarations_to_states(Grammar *g) {
int i;
State *s;
for (i = 0; i < g->states.n; i++) {
s = g->states.v[i];
s->scan_kind = D_SCAN_RESERVED;
}
/* map groups to sets of states */
for (i = 0; i < g->declarations.n; i++)
if (scanner_declaration(g->declarations.v[i]))
compute_declaration_states(g, g->declarations.v[i]->elem->e.nterm,
g->declarations.v[i]);
for (i = 0; i < g->states.n; i++) {
s = g->states.v[i];
if (s->scan_kind == D_SCAN_RESERVED)
s->scan_kind = D_SCAN_DEFAULT; /* set the default */
}
}
int
build_grammar(Grammar *g) {
resolve_grammar(g);
convert_regex_productions(g);
propogate_declarations(g);
merge_identical_terminals(g);
make_elems_for_productions(g);
check_default_actions(g);
build_LR_tables(g);
map_declarations_to_states(g);
if (verbose_level) {
printf("%d productions %d terminals %d states %d declarations\n",
g->productions.n, g->terminals.n, g->states.n,
g->declarations.n);
}
if (verbose_level > 1) {
print_grammar(g);
print_states(g);
}
build_scanners(g);
build_eq(g);
return 0;
}