blob: f9758b812f09669508fa505a04cd3d8fc1d81a75 [file] [log] [blame]
/*
* Match a string against a list of patterns/regexes.
*
* Copyright (C) 2006-2007 Török Edvin <edwin@clamav.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
* MA 02110-1301, USA.
*
*/
#if HAVE_CONFIG_H
#include "clamav-config.h"
#endif
#ifndef CL_DEBUG
#define NDEBUG
#endif
#ifdef CL_THREAD_SAFE
#ifndef _REENTRANT
#define _REENTRANT
#endif
#endif
/* TODO: when implementation of new version is complete, enable it in CL_EXPERIMENTAL */
#ifdef CL_EXPERIMENTAL
/*#define USE_NEW_VERSION*/
#endif
#ifndef USE_NEW_VERSION
/*this is the old version of regex_list.c
*reason for redesign: there is only one node type that has to handle all the cases: binary search among children, alternatives list, match.
* This design is very error-prone.*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <limits.h>
#include <sys/types.h>
#include "regex.h"
#include "clamav.h"
#include "others.h"
#include "regex_list.h"
#include "matcher-ac.h"
#include "str.h"
/*Tree*/
enum token_op_t {OP_CHAR,OP_STDCLASS,OP_CUSTOMCLASS,OP_DOT,OP_LEAF,OP_ROOT,OP_PARCLOSE};
typedef unsigned char* char_bitmap_p;
/*
*
* OP_CHAR: 1 character, c = character
* complex stuff:
* OP_STDCLASS: standard character class, c = char class, class: 1<<(index into std_class of class name)
* OP_CUSTOMCLASS: custom character class, first pointer in ptr array is a pointer to the bitmap table for this class
* OP_DOT: single . matching any character except \n
* OP_LEAF: this is a leaf node, reinterpret structure
*/
struct tree_node {
struct tree_node* next;/* next regex/complex sibling, or parent, if no more siblings , can't be NULL except for root node*/
unsigned char c;
enum token_op_t op;
char alternatives;/* number of (non-regex) children of node, i.e. sizeof(children)*/
char listend;/* no more siblings, next pointer is pointer to parent*/
union {
struct tree_node** children;/* alternatives nr. of children, followed by (a null pointer terminated) regex leaf node pointers) */
char_bitmap_p* bitmap;
struct leaf_info* leaf;
} u;
};
struct leaf_info {
char* info;/* what does it mean that we reached the leaf...*/
regex_t* preg;/* this is NULL if leaf node, and non-regex*/
};
/* Character classes */
static const char* std_class[] = {
"[:alnum:]",
"[:digit:]",
"[:punct:]",
"[:alpha:]",
"[:graph:]",
"[:space:]",
"[:blank:]",
"[:lower:]",
"[:upper:]",
"[:cntrl:]",
"[:print:]",
"[:xdigit:]"
/* don't change the order of these strings, unless you change them in generate_tables.c too, and regenerate the tables*/
};
#define STD_CLASS_CNT sizeof(std_class)/sizeof(std_class[0])
/* generated by contrib/phishing/generate_tables.c */
static const unsigned char char_class_bitmap[STD_CLASS_CNT][32] = {
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x03,
0xfe, 0xff, 0xff, 0x07, 0xfe, 0xff, 0xff, 0x07,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x03,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0xfe, 0xff, 0x00, 0xfc,
0x01, 0x00, 0x00, 0xf8, 0x01, 0x00, 0x00, 0x78,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xfe, 0xff, 0xff, 0x07, 0xfe, 0xff, 0xff, 0x07,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0xfe, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x3e, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x02, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0xfe, 0xff, 0xff, 0x07,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xfe, 0xff, 0xff, 0x07, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x03,
0x7e, 0x00, 0x00, 0x00, 0x7e, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}
};
static const unsigned short int char_class[256] = {
0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x260, 0x220, 0x220, 0x220, 0x220, 0x200, 0x200,
0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200,
0x460, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414,
0xc13, 0xc13, 0xc13, 0xc13, 0xc13, 0xc13, 0xc13, 0xc13, 0xc13, 0xc13, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414,
0x414, 0xd19, 0xd19, 0xd19, 0xd19, 0xd19, 0xd19, 0x519, 0x519, 0x519, 0x519, 0x519, 0x519, 0x519, 0x519, 0x519,
0x519, 0x519, 0x519, 0x519, 0x519, 0x519, 0x519, 0x519, 0x519, 0x519, 0x519, 0x414, 0x414, 0x414, 0x414, 0x414,
0x414, 0xc99, 0xc99, 0xc99, 0xc99, 0xc99, 0xc99, 0x499, 0x499, 0x499, 0x499, 0x499, 0x499, 0x499, 0x499, 0x499,
0x499, 0x499, 0x499, 0x499, 0x499, 0x499, 0x499, 0x499, 0x499, 0x499, 0x499, 0x414, 0x414, 0x414, 0x414, 0x200,
0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000
};
static const size_t std_class_cnt = sizeof(std_class)/sizeof(std_class[0]);
/* Prototypes */
static int add_pattern(struct regex_matcher* matcher,const unsigned char* pat,const char* info,int hostOnly);
static int match_node(struct tree_node* node,const unsigned char* c,size_t len,const char** info);
static void destroy_tree(struct regex_matcher* matcher);
static struct tree_node* tree_root_alloc(void);
static int build_regex_list(struct regex_matcher* matcher);
static void stack_destroy(struct node_stack* stack);
#ifndef NDEBUG
void dump_tree(struct tree_node* root);
#endif
#define MATCH_SUCCESS 0
#define MATCH_FAILED -1
/*
* Call this function when an unrecoverable error has occured, (instead of exit).
*/
static void fatal_error(struct regex_matcher* matcher)
{
regex_list_done(matcher);
matcher->list_inited = -1;/* the phishing module will know we tried to load a whitelist, and failed, so it will disable itself too*/
}
static inline size_t get_char_at_pos_with_skip(const struct pre_fixup_info* info, const char* buffer, size_t pos)
{
const char* str;
size_t realpos = 0;
if(!info) {
return (pos <= strlen(buffer)) ? buffer[pos>0 ? pos-1:0] : '\0';
}
str = info->pre_displayLink.data;
cli_dbgmsg("calc_pos_with_skip: skip:%lu, %lu - %lu \"%s\",\"%s\"\n", pos, info->host_start, info->host_end, str, buffer);
pos += info->host_start;
while(str[realpos] && !isalnum(str[realpos])) realpos++;
for(; str[realpos] && (pos>0); pos--) {
while(str[realpos]==' ') realpos++;
realpos++;
}
while(str[realpos]==' ') realpos++;
cli_dbgmsg("calc_pos_with_skip:%s\n",str+realpos);
return (pos>0 && !str[realpos]) ? '\0' : str[realpos>0?realpos-1:0];
}
/*
* @matcher - matcher structure to use
* @real_url - href target
* @display_url - <a> tag contents
* @hostOnly - if you want to match only the host part
* @is_whitelist - is this a lookup in whitelist?
*
* @return - CL_SUCCESS - url doesn't match
* - CL_VIRUS - url matches list
*
* Do not send NULL pointers to this function!!
*
*/
int regex_list_match(struct regex_matcher* matcher,char* real_url,const char* display_url,const struct pre_fixup_info* pre_fixup,int hostOnly,const char** info,int is_whitelist)
{
massert(matcher);
massert(real_url);
massert(display_url);
massert(info);
if(!matcher->list_inited)
return 0;
massert(matcher->list_built);
{
size_t real_len = strlen(real_url);
size_t display_len = strlen(display_url);
size_t buffer_len = (hostOnly && !is_whitelist) ? real_len : real_len + display_len + 1 + (is_whitelist ? 1 : 0);
char* buffer = cli_malloc(buffer_len+1);
size_t i;
int rc = 0;
struct cli_ac_data mdata;
if(!buffer)
return CL_EMEM;
strncpy(buffer,real_url,real_len);
buffer[real_len]= (!is_whitelist && hostOnly) ? '\0' : ':';
if(!hostOnly || is_whitelist) {
strncpy(buffer+real_len+1,display_url,display_len);
if(is_whitelist)
buffer[buffer_len - 1] = '/';
buffer[buffer_len]=0;
}
cli_dbgmsg("Looking up in regex_list: %s\n", buffer);
if(hostOnly) {
if((rc = cli_ac_initdata(&mdata, 0, AC_DEFAULT_TRACKLEN)))
return rc;
rc = 0;
for(i = 0; i < matcher->root_hosts_cnt; i++) {
/* doesn't need to match terminating \0*/
rc = cli_ac_scanbuff((unsigned char*)buffer,buffer_len,info, &matcher->root_hosts[i] ,&mdata,0,0,0,-1,NULL);
cli_ac_freedata(&mdata);
if(rc) {
char c;
const char* matched = strchr(*info,':');
const size_t match_len = matched ? strlen(matched+1) : 0;
if(((c=get_char_at_pos_with_skip(pre_fixup,buffer,buffer_len+1))==' ' || c=='\0' || c=='/' || c=='?') &&
(match_len == buffer_len || /* full match */
(match_len < buffer_len &&
((c=get_char_at_pos_with_skip(pre_fixup,buffer,buffer_len-match_len))=='.' || (c==' ')) )
/* subdomain matched*/)) {
cli_dbgmsg("Got a match: %s with %s\n",buffer,*info);
cli_dbgmsg("Before inserting .: %s\n",real_url);
if(real_len >= match_len + 1) {
real_url[real_len-match_len-1]='.';
cli_dbgmsg("After inserting .: %s\n",real_url);
}
break;
}
cli_dbgmsg("Ignoring false match: %s with %s,%c\n",buffer,*info,c);
rc=0;
}
}
} else
rc = 0;
if(!rc)
rc = match_node(hostOnly ? matcher->root_regex_hostonly : matcher->root_regex,(unsigned char*)buffer,buffer_len,info) == MATCH_SUCCESS ? CL_VIRUS : CL_SUCCESS;
free(buffer);
if(!rc)
cli_dbgmsg("Lookup result: not in regex list\n");
else
cli_dbgmsg("Lookup result: in regex list\n");
return rc;
}
}
/* node stack */
#define NODE_STACK_INITIAL 1024
#define NODE_STACK_GROW 4096
/* Initialize @stack */
static int stack_init(struct node_stack* stack)
{
massert(stack);
stack->cnt = 0;
stack->capacity = NODE_STACK_INITIAL;
stack->data = cli_malloc(stack->capacity * sizeof(*stack->data));
if(!stack->data)
return CL_EMEM;
else
return CL_SUCCESS;
}
/* Reset @stack pointer, but don't realloc */
static void stack_reset(struct node_stack* stack)
{
massert(stack);
stack->cnt = 0;
}
/* Push @node on @stack, growing it if necessarry */
static int stack_push(struct node_stack* stack,struct tree_node* node)
{
massert(stack);
massert(stack->data);
if(stack->cnt == stack->capacity) {
stack->capacity += NODE_STACK_GROW;
stack->data = cli_realloc2(stack->data,stack->capacity*sizeof(*stack->data));
if(!stack->data)
return CL_EMEM;
}
stack->data[stack->cnt++] = node;
return CL_SUCCESS;
}
/* Pops node from @stack, doesn't realloc */
static struct tree_node* stack_pop(struct node_stack* stack)
{
massert(stack);
massert(stack->data);
massert(stack->cnt);/*don't pop from empty stack */
return stack->cnt ? stack->data[--stack->cnt] : NULL;
}
/* Initialization & loading */
/* Initializes @matcher, allocating necesarry substructures */
int init_regex_list(struct regex_matcher* matcher)
{
int rc;
massert(matcher);
matcher->list_inited = 0;
matcher->root_hosts_cnt = 0;
matcher->root_hosts = NULL;
matcher->root_hosts_cnt = 0;
matcher->root_regex = tree_root_alloc();
if(!matcher->root_regex) {
return CL_EMEM;
}
matcher->root_regex_hostonly = tree_root_alloc();
if(!matcher->root_regex_hostonly) {
free(matcher->root_regex);
return CL_EMEM;
}
if(( rc = stack_init(&matcher->node_stack) )) {
free(matcher->root_regex_hostonly);
free(matcher->root_regex);
return rc;
}
if(( rc = stack_init(&matcher->node_stack_alt) )) {
free(matcher->root_regex_hostonly);
free(matcher->root_regex);
stack_destroy(&matcher->node_stack);
return rc;
}
matcher->list_inited=1;
matcher->list_built=1;/* its empty, but pretend its built, so that load_ will realloc root_hosts */
matcher->list_loaded=0;
return CL_SUCCESS;
}
/* inserts @pattern into @root, using ac-matcher
* although the name might be confusing, @pattern is not a regex!*/
static int add_regex_list_element(struct cli_matcher* root,const char* pattern,char* info)
{
int ret;
struct cli_ac_patt *new = cli_calloc(1,sizeof(*new));
size_t len,i;
if(!new)
return CL_EMEM;
massert(root);
massert(pattern);
len = strlen(pattern);
/* need not to match \0 too */
new->type = 0;
new->sigid = 0;
new->parts = 0;
new->partno = 0;
new->mindist = 0;
new->maxdist = 0;
new->offset = 0;
new->target = 0;
new->length = len;
if(new->length > root->maxpatlen)
root->maxpatlen = new->length;
new->pattern = cli_malloc(sizeof(new->pattern[0])*len);
if(!new->pattern) {
free(new);
return CL_EMEM;
}
for(i=0;i<len;i++)
new->pattern[i]=pattern[i];/*new->pattern is short int* */
new->virname = cli_strdup(info);
if((ret = cli_ac_addpatt(root,new))) {
free(new->virname);
free(new->pattern);
free(new);
return ret;
}
return CL_SUCCESS;
}
static int functionality_level_check(char* line)
{
char* ptmin;
char* ptmax;
size_t j;
ptmin = strrchr(line,':');
if(!ptmin)
return CL_SUCCESS;
ptmin++;
ptmax = strchr(ptmin,'-');
if(!ptmax)
return CL_SUCCESS;/* there is no functionality level specified, so we're ok */
else {
size_t min, max;
ptmax++;
for(j=0;j+ptmin+1 < ptmax;j++)
if(!isdigit(ptmin[j]))
return CL_SUCCESS;/* not numbers, not functionality level */
for(j=0;j<strlen(ptmax);j++)
if(!isdigit(ptmax[j]))
return CL_SUCCESS;/* see above */
ptmax[-1]='\0';
min = atoi(ptmin);
if(strlen(ptmax)==0)
max = INT_MAX;
else
max = atoi(ptmax);
if(min > cl_retflevel()) {
cli_dbgmsg("regex list line %s not loaded (required f-level: %u)\n",line,(unsigned int)min);
return CL_EMALFDB;
}
if(max < cl_retflevel())
return CL_EMALFDB;
ptmin[-1]='\0';
return CL_SUCCESS;
}
}
/* Load patterns/regexes from file */
int load_regex_matcher(struct regex_matcher* matcher,FILE* fd,unsigned int options,int is_whitelist)
{
int rc,line=0;
char buffer[FILEBUFF];
massert(matcher);
massert(fd);
if(matcher->list_inited==-1)
return CL_EMALFDB; /* already failed to load */
/* if(matcher->list_loaded) {
cli_warnmsg("Regex list has already been loaded, ignoring further requests for load\n");
return CL_SUCCESS;
}*/
if(!fd) {
cli_errmsg("Unable to load regex list (null file)\n");
return CL_EIO;
}
cli_dbgmsg("Loading regex_list\n");
if(!matcher->list_inited) {
rc = init_regex_list(matcher);
if (!matcher->list_inited) {
cli_errmsg("Regex list failed to initialize!\n");
fatal_error(matcher);
return rc;
}
/*atexit(regex_list_done); TODO: destroy this in manager.c */
}
/*
* Regexlist db format (common to .wdb(whitelist) and .pdb(domainlist) files:
* Multiple lines of form, (empty lines are skipped):
* Flags RealURL DisplayedURL
* Where:
* Flags:
*
* .pdb files:
* R - regex, H - host-only, followed by (optional) 3-digit hexnumber representing
* flags that should be filtered.
* [i.e. phishcheck urls.flags that we don't want to be done for this particular host]
*
* .wdb files:
* X - full URL regex
* Y - host-only regex
* M - host simple pattern
*
* If a line in the file doesn't conform to this format, loading fails
*
*/
while(fgets(buffer,FILEBUFF,fd)) {
char* pattern;
char* flags;
cli_chomp(buffer);
if(!*buffer)
continue;/* skip empty lines */
if(functionality_level_check(buffer))
continue;
line++;
pattern = strchr(buffer,':');
if(!pattern) {
cli_errmsg("Malformed regex list line %d\n",line);
fatal_error(matcher);
return CL_EMALFDB;
}
/*pattern[0]='\0';*/
flags = buffer+1;
pattern++;
if(is_whitelist) {
const size_t pattern_len = strlen(pattern);
if(pattern_len < FILEBUFF) {
pattern[pattern_len] = '/';
pattern[pattern_len+1] = '\0';
}
else {
cli_errmsg("Overlong regex line %d\n",line);
fatal_error(matcher);
return CL_EMALFDB;
}
}
if((buffer[0] == 'R' && !is_whitelist) || ((buffer[0] == 'X' || buffer[0] == 'Y') && is_whitelist)) {/*regex*/
if(( rc = add_pattern(matcher,(const unsigned char*)pattern,flags, buffer[0] == 'Y') ))
return rc==CL_EMEM ? CL_EMEM : CL_EMALFDB;
}
else if( ( buffer[0] == 'H' && !is_whitelist) || (buffer[0] == 'M' && is_whitelist)) {/*matches displayed host*/
struct cli_matcher* root;
if(matcher->list_built) {
struct cli_matcher* old_hosts = matcher->root_hosts;
matcher->root_hosts_cnt++;
matcher->root_hosts = cli_realloc(matcher->root_hosts, matcher->root_hosts_cnt * sizeof(*matcher->root_hosts));
if(!matcher->root_hosts) {
matcher->root_hosts = old_hosts;/* according to manpage this must still be valid*/
return CL_EMEM;
}
root = &matcher->root_hosts[matcher->root_hosts_cnt-1];
memset(root, 0, sizeof(struct cli_matcher));
cli_dbgmsg("regex_list: Initialising AC pattern matcher\n");
if((rc = cli_ac_init(root, cli_ac_mindepth, cli_ac_maxdepth))) {
/* no need to free previously allocated memory here */
cli_errmsg("regex_list: Can't initialise AC pattern matcher\n");
return rc;
}
matcher->list_built = 0;
}
else {
root = &matcher->root_hosts[matcher->root_hosts_cnt-1];
}
if(( rc = add_regex_list_element(root,pattern,flags) ))
return rc==CL_EMEM ? CL_EMEM : CL_EMALFDB;
}
else {
return CL_EMALFDB;
/* this is useless, we have host, and regex matches
if(( rc = add_regex_list_element(matcher->root_urls,pattern,flags) ))
return rc==CL_EMEM ? CL_EMEM : CL_EMALFDB;*/
}
}
matcher->list_loaded = 1;
if(( rc = build_regex_list(matcher) ))
return rc;
#ifndef NDEBUG
/* dump_tree(matcher->root_regex);*/
#endif
if(!matcher->list_built) {
cli_errmsg("Regex list not loaded: build failed!\n");
fatal_error(matcher);
return CL_EMALFDB;
}
regex_list_cleanup(matcher);
return CL_SUCCESS;
}
static struct tree_node ** tree_node_get_children(const struct tree_node* node)
{
return node->op==OP_CUSTOMCLASS ? (node->u.children[1] ? node->u.children+1 : NULL) :node->u.children;
}
/* Build the matcher list */
static int build_regex_list(struct regex_matcher* matcher)
{
int rc;
if(!matcher->list_inited || !matcher->list_loaded) {
cli_errmsg("Regex list not loaded!\n");
return -1;/*TODO: better error code */
}
cli_dbgmsg("Building regex list\n");
if(matcher->root_hosts)
if(( rc = cli_ac_buildtrie(&matcher->root_hosts[matcher->root_hosts_cnt-1]) ))
return rc;
matcher->list_built=1;
return CL_SUCCESS;
}
/* Done with this matcher, free resources */
void regex_list_done(struct regex_matcher* matcher)
{
massert(matcher);
regex_list_cleanup(matcher);
if(matcher->list_loaded) {
if(matcher->root_hosts) {
size_t i;
for(i=0;i<matcher->root_hosts_cnt;i++)
cli_ac_free(&matcher->root_hosts[i]);
free(matcher->root_hosts);
matcher->root_hosts=NULL;
}
matcher->root_hosts_cnt=0;
matcher->list_built=0;
destroy_tree(matcher);
matcher->list_loaded=0;
}
if(matcher->list_inited) {
matcher->list_inited=0;
}
stack_destroy(&matcher->node_stack);
stack_destroy(&matcher->node_stack_alt);
}
/* Tree matcher algorithm */
struct token_t
{
union {
const unsigned char* start;
char_bitmap_p bitmap;
unsigned char c;
} u;
size_t len;
char type;
};
enum {TOKEN_CHAR,TOKEN_DOT,TOKEN_PAR_OPEN,TOKEN_PAR_CLOSE,TOKEN_BRACKET,TOKEN_ALT,TOKEN_REGEX,TOKEN_DONE};
static const unsigned char* getNextToken(const unsigned char* pat,struct token_t* token)
{
massert(pat);
massert(token);
switch(*pat) {
case '\\':
token->type=TOKEN_CHAR;
token->u.c = *(++pat);
if(islower(token->u.c)) {
/* handle \n, \t, etc. */
char fmt[3] = {'\\', '\0', '\0'};
char c;
fmt[1] = token->u.c;
if(snprintf(&c,1,fmt)!=1) {
token->type=TOKEN_REGEX;
token->u.start = pat;
}
else
token->u.c=c;
}
token->len = 1;
break;
case '|':
token->type=TOKEN_ALT;
break;
case '*':
case '+':
case '?':
case '{':
case '}':
token->type=TOKEN_REGEX;
break;
case '[':
{
/*TODO: implement*/
/*see if it is something simple like a list of characters, a range, or negated ...*/
const unsigned char* old=pat++;/* save this in case we change our mind and decide this is too complicated for us to handle*/
unsigned char range_start=0;
int hasprev = 0;
char_bitmap_p bitmap = cli_malloc(32);
if(!bitmap)
return NULL;
if (*pat=='^') {
memset(bitmap,0xFF,32);/*match chars not in brackets*/
pat++;
}
else
memset(bitmap,0x00,32);
do {
/* literal ] can be first character, so test for it at the end of the loop, for example: []] */
if (*pat=='-' && hasprev) {
/* it is a range*/
unsigned char range_end;
unsigned int c;
massert(range_start);
pat++;
if (pat[0]=='[')
if (pat[1]=='.') {
if(pat[2]=='-' && pat[3]=='.' && pat[4]==']')
range_end = '-';
else {
/* this is getting complicated, bail out */
cli_warnmsg("confused about collating sequences in regex,bailing out");
pat=old;
token->type=TOKEN_REGEX;
break;
}
}
else
range_end = *pat;
else
range_end = *pat;
for(c=range_start+1;c<=range_end;c++)
bitmap[c>>3] ^= 1<<(c&0x7);
hasprev = 0;
}
else if (pat[0]=='[' && pat[1]==':') {
const unsigned char* end;
int len,found=-1;
size_t i;
pat+=2;
end=(unsigned char*)strstr((const char*)pat,":]");
if(!end) {
cli_warnmsg("confused about std char class syntax regex,bailing out");
pat=old;
token->type=TOKEN_REGEX;
break;
}
len = end-pat;
for(i=0;i<std_class_cnt;i++)
if(!strncmp((const char*)pat,std_class[i],len)) {
found=i;
break;
}
if(found!=-1) {
for(i=0;i<256;i++)
if(char_class[i]&(1<<found))
bitmap[i>>3] ^= 1<<(i&0x7);
}
else {
/*unknown class*/
cli_warnmsg("confused about regex bracket expression, bailing out");
pat=old;
token->type=TOKEN_REGEX;
break;
}
}
else {
bitmap[*pat>>3] ^= 1<<(*pat&0x7);
pat++;
range_start = *pat;
hasprev = 1;
}
} while(*pat!=']');
/*TODO: see if this bitmap already exists, then reuse*/
token->type = TOKEN_BRACKET;
token->u.bitmap = bitmap;
break;
}
case ']':
massert(0 && "Encountered ] without matching [");
/* bad state */
break;
case '.':
token->type=TOKEN_DOT;
break;
case '(':
token->type=TOKEN_PAR_OPEN;
break;
case ')':
token->type=TOKEN_PAR_CLOSE;
break;
default:
token->type=TOKEN_CHAR;
token->u.c = *pat;
token->len=1;
break;
}
return ++pat;
}
#define INITIAL_ALT_STACK 10
#define ALT_STACK_GROW 20
static const unsigned char* find_regex_start(const unsigned char* pat)
{
struct token_t token;
/*TODO: find where the regex part begins, for ex:
* abcd+, regex begins at 'd'
* */
const unsigned char* last=NULL;
const unsigned char* tmp=NULL;
const unsigned char** altpositions = cli_malloc(INITIAL_ALT_STACK*sizeof(*altpositions));
size_t altpositions_capacity = INITIAL_ALT_STACK;
size_t altpositions_cnt = 0;
char lasttype = -1;
if(!altpositions)
return NULL;
massert(pat);
/* Try to parse pattern till special regex chars are encountered, that the tree-matcher doesn't handle, like: +,*,{}.
* The tricky part is that once we encounter these, the previous 'atom' has to be passed on to the regex matcher, so we have to
* back up to the last known good position
* Example, if we have: abc(defg)+, then only abc can be handled by tree parser, so we have to return the position of (.
* Another example: abc(defg|xyz|oz+|pdo), the last known good position is |, after xyz
* TODO: what about open parantheses? maybe once we found a special char, we have top back out before the first (?
* */
do {
tmp = pat;
pat = getNextToken(pat,&token);
if(token.type!=TOKEN_REGEX) {
last = tmp;
lasttype = token.type;
if(token.type==TOKEN_BRACKET && token.u.bitmap)
free(token.u.bitmap);
if(token.type==TOKEN_ALT || token.type==TOKEN_PAR_OPEN) {
/* save this position on stack, succesfully parsed till here*/
if(altpositions_cnt && altpositions[altpositions_cnt-1][0]=='|')
/* encountered another alternate (|) operator, override previous | position stored */
altpositions[altpositions_cnt-1]=last;
else {
altpositions[altpositions_cnt++] = last;
if(altpositions_cnt == altpositions_capacity) {
altpositions_capacity += ALT_STACK_GROW;
altpositions = cli_realloc2(altpositions,altpositions_capacity*sizeof(*altpositions));
if(!altpositions)
return NULL;
}
}
} else if (lasttype==TOKEN_PAR_CLOSE) {
/* remove last stored position from stack, succesfully this last group */
altpositions_cnt--;
massert(altpositions_cnt>0);
}
}
else {
if(altpositions_cnt)
last = altpositions[0 /*altpositions_cnt-1*/];/*TODO: which index here?, see above TODO... */
/*last stored 'safe' position where no special (+,*,{}) regex chars were encountered*/
}
} while(*pat && token.type!=TOKEN_REGEX);
free(altpositions);
return *pat ? last : last+1;
}
static struct tree_node* tree_node_alloc(struct tree_node* next,char listend)
{
struct tree_node* node = cli_malloc(sizeof(*node));
if(node) {
node->alternatives=0;
node->next=next;
node->listend=listend;
node->u.children=NULL;
}
return node;
}
static struct tree_node* tree_root_alloc(void)
{
struct tree_node* root=tree_node_alloc(NULL,1);
if(root) {
root->op=OP_ROOT;
root->c=0;
root->next=NULL;
root->listend=1;
}
return root;
}
static struct tree_node* tree_node_char_binsearch(const struct tree_node* node,const char csearch,int* left)
{
int right;
struct tree_node **children;
massert(node);
massert(left);
children = tree_node_get_children(node);
right = node->alternatives-1;
*left = 0;
if(!node->alternatives)
return NULL;
massert(children);
while(*left<=right) {
int mid = *left+(right-*left)/2;
if(children[mid]->c == csearch)
return children[mid];
else if(children[mid]->c < csearch)
*left=mid+1;
else
right=mid-1;
}
return NULL;
}
static struct tree_node* tree_get_next(struct tree_node* node)
{
struct tree_node** children;
massert(node);
children = tree_node_get_children(node);
if(!node->alternatives && children && children[0])
return children[0];
else if(node->alternatives<=1)
return node;
else
return children[0]->next;
}
static size_t tree_node_get_array_size(const struct tree_node* node)
{
massert(node);
/* if op is CUSTOMCLASS, then first pointer is pointer to bitmap, so array size is +1 */
return (node->alternatives + (node->op==OP_CUSTOMCLASS ? 1 : 0)) * sizeof(node->u.children[0]);
}
static struct tree_node* tree_node_char_insert(struct tree_node* node,const char c,int left)
{
struct tree_node* new, *alt = tree_get_next(node);
struct tree_node **children;
node->alternatives++;
node->u.children = cli_realloc2(node->u.children,tree_node_get_array_size(node));
if(!node->u.children)
return NULL;
children = node->op==OP_CUSTOMCLASS ? node->u.children+1 : node->u.children;
new = tree_node_alloc(alt , node == alt );
if(new) {
new->op=OP_CHAR;
new->c=c;
}
if(node->alternatives-left-1>0)
memmove(&children[left+1],&children[left],(node->alternatives-left-1)*sizeof(node->u.children[0]));
children[left] = new;
return new;
}
static void tree_node_insert_nonbin(struct tree_node* node, struct tree_node* new)
{
struct tree_node **children;
massert(node);
massert(new);
children = tree_node_get_children(node);
if(node->alternatives) {
massert(children);
if(children[0]->next == node) {
int i;
new->listend = 1;
for(i=0;i<node->alternatives;i++) {
children[i]->next = new;
children[i]->listend = 0;
}
}
else {
struct tree_node* p;
for(p = children[0]->next ; p->next != node ; p = p->next)
massert(!p->listend);
new->listend = 1;
p->listend = 0;
p->next = new;
}
}
else {
int idx = node->op==OP_CUSTOMCLASS ? 1 : 0;
if(node->u.children)
if(node->u.children[idx]) {
node = node->u.children[idx];
while(node->next && !node->listend)
node = node->next;
node->listend = 0;
new->next = node->next;
node->next = new;
new->listend=1;
return;
}
node->u.children = cli_realloc2(node->u.children,sizeof(node->u.children[0])*(2));
if(node->u.children) {
node->u.children[idx] = new;
}
}
}
static unsigned char char_getclass(const unsigned char* bitmap)
{
size_t i;
massert(bitmap);
for(i=0;i<std_class_cnt;i++)
if(!memcmp(bitmap,char_class_bitmap[i],256>>3))
return i;
return std_class_cnt;
}
static void stack_destroy(struct node_stack* stack)
{
massert(stack);
if(stack->data)
free(stack->data);
stack->data = NULL;
stack->capacity = 0;
}
/* call this after whitelist load is complete, and the tree is no longer going to be modified */
void regex_list_cleanup(struct regex_matcher* matcher)
{
massert(matcher);
stack_destroy(&matcher->node_stack);
stack_destroy(&matcher->node_stack_alt);
stack_init(&matcher->node_stack);
stack_init(&matcher->node_stack_alt);
}
int is_regex_ok(struct regex_matcher* matcher)
{
massert(matcher);
return (!matcher->list_inited || matcher->list_inited!=-1);/* either we don't have a regexlist, or we initialized it successfully */
}
/* returns 0 on success, regexec error code otherwise */
static int add_pattern(struct regex_matcher* matcher,const unsigned char* pat,const char* info, int hostonly)
{
int bol=1;
const unsigned char* pat_end = find_regex_start(pat);
struct token_t token;
struct tree_node* node;
massert(matcher);
node = hostonly ? matcher->root_regex_hostonly : matcher->root_regex;
stack_reset(&matcher->node_stack);
stack_reset(&matcher->node_stack_alt);
stack_push(&matcher->node_stack,node);
for(;node->op!=OP_LEAF;){
if(pat<pat_end)
pat = getNextToken(pat,&token);
else if(*pat) {
token.type = TOKEN_REGEX;
token.u.start=pat;
}
else
token.type = TOKEN_DONE;
switch(token.type) {
case TOKEN_CHAR:
{
/* search for char in tree */
int left;
struct tree_node* newnode = tree_node_char_binsearch(node,token.u.c,&left);
if(newnode)
node = newnode;
else {
/* not found, insert it */
node = tree_node_char_insert(node,token.u.c,left);
}
break;
}
case TOKEN_PAR_OPEN:
stack_push(&matcher->node_stack_alt,NULL);/* marker */
stack_push(&matcher->node_stack,node);
break;
case TOKEN_PAR_CLOSE: {
/*TODO: test this!!!*/
struct tree_node* node_alt = node;
node = tree_node_alloc(NULL,1);
node->op=OP_PARCLOSE;
node->c=0;
node->listend=1;
tree_node_insert_nonbin(node_alt,node);
while (( node_alt = stack_pop(&matcher->node_stack_alt) )) {
tree_node_insert_nonbin(node_alt,node);
}
stack_pop(&matcher->node_stack);
break;
}
case TOKEN_ALT:
stack_push(&matcher->node_stack_alt,node);
node = stack_pop(&matcher->node_stack);
stack_push(&matcher->node_stack,node);
break;
case TOKEN_BRACKET:
{
struct tree_node* new = tree_node_alloc(tree_get_next(node),1);
unsigned char charclass = char_getclass(token.u.bitmap);
if(charclass == std_class_cnt) {/*not a std char class*/
new->op = OP_CUSTOMCLASS;
new->u.children = cli_malloc(sizeof(new->u.children[0])*2);
if(!new->u.children)
return CL_EMEM;
new->u.bitmap[0] = token.u.bitmap;
new->u.bitmap[1] = NULL;
tree_node_insert_nonbin(node,new);
node = new;
}
else {
new->op = OP_STDCLASS;
new->c = charclass;
tree_node_insert_nonbin(node,new);
node=new;
}
break;
}
case TOKEN_DOT:
{
struct tree_node* new = tree_node_alloc(tree_get_next(node),1);
new->op = OP_DOT;
tree_node_insert_nonbin(node,new);
node=new;
break;
}
case TOKEN_REGEX:
case TOKEN_DONE: {
struct leaf_info* leaf=cli_malloc(sizeof(*leaf));
if(!leaf)
return CL_EMEM;
leaf->info = cli_strdup(info);
if(token.type==TOKEN_REGEX) {
int rc;
struct tree_node* new;
regex_t* preg;
preg=cli_malloc(sizeof(*preg));
if(!preg)
return CL_EMEM;
rc = cli_regcomp(preg,(const char*)token.u.start,REG_EXTENDED|(bol?0:REG_NOTBOL));
leaf->preg=preg;
if(rc)
return rc;
new=cli_malloc(sizeof(*new));
if(!new)
return CL_EMEM;
new->op=OP_LEAF;
new->next=node;
new->alternatives=0;
new->u.leaf=leaf;
new->listend=1;
tree_node_insert_nonbin(node,new);
}
else {
leaf->preg=NULL;
node->alternatives=0;
node->u.leaf=leaf;
node->op=OP_LEAF;
}
return 0;
}
}
bol=0;
}
return 0;
}
/* c has to be unsigned char here!! */
static int match_node(struct tree_node* node,const unsigned char* c,size_t len,const char** info)
{
struct tree_node** children;
int rc;
massert(node);
massert(c);
massert(info);
if(!node->u.children)
return MATCH_FAILED;/* tree empty */
*info = NULL;
len++;
c--;
for(;;) {
massert(node);
children = node->u.children;
switch(node->op) {
case OP_ROOT:
rc=1;
break;
case OP_PARCLOSE:
/*this isn't a real character, so don't move*/
c--;
len++;
rc=1;
break;
case OP_CHAR:
massert(*c==node->c && "We know this has to match");
rc = 1;/* *c==node->c;- we know it has matched */
break;
case OP_DOT:
rc = *c!='\n';
break;
case OP_STDCLASS:
rc = char_class[*c]&(node->c);
break;
case OP_CUSTOMCLASS:
{
char_bitmap_p bitmap;
massert(children);
bitmap = (char_bitmap_p)node->u.bitmap[0];
children++;
rc = bitmap[*c>>3]&(1<<(*c&0x7));
break;
}
case OP_LEAF:
{
const struct leaf_info* leaf = node->u.leaf;
/*isleaf = 1;*/
if(leaf->preg) {
rc = !cli_regexec(leaf->preg,(const char*)c,0,NULL,0);
}
else {
massert(*c==node->c && "We know this has to match[2]");
rc = 1;
}
if(rc) {
*info = leaf->info;
return MATCH_SUCCESS;
}
break;
}
default:
/* impossible */
cli_errmsg("Encountered invalid operator in tree:%d\n",node->op);
exit(1);
}
len--;
if(!len) rc=0;
c++;
if(rc) {
const char csearch = *c;
int left = 0,right = node->alternatives-1;
int mid;
/*matched so far, go deeper*/
/*do a binary search between children */
massert(children);
while(left<=right) {
mid = left+(right-left)/2;
if (children[mid]->c == csearch)
break;
else if(children[mid]->c < csearch)
left=mid+1;
else
right=mid-1;
}
if(left<=right) {
node = children[mid];
massert(node);
}
else {
if(node->alternatives) {
if(!children[0]->listend) {
node = children[0];
c++;
len--;
}
while(node && node->listend) {
node = node->next;/* climb up */
c--;
len++;
}
if(!node || !node->next)
return MATCH_FAILED;/* reached root node */
node=node->next;
c--;
len++;
}
else if(node->u.children) {
struct tree_node* rewrite_next = NULL;
if(node->op==OP_PARCLOSE)
rewrite_next = node;
node = children[0];
massert(node);
massert(node->op!=OP_CHAR);
if(rewrite_next)
node->next = rewrite_next;/* this node is pointed to by several parent nodes,
we need to know
from which one we came, so we can find out way back
should we fail to match somewhere deeper*/
}
}
}
else {
/* this node didn't match, try sibling, or parent (if no more siblings) */
while(node && node->listend) {
node = node->next;/* sibling of parent */
c--;
len++;
}
if(!node || !node->next) /* reached root node, it has no next */
return MATCH_FAILED;
else {
c--;
len++;
node=node->next;
}
}
}
return MATCH_FAILED;
}
/* push node on stack, only if it isn't there already */
static void stack_push_once(struct node_stack* stack,struct tree_node* node)
{
size_t i;
massert(stack);
massert(node);
for(i=0;i < stack->cnt;i++)
if(stack->data[i]==node)
return;
stack_push(stack,node);
}
static void destroy_tree_internal(struct regex_matcher* matcher,struct tree_node* node)
{
struct tree_node **children;
massert(matcher);
massert(node);
children = tree_node_get_children(node);
if(node->op==OP_LEAF) {
struct leaf_info* leaf = node->u.leaf;
if(node->next && !node->listend)
destroy_tree_internal(matcher,node->next);
stack_push_once(&matcher->node_stack,(struct tree_node*)node->u.leaf);/* cast to make compiler happy, and to not make another stack implementation for storing void* */
stack_push_once(&matcher->node_stack,node);
if(leaf->preg) {
cli_regfree(leaf->preg);
free(leaf->preg);
leaf->preg=NULL;
}
if(leaf->info) {
free(leaf->info);
leaf->info=NULL;
}
/* return;*/
}
if(node->alternatives) {
int i;
struct tree_node* p;
massert(children);
p = children[0]->op==OP_LEAF ? NULL : children[0]->next;
for(i=0;i<node->alternatives;i++)
destroy_tree_internal(matcher,children[i]);
if(p && p!=node)
destroy_tree_internal(matcher,p);/*?? is this ok, or without _internal?*/
}
else {
if(children) {
if(children[0])
destroy_tree_internal(matcher,children[0]);
}
}
if(node->op!=OP_LEAF && node->next && !node->listend)
destroy_tree_internal(matcher,node->next);
if(node->u.children)
stack_push_once(&matcher->node_stack,(struct tree_node*)node->u.children);/* cast to make compiler happy, it isn't really a tree_node* */
if(node->op==OP_CUSTOMCLASS && node->u.children[0]) {
free(node->u.children[0]);
node->u.children[0]=NULL;
}
stack_push_once(&matcher->node_stack,node);
}
static void destroy_tree(struct regex_matcher* matcher)
{
/* we might have the same node linked by different nodes, so a recursive walk&free doesn't work in all situations,
* i.e. it might double-free, so instead of freeing, just push the nodes on a stack, and later free the nodes in that stack,
* (and push to stack only if it doesn't contain it already*/
massert(matcher);
stack_reset(&matcher->node_stack);
destroy_tree_internal(matcher,matcher->root_regex);
destroy_tree_internal(matcher,matcher->root_regex_hostonly);
while (matcher->node_stack.cnt) {
struct tree_node* node = stack_pop(&matcher->node_stack);
if(node)
free(node);
}
}
#ifndef NDEBUG
static void dump_node(struct tree_node* node)
{
int i;
struct tree_node* p,**children;
massert(node);
if(node->op==OP_LEAF) {
if(node->u.leaf->preg)
printf("n%p [label=\"regex\\nleaf\"]",(void*)node);
else
printf("n%p [label=\"%c\\nleaf\"];\n",(void*)node,node->c);
if(node->next && !node->listend) {
printf("n%p -> n%p;\n",(void*)node,(void*)node->next);
dump_node(node->next);
}
return;
}
printf("n%p [label=\"%c\\n%d\\nlistend:%d\"];\n",(void*)node,(node->op==OP_ROOT||node->op==OP_PARCLOSE) ?'@' :node->c,node->op,node->listend);
if(node->next)
printf("n%p -> n%p;\n",(void*)node,(void*)node->next);
printf("n%p -> {",(void*)node);/*using address of node as id*/
children = tree_node_get_children(node);
if(node->alternatives)
massert(children);
for(i=0;i<node->alternatives;i++)
printf("n%p ",(void*)children[i]);
if(node->alternatives && children[0]->op!=OP_LEAF)
for(p=children[0]->next;p!=node;p=p->next)
{
massert(p);
printf("n%p ",(void*)p);
if(p->op==OP_LEAF || p->listend)
break;
}
if(!node->alternatives && children && children[0])
printf("n%p ",(void*)children[0]);
printf("};\n");
printf("{rank=same;");
for(i=0;i<node->alternatives;i++)
printf("n%p ",(void*)node->u.children[i]);
if(node->alternatives && children[0]->op!=OP_LEAF)
for(p=children[0]->next;p!=node;p=p->next)
{
printf("n%p ",(void*)p);
if(p->op==OP_LEAF || p->listend)
break;
}
if(!node->alternatives && children && children[0])
printf("n%p ",(void*)children[0]);
printf("};\n");
for(i=0;i<node->alternatives;i++)
dump_node(children[i]);
if(node->alternatives && children[0]->op!=OP_LEAF)
for(p=children[0]->next;p!=node;p=p->next)
{
dump_node(p);
if(p->op==OP_LEAF || p->listend)
break;
}
if(!node->alternatives && children && children[0])
dump_node(children[0]);
}
void dump_tree(struct tree_node* root)
{
/*use dot/dotty from graphviz to view it*/
massert(root);
printf("digraph tree {\n");
dump_node(root);
printf("}\n");
}
#endif
#else
/*------------------------New version of regex_list.c------------------------*/
/* Regex_list.c:
* A scalable, trie-based implementation for matching against
* a list of regular expressions.
*
* A trivial way to implement matching against a list of regular expressions
* would have been to construct a single regular expression, by concatenating
* the list with the alternate (|) operator.
* BUT a usual DFA implementation of regular expression matching (eg.: GNU libc)
* leads to "state explosion" when there are many (5000+) alternate (|) operators.
* This is the reason for using a trie-based implementation.
*
*
* Design considerations:
*
* Recursive call points: there are situations when match has to be retried on a different sub-trie, or with a different repeat count.
* Alternate operators, and repeat/range operators (+,*,{}) are recursiv call points. When a failure is encountered during a match,
* the function simply returns from the recursive call, and ends up at a failure point (recursive call point).
*
* "go to parent" below actually means, return from recursive call.
*
* fail_action: we need to return to closest failure point (recursive call point),
* and switch current node to node pointed by fail_action
*
* Node types:
* OP_ROOT: contains information that applies to the entire trie.
* it can only appear as root node, and not as child node.
* On child fail: match has failed
* This is NOT a recursive call point
* OP_CHAR_BINARY_SEARCH: chooses a sub-trie, based on current character;
* using binary-search
* On fail: go to node indicated by fail_action, or if
* fail_action is NULL, to parent
* On child fail: execute fail of current node
* OP_ALTERNATIVES: try matching each sub-trie, if all fails execute fail
* action of current node. This is a recursive call point
* OP_CHAR_REPEAT: repeat specified character a number of times in range:
* [min_range, max_range];
* min_range: 0 for * operator
* 1 for + operator
* max_range: remaining length of current string for *,+ operator
* OR: min_range, max_range as specified by the {min,max} operator
* On fail: fail_action, or parent if NULL
* On child fail: reduce match repeat count, try again on child, if
* repeat count<min_range, execute fail of current node
* Also has a bitmap on what characters are accepted beyond it,
* as an optimizations for the case, when a maximum match isn't possible
* Not recomended to use this when min_range=max_range=1
* This is a recursive call point
* OP_DOT_REPEAT: like OP_CHAR_REPEAT but accept any character
* Not recomended to use this when min_range=max_range=1
* This is a recursive call point
* OP_GROUP_START: start of a group "(", also specifies group flags:
* repeat: is_repeat, min_range, max_range
* This is a recursive call point if is_repeat
* OP_GROUP_END: end of group ")"
* OP_STRCMP: compare with specified string,
* it has an array of fail actions, one for each character
* default fail action: go to parent
* This was introduced from memory- and speed-efficiency
* considerations.
* OP_CHAR_CLASS_REPEAT: match character with character class
* min_range, max_range
* For a single character class min_range=max_range=1
* OP_MATCH_OK: match has succeeded
*
* TODO: maybe we'll need a more efficient way to choose between character classes.
* OP_DOT_REPEAT/OP_CHAR_REPEAT needs a more efficient specification of its failure function, instead of using
* backtracking approach.
*
* The failure function/action is just for optimization, the match algorithms works even without it.
* TODO:In this first draft fail action will always be NULL, in a later version I'll implement fail actions too.
*
*
*/
#include <string.h>
#include "cltypes.h"
#include "others.h"
/* offsetof is not ANSI C */
#ifndef offsetof
# define offsetof(type,memb) ((size_t)&((type*)0)->memb)
#endif
#define container_of(ptr, type, member) ( (type *) ((char *)ptr - offsetof(type, member)) )
#define container_of_const(ptr, type, member) ( (type *) ((const char *)ptr - offsetof(type, member)) )
enum trie_node_type {
OP_ROOT,
OP_CHAR_BINARY_SEARCH,
OP_ALTERNATIVES,
OP_CHAR_REPEAT,
OP_DOT_REPEAT,
OP_CHAR_CLASS_REPEAT,
OP_STRCMP,
OP_GROUP_START,
OP_GROUP_END,
OP_MATCH_OK
};
/* the comon definition of a trie node */
struct trie_node
{
enum trie_node_type type;
};
struct trie_node_match {
struct trie_node node;
/* additional match info */
};
struct trie_node_root
{
struct trie_node node;
struct trie_node* child;
};
struct trie_node_binary_search
{
struct trie_node node;
uint8_t children_count;/* number of children to search among -1! 255 = 256 children*/
struct trie_node* fail_action;
unsigned char* char_choices;/* children_count elements */
struct trie_node** children;/*children_count elements */
};
struct trie_node_alternatives
{
struct trie_node node;
uint32_t alternatives_count;
/* need to support node with lots of alternatives,
* for a worst-case scenario where each line ends up as a sub-trie of OP_ALTERNATIVES*/
struct trie_node* fail_action;
struct trie_node** children;
};
struct trie_node_char_repeat
{
struct trie_node node;
unsigned char character;
uint8_t range_min, range_max;/* according to POSIX we need not support more than 255 repetitions*/
struct char_bitmap* bitmap_accept_after;/* bitmap of characters accepted after this,
to optimize repeat < max_range case; if its NULL
there is no optimization*/
struct trie_node* child;
struct trie_node* fail_action;
};
struct trie_node_dot_repeat
{
struct trie_node node;
uint8_t range_min, range_max;/* according to POSIX we need not support more than 255 repetitions*/
struct char_bitmap* bitmap_accept_after;/* bitmap of characters accepted after this,
to optimize repeat < max_range case; if its NULL
there is no optimization*/
struct trie_node* child;
struct trie_node* fail_action;
};
struct trie_node_group_start
{
struct trie_node node;
uint8_t range_min, range_max;/* if range_min==range_max==1, then this is NOT a repeat, thus not a recursive call point*/
struct trie_node* child;
struct trie_node* fail_action;
};
struct trie_node_group_end
{
struct trie_node node;
struct trie_node* child;
};
struct trie_node_strcmp
{
struct trie_node node;
uint8_t string_length;/* for longer strings a sequence of node_strcmp should be used */
unsigned char* string;
struct trie_node* child;
struct trie_node** fail_actions;/* this has string_length elements, or NULL if no fail_actions are computed */
};
struct trie_node_char_class_repeat
{
struct trie_node node;
struct char_bitmap* bitmap;
struct char_bitmap* bitmap_accept_after;
uint8_t range_min, range_max;
struct trie_node* child;
struct trie_node* fail_action;
};
static inline int bitmap_accepts(const struct char_bitmap* bitmap, const char c)
{
/* TODO: check if c is accepted by bitmap */
return 0;
}
#define MATCH_FAILED 0
#define MATCH_OK 1
#define FAIL_ACTION( fail_node ) (*fail_action = (fail_node), MATCH_FAILED)
#ifndef MIN
#define MIN(a,b) ((a)<(b) ? (a) : (b))
#endif
static int match_node(const struct trie_node* node, const unsigned char* text, const unsigned char* text_end, const struct trie_node** fail_action);
static int match_repeat(const unsigned char* text, const unsigned char* text_end, const size_t range_min, const size_t repeat_start,
const struct char_bitmap* bitmap_accept_after, const struct trie_node* child, const struct trie_node** fail_action,
const struct trie_node* this_fail_action)
{
size_t i;
for(i = repeat_start;i > range_min;i--) {
if(!bitmap_accept_after || bitmap_accepts( bitmap_accept_after, text[i-1])) {
int rc = match_node(child, &text[i], text_end, fail_action);
/* ignore fail_action for now, we have the bitmap_accepts_after optimization */
if(rc) {
return MATCH_OK;
}
}
}
if(!range_min) {
/* this match is optional, try child only */
int rc = match_node(child, text, text_end, fail_action);
if(rc) {
return MATCH_OK;
}
}
return FAIL_ACTION(this_fail_action);
}
/* text_end points to \0 in text */
static int match_node(const struct trie_node* node, const unsigned char* text, const unsigned char* text_end, const struct trie_node** fail_action)
{
while(node && text < text_end) {
switch(node->type) {
case OP_ROOT:
{
const struct trie_node_root* root_node = container_of_const(node, const struct trie_node_root, node);
node = root_node->child;
break;
}
case OP_CHAR_BINARY_SEARCH:
{
const struct trie_node_binary_search* bin_node = container_of_const(node, const struct trie_node_binary_search, node);
const unsigned char csearch = *text;
size_t mid, left = 0, right = bin_node->children_count-1;
while(left<=right) {
mid = left+(right-left)/2;
if(bin_node->char_choices[mid] == csearch)
break;
else if(bin_node->char_choices[mid] < csearch)
left = mid+1;
else
right = mid-1;
}
if(left <= right) {
/* match successful */
node = bin_node->children[mid];
++text;
}
else {
return FAIL_ACTION( bin_node->fail_action );
}
break;
}
case OP_ALTERNATIVES:
{
const struct trie_node_alternatives* alt_node = container_of_const(node, const struct trie_node_alternatives, node);
size_t i;
*fail_action = NULL;
for(i=0;i < alt_node->alternatives_count;i++) {
int rc = match_node(alt_node->children[i], text, text_end, fail_action);
if(rc) {
return MATCH_OK;
}
/* supporting fail_actions is tricky,
* if we just go to the node specified, what happens if the match fails, and no
* further fail_action is specified? We should know where to continue the search.
* For now fail_action isn't supported for OP_ALTERNATIVES*/
}
break;
}
case OP_CHAR_REPEAT:
{
const struct trie_node_char_repeat* char_rep_node = container_of_const(node, const struct trie_node_char_repeat, node);
const size_t max_len = MIN( text_end - text, char_rep_node->range_max-1);
/* todo: what about the 8 bit limitation of range_max, and what about inf (+,*)? */
const char caccept = char_rep_node->character;
size_t rep;
if(max_len < char_rep_node->range_min)
return FAIL_ACTION(char_rep_node->fail_action);
for(rep=0;rep < max_len;rep++) {
if(text[rep] != caccept) {
break;
}
}
return match_repeat(text, text_end, char_rep_node->range_min, rep,
char_rep_node->bitmap_accept_after, char_rep_node->child, fail_action,
char_rep_node->fail_action);
}
case OP_DOT_REPEAT:
{
const struct trie_node_dot_repeat* dot_rep_node = container_of_const(node, const struct trie_node_dot_repeat, node);
const size_t max_len = MIN( text_end - text, dot_rep_node->range_max-1);
/* todo: what about the 8 bit limitation of range_max, and what about inf (+,*)? */
if(max_len < dot_rep_node->range_min)
return FAIL_ACTION(dot_rep_node->fail_action);
return match_repeat(text, text_end, dot_rep_node->range_min, max_len,
dot_rep_node->bitmap_accept_after, dot_rep_node->child, fail_action,
dot_rep_node->fail_action);
}
case OP_CHAR_CLASS_REPEAT:
{
const struct trie_node_char_class_repeat* class_rep_node = container_of_const(node, const struct trie_node_char_class_repeat, node);
const size_t max_len = MIN( text_end - text, class_rep_node->range_max-1);
/* todo: what about the 8 bit limitation of range_max, and what about inf (+,*)? */
size_t rep;
if(max_len < class_rep_node->range_min)
return FAIL_ACTION(class_rep_node->fail_action);
for(rep=0;rep < max_len;rep++) {
if(!bitmap_accepts( class_rep_node->bitmap, text[rep])) {
break;
}
}
return match_repeat(text, text_end, class_rep_node->range_min, rep,
class_rep_node->bitmap_accept_after, class_rep_node->child, fail_action,
class_rep_node->fail_action);
break;
}
case OP_STRCMP:
{
const struct trie_node_strcmp* strcmp_node = container_of_const(node, const struct trie_node_strcmp, node);
size_t i;
if(strcmp_node->fail_actions) {
const size_t max_len = MIN(strcmp_node->string_length, text_end-text);
/* we don't use strncmp, because we need the exact match-fail point */
for(i=0;i < max_len;i++) {
if(text[i] != strcmp_node->string[i]) {
return FAIL_ACTION( strcmp_node->fail_actions[i] );
}
}
if(max_len < strcmp_node->string_length) {
/* failed, because text was shorter */
return FAIL_ACTION( strcmp_node->fail_actions[max_len] );
}
}
else {
/* no fail_actions computed, some shortcuts possible on compare */
if((text_end - text < strcmp_node->string_length) ||
strncmp((const char*)text, (const char*)strcmp_node->string, strcmp_node->string_length)) {
return FAIL_ACTION( NULL );
}
}
/* match successful */
node = strcmp_node->child;
text += strcmp_node->string_length;
break;
}
case OP_GROUP_START:
{
const struct trie_node_group_start* group_start_node = container_of_const(node, const struct trie_node_group_start, node);
/* TODO: implement */
break;
}
case OP_GROUP_END:
{
const struct trie_node_group_end* group_end_node = container_of_const(node, const struct trie_node_group_end, node);
/* TODO: implement */
break;
}
case OP_MATCH_OK:
{
return MATCH_OK;
}
}
}
/* if fail_action was NULL, or text ended*/
return MATCH_FAILED;
}
#endif