Index: php_pcre.c =================================================================== RCS file: /repository/php-src/ext/pcre/php_pcre.c,v retrieving revision 1.170 diff -u -r1.170 php_pcre.c --- php_pcre.c 22 Aug 2005 12:22:10 -0000 1.170 +++ php_pcre.c 1 Sep 2005 19:03:01 -0000 @@ -35,15 +35,23 @@ #define PREG_PATTERN_ORDER 1 #define PREG_SET_ORDER 2 #define PREG_OFFSET_CAPTURE (1<<8) +#define PREG_PARTIAL_RESTART (1<<9) #define PREG_SPLIT_NO_EMPTY (1<<0) #define PREG_SPLIT_DELIM_CAPTURE (1<<1) #define PREG_SPLIT_OFFSET_CAPTURE (1<<2) #define PREG_REPLACE_EVAL (1<<0) +#define PREG_DFA_ENGINE (1<<1) #define PREG_GREP_INVERT (1<<0) +/* size of the workspace array for the DFA engine */ +#define PCRE_DFA_WORKSPACE_SIZE 1000 + +/* minimum size for DFA captures */ +#define PCRE_DFA_CAPTURES_SIZE 50 + ZEND_DECLARE_MODULE_GLOBALS(pcre) @@ -52,6 +60,7 @@ pcre_cache_entry *pce = (pcre_cache_entry *) data; if (!pce) return; pefree(pce->re, 1); + pefree(pce->workspace, 1); #if HAVE_SETLOCALE if ((void*)pce->tables) pefree((void*)pce->tables, 1); pefree(pce->locale, 1); @@ -88,10 +97,12 @@ REGISTER_LONG_CONSTANT("PREG_PATTERN_ORDER", PREG_PATTERN_ORDER, CONST_CS | CONST_PERSISTENT); REGISTER_LONG_CONSTANT("PREG_SET_ORDER", PREG_SET_ORDER, CONST_CS | CONST_PERSISTENT); REGISTER_LONG_CONSTANT("PREG_OFFSET_CAPTURE", PREG_OFFSET_CAPTURE, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("PREG_PARTIAL_RESTART", PREG_PARTIAL_RESTART, CONST_CS | CONST_PERSISTENT); REGISTER_LONG_CONSTANT("PREG_SPLIT_NO_EMPTY", PREG_SPLIT_NO_EMPTY, CONST_CS | CONST_PERSISTENT); REGISTER_LONG_CONSTANT("PREG_SPLIT_DELIM_CAPTURE", PREG_SPLIT_DELIM_CAPTURE, CONST_CS | CONST_PERSISTENT); REGISTER_LONG_CONSTANT("PREG_SPLIT_OFFSET_CAPTURE", PREG_SPLIT_OFFSET_CAPTURE, CONST_CS | CONST_PERSISTENT); REGISTER_LONG_CONSTANT("PREG_GREP_INVERT", PREG_GREP_INVERT, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("PREG_PARTIAL_MATCH", PCRE_ERROR_PARTIAL, CONST_CS | CONST_PERSISTENT); return SUCCESS; } @@ 
-137,6 +148,21 @@ */ PHPAPI pcre* pcre_get_compiled_regex_ex(char *regex, pcre_extra **extra, int *preg_options, int *compile_options TSRMLS_DC) { + pcre_cache_entry *pce = pcre_get_compiled_regex_full(regex TSRMLS_CC); + if (!pce) return NULL; /* no cache entry: fail before touching the out-parameters */ + + *extra = pce->extra; + *preg_options = pce->preg_options; + *compile_options = pce->compile_options; + return pce->re; +} + +/* }}} */ + +/* {{{ pcre_get_compiled_regex_full + */ +PHPAPI pcre_cache_entry* pcre_get_compiled_regex_full(char *regex TSRMLS_DC) +{ pcre *re = NULL; int coptions = 0; int soptions = 0; @@ -150,6 +176,7 @@ int regex_len; int do_study = 0; int poptions = 0; + int eoptions = 0; unsigned const char *tables = NULL; #if HAVE_SETLOCALE char *locale = setlocale(LC_CTYPE, NULL); @@ -171,10 +198,7 @@ #if HAVE_SETLOCALE if (!strcmp(pce->locale, locale)) { #endif - *extra = pce->extra; - *preg_options = pce->preg_options; - *compile_options = pce->compile_options; - return pce->re; + return pce; #if HAVE_SETLOCALE } } @@ -247,8 +271,6 @@ /* Move on to the options */ pp++; - /* Clear out preg options */ - *preg_options = 0; /* Parse through the options, setting appropriate flags. Display a warning if we encounter an unknown modifier. */ @@ -268,8 +290,13 @@ case 'X': coptions |= PCRE_EXTRA; break; case 'u': coptions |= PCRE_UTF8; break; + /* PCRE DFA engine specific options */ + case 'H': eoptions |= PCRE_DFA_SHORTEST; break; + case 'P': eoptions |= PCRE_PARTIAL; break; + /* Custom preg options */ case 'e': poptions |= PREG_REPLACE_EVAL; break; + case 'd': poptions |= PREG_DFA_ENGINE; break; case ' ': case '\n': @@ -303,15 +330,14 @@ /* If study option was specified, study the pattern and store the result in extra for passing to pcre_exec. 
*/ if (do_study) { - *extra = pcre_study(re, soptions, &error); + new_entry.extra = pcre_study(re, soptions, &error); if (error != NULL) { php_error_docref(NULL TSRMLS_CC,E_WARNING, "Error while studying pattern"); } + } else { + new_entry.extra = NULL; } - *preg_options = poptions; - *compile_options = coptions; - efree(pattern); /* @@ -326,17 +352,18 @@ /* Store the compiled pattern and extra info in the cache. */ new_entry.re = re; - new_entry.extra = *extra; + new_entry.workspace = NULL; new_entry.preg_options = poptions; new_entry.compile_options = coptions; + new_entry.exec_options = eoptions; #if HAVE_SETLOCALE new_entry.locale = pestrdup(locale, 1); new_entry.tables = tables; #endif zend_hash_update(&PCRE_G(pcre_cache), regex, regex_len+1, (void *)&new_entry, - sizeof(pcre_cache_entry), NULL); + sizeof(pcre_cache_entry), (void **) &pce); - return re; + return pce; } /* }}} */ @@ -378,9 +405,10 @@ **match_sets = NULL; /* An array of sets of matches for each subpattern after a global match */ pcre *re = NULL; /* Compiled regular expression */ + pcre_cache_entry *pe; /* cache entry */ pcre_extra *extra = NULL; /* Holds results of studying */ - int exoptions = 0; /* Execution options */ - int preg_options = 0; /* Custom preg options */ + int exoptions; /* Execution options */ + int preg_options; /* Custom preg options */ int count = 0; /* Count of matched subpatterns */ int *offsets; /* Array of subpattern offsets */ int num_subpats; /* Number of captured subpatterns */ @@ -393,7 +421,8 @@ const char **stringlist; /* Holds list of subpatterns */ char *match; /* The current match */ char **subpat_names = NULL;/* Array for named subpatterns */ - int i, rc; + int i, rc, dfa; + int *workspace; /* DFA workspace */ if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, ((global) ? "ssz|ll" : "ss|zll"), &regex, &regex_len, &subject, &subject_len, &subpats, &flags, &start_offset) == FAILURE) { @@ -434,10 +463,34 @@ } /* Compile regex or get it from cache. 
*/ - if ((re = pcre_get_compiled_regex(regex, &extra, &preg_options TSRMLS_CC)) == NULL) { + if ((pe = pcre_get_compiled_regex_full(regex TSRMLS_CC)) == NULL) { RETURN_FALSE; } + /* now fill in the variables we need */ + extra = pe->extra; + preg_options = pe->preg_options; + re = pe->re; + exoptions = pe->exec_options; + workspace = pe->workspace; + dfa = (preg_options & PREG_DFA_ENGINE); + + global &= !dfa; /* disable preg_match_all() for DFA, as it doesn't work well (yet?) */ + + /* fetch the workspace (or create it) if working in DFA mode */ + if (dfa) { + if (workspace) { + /* this can only be activated on an already initialized workspace */ + if (flags & PREG_PARTIAL_RESTART) { + exoptions |= PCRE_DFA_RESTART; + } + + /* not found, create it */ + } else { + pe->workspace = workspace = pemalloc(sizeof(int) * PCRE_DFA_WORKSPACE_SIZE, 1); + } + } + /* Calculate the size of the offsets array, and allocate memory for it. */ rc = pcre_fullinfo(re, extra, PCRE_INFO_CAPTURECOUNT, &num_subpats); if (rc < 0) { @@ -445,7 +498,7 @@ get_active_function_name(TSRMLS_C), rc); RETURN_FALSE; } - num_subpats++; + num_subpats += (dfa ? PCRE_DFA_CAPTURES_SIZE : 1); size_offsets = num_subpats * 3; offsets = (int *)safe_emalloc(size_offsets, sizeof(int), 0); @@ -501,8 +554,22 @@ do { /* Execute the regular expression. */ - count = pcre_exec(re, extra, subject, subject_len, start_offset, - exoptions|g_notempty, offsets, size_offsets); + if (dfa) { + count = pcre_dfa_exec(re, extra, subject, subject_len, start_offset, + exoptions|g_notempty, offsets, size_offsets, + workspace, PCRE_DFA_WORKSPACE_SIZE); + + /* if there was a partial match, we have 1 string to fetch */ + if (count == PCRE_ERROR_PARTIAL) { + count = 1; + matched = PCRE_ERROR_PARTIAL-1; /* this is incremented below */ + } + + /* "old" NFA engine */ + } else { + count = pcre_exec(re, extra, subject, subject_len, start_offset, + g_notempty, offsets, size_offsets); + } /* Check for too many substrings condition. 
*/ if (count == 0) { Index: php_pcre.h =================================================================== RCS file: /repository/php-src/ext/pcre/php_pcre.h,v retrieving revision 1.41 diff -u -r1.41 php_pcre.h --- php_pcre.h 3 Aug 2005 14:07:38 -0000 1.41 +++ php_pcre.h 1 Sep 2005 19:03:01 -0000 @@ -41,13 +41,7 @@ PHP_FUNCTION(preg_quote); PHP_FUNCTION(preg_grep); -PHPAPI char *php_pcre_replace(char *regex, int regex_len, char *subject, int subject_len, zval *replace_val, int is_callable_replace, int *result_len, int limit, int *replace_count TSRMLS_DC); -PHPAPI pcre* pcre_get_compiled_regex(char *regex, pcre_extra **extra, int *options TSRMLS_DC); -PHPAPI pcre* pcre_get_compiled_regex_ex(char *regex, pcre_extra **extra, int *preg_options, int *coptions TSRMLS_DC); - -extern zend_module_entry pcre_module_entry; -#define pcre_module_ptr &pcre_module_entry - +/* struct for caching the compiled regexes */ typedef struct { pcre *re; pcre_extra *extra; @@ -57,8 +51,20 @@ unsigned const char *tables; #endif int compile_options; + int exec_options; + int *workspace; } pcre_cache_entry; + +PHPAPI char *php_pcre_replace(char *regex, int regex_len, char *subject, int subject_len, zval *replace_val, int is_callable_replace, int *result_len, int limit, int *replace_count TSRMLS_DC); +PHPAPI pcre* pcre_get_compiled_regex(char *regex, pcre_extra **extra, int *options TSRMLS_DC); +PHPAPI pcre* pcre_get_compiled_regex_ex(char *regex, pcre_extra **extra, int *preg_options, int *coptions TSRMLS_DC); +PHPAPI pcre_cache_entry* pcre_get_compiled_regex_full(char *regex TSRMLS_DC); + +extern zend_module_entry pcre_module_entry; +#define pcre_module_ptr &pcre_module_entry + + ZEND_BEGIN_MODULE_GLOBALS(pcre) HashTable pcre_cache; ZEND_END_MODULE_GLOBALS(pcre)