MultiStringMatcher.php
231 lines
| 1 | <?php |
| 2 | /** |
| 3 | * AhoCorasick PHP Library |
| 4 | * |
| 5 | * A PHP implementation of the Aho-Corasick string matching algorithm. |
| 6 | * |
| 7 | * Alfred V. Aho and Margaret J. Corasick, "Efficient string matching: |
| 8 | * an aid to bibliographic search", CACM, 18(6):333-340, June 1975. |
| 9 | * |
| 10 | * @link http://xlinux.nist.gov/dads//HTML/ahoCorasick.html |
| 11 | * @link https://en.wikipedia.org/wiki/Aho-Corasick_string_matching_algorithm |
| 12 | * |
| 13 | * Copyright (C) 2015 Ori Livneh <ori@wikimedia.org> |
| 14 | * |
| 15 | * Licensed under the Apache License, Version 2.0 (the "License"); |
| 16 | * you may not use this file except in compliance with the License. |
| 17 | * You may obtain a copy of the License at |
| 18 | * |
| 19 | * http://www.apache.org/licenses/LICENSE-2.0 |
| 20 | * |
| 21 | * Unless required by applicable law or agreed to in writing, software |
| 22 | * distributed under the License is distributed on an "AS IS" BASIS, |
| 23 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 24 | * See the License for the specific language governing permissions and |
| 25 | * limitations under the License. |
| 26 | * |
| 27 | * @file |
| 28 | * @author Ori Livneh <ori@wikimedia.org> |
| 29 | */ |
| 30 | |
| 31 | namespace AhoCorasick; |
| 32 | |
| 33 | /** |
| 34 | * Represents a finite state machine that can find all occurrences |
| 35 | * of a set of search keywords in a body of text. |
| 36 | * |
| 37 | * The time it takes to construct the finite state machine is |
| 38 | * proportional to the sum of the lengths of the search keywords. |
| 39 | * Once constructed, the machine can locate all occurences of all |
| 40 | * search keywords in a body of text in a single pass, making exactly |
| 41 | * one state transition per input character. |
| 42 | * |
| 43 | * This is an implementation of the Aho-Corasick string matching |
| 44 | * algorithm. |
| 45 | * |
| 46 | * Alfred V. Aho and Margaret J. Corasick, "Efficient string matching: |
| 47 | * an aid to bibliographic search", CACM, 18(6):333-340, June 1975. |
| 48 | * |
| 49 | * @link http://xlinux.nist.gov/dads//HTML/ahoCorasick.html |
| 50 | */ |
| 51 | class MultiStringMatcher { |
| 52 | |
| 53 | /** @var string[] The set of keywords to be searched for. **/ |
| 54 | protected $searchKeywords = []; |
| 55 | |
| 56 | /** @var int The number of possible states of the string-matching finite state machine. **/ |
| 57 | protected $numStates = 1; |
| 58 | |
| 59 | /** @var array Mapping of states to outputs. **/ |
| 60 | protected $outputs = []; |
| 61 | |
| 62 | /** @var array Mapping of failure transitions. **/ |
| 63 | protected $noTransitions = []; |
| 64 | |
| 65 | /** @var array Mapping of success transitions. **/ |
| 66 | protected $yesTransitions = []; |
| 67 | |
| 68 | /** |
| 69 | * Constructor. |
| 70 | * |
| 71 | * @param string[] $searchKeywords The set of keywords to be matched. |
| 72 | */ |
| 73 | public function __construct( array $searchKeywords ) { |
| 74 | foreach ( $searchKeywords as $keyword ) { |
| 75 | if ( $keyword !== '' ) { |
| 76 | $this->searchKeywords[$keyword] = strlen( $keyword ); |
| 77 | } |
| 78 | } |
| 79 | |
| 80 | if ( !$this->searchKeywords ) { |
| 81 | trigger_error( __METHOD__ . ': The set of search keywords is empty.', E_USER_WARNING ); |
| 82 | // Unreachable 'return' when PHPUnit detects trigger_error |
| 83 | return; // @codeCoverageIgnore |
| 84 | } |
| 85 | |
| 86 | $this->computeYesTransitions(); |
| 87 | $this->computeNoTransitions(); |
| 88 | } |
| 89 | |
| 90 | /** |
| 91 | * Accessor for the search keywords. |
| 92 | * |
| 93 | * @return string[] Search keywords. |
| 94 | */ |
| 95 | public function getKeywords() { |
| 96 | return array_keys( $this->searchKeywords ); |
| 97 | } |
| 98 | |
| 99 | /** |
| 100 | * Map the current state and input character to the next state. |
| 101 | * |
| 102 | * @param int $currentState The current state of the string-matching |
| 103 | * automaton. |
| 104 | * @param string $inputChar The character the string-matching |
| 105 | * automaton is currently processing. |
| 106 | * @return int The state the automaton should transition to. |
| 107 | */ |
| 108 | public function nextState( $currentState, $inputChar ) { |
| 109 | $initialState = $currentState; |
| 110 | while ( true ) { |
| 111 | $transitions =& $this->yesTransitions[$currentState]; |
| 112 | if ( isset( $transitions[$inputChar] ) ) { |
| 113 | $nextState = $transitions[$inputChar]; |
| 114 | // Avoid failure transitions next time. |
| 115 | if ( $currentState !== $initialState ) { |
| 116 | $this->yesTransitions[$initialState][$inputChar] = $nextState; |
| 117 | } |
| 118 | return $nextState; |
| 119 | } |
| 120 | if ( $currentState === 0 ) { |
| 121 | return 0; |
| 122 | } |
| 123 | $currentState = $this->noTransitions[$currentState]; |
| 124 | } |
| 125 | // Unreachable outside 'while' |
| 126 | } // @codeCoverageIgnore |
| 127 | |
| 128 | /** |
| 129 | * Locate the search keywords in some text. |
| 130 | * |
| 131 | * @param string $text The string to search in. |
| 132 | * @return array[] An array of matches. Each match is a vector |
| 133 | * containing an integer offset and the matched keyword. |
| 134 | * |
| 135 | * @par Example: |
| 136 | * @code |
| 137 | * $keywords = new MultiStringMatcher( array( 'ore', 'hell' ) ); |
| 138 | * $keywords->searchIn( 'She sells sea shells by the sea shore.' ); |
| 139 | * // result: array( array( 15, 'hell' ), array( 34, 'ore' ) ) |
| 140 | * @endcode |
| 141 | */ |
| 142 | public function searchIn( $text ) { |
| 143 | if ( !$this->searchKeywords || $text === '' ) { |
| 144 | return []; // fast path |
| 145 | } |
| 146 | |
| 147 | $state = 0; |
| 148 | $results = []; |
| 149 | $length = strlen( $text ); |
| 150 | |
| 151 | for ( $i = 0; $i < $length; $i++ ) { |
| 152 | $ch = $text[$i]; |
| 153 | $state = $this->nextState( $state, $ch ); |
| 154 | foreach ( $this->outputs[$state] as $match ) { |
| 155 | $offset = $i - $this->searchKeywords[$match] + 1; |
| 156 | $results[] = [ $offset, $match ]; |
| 157 | } |
| 158 | } |
| 159 | |
| 160 | return $results; |
| 161 | } |
| 162 | |
| 163 | /** |
| 164 | * Get the state transitions which the string-matching automaton |
| 165 | * shall make as it advances through input text. |
| 166 | * |
| 167 | * Constructs a directed tree with a root node which represents the |
| 168 | * initial state of the string-matching automaton and from which a |
| 169 | * path exists which spells out each search keyword. |
| 170 | */ |
| 171 | protected function computeYesTransitions() { |
| 172 | $this->yesTransitions = [ [] ]; |
| 173 | $this->outputs = [ [] ]; |
| 174 | foreach ( $this->searchKeywords as $keyword => $length ) { |
| 175 | $state = 0; |
| 176 | for ( $i = 0; $i < $length; $i++ ) { |
| 177 | $ch = $keyword[$i]; |
| 178 | if ( !empty( $this->yesTransitions[$state][$ch] ) ) { |
| 179 | $state = $this->yesTransitions[$state][$ch]; |
| 180 | } else { |
| 181 | $this->yesTransitions[$state][$ch] = $this->numStates; |
| 182 | $this->yesTransitions[] = []; |
| 183 | $this->outputs[] = []; |
| 184 | $state = $this->numStates++; |
| 185 | } |
| 186 | } |
| 187 | |
| 188 | $this->outputs[$state][] = $keyword; |
| 189 | } |
| 190 | } |
| 191 | |
| 192 | /** |
| 193 | * Get the state transitions which the string-matching automaton |
| 194 | * shall make when a partial match proves false. |
| 195 | */ |
| 196 | protected function computeNoTransitions() { |
| 197 | $queue = []; |
| 198 | $this->noTransitions = []; |
| 199 | |
| 200 | foreach ( $this->yesTransitions[0] as $ch => $toState ) { |
| 201 | $queue[] = $toState; |
| 202 | $this->noTransitions[$toState] = 0; |
| 203 | } |
| 204 | |
| 205 | while ( true ) { |
| 206 | $fromState = array_shift( $queue ); |
| 207 | if ( $fromState === null ) { |
| 208 | break; |
| 209 | } |
| 210 | foreach ( $this->yesTransitions[$fromState] as $ch => $toState ) { |
| 211 | $queue[] = $toState; |
| 212 | $state = $this->noTransitions[$fromState]; |
| 213 | |
| 214 | while ( $state !== 0 && empty( $this->yesTransitions[$state][$ch] ) ) { |
| 215 | $state = $this->noTransitions[$state]; |
| 216 | } |
| 217 | |
| 218 | if ( isset( $this->yesTransitions[$state][$ch] ) ) { |
| 219 | $noState = $this->yesTransitions[$state][$ch]; |
| 220 | } else { |
| 221 | $noState = 0; |
| 222 | } |
| 223 | |
| 224 | $this->noTransitions[$toState] = $noState; |
| 225 | $this->outputs[$toState] = array_merge( |
| 226 | $this->outputs[$toState], $this->outputs[$noState] ); |
| 227 | } |
| 228 | } |
| 229 | } |
| 230 | } |
| 231 |