Skip to content

Commit 06e0611

Browse files
authored
Merge pull request #237 from deliciousbrains/use-preg-split-for-tokenizing
Use regex to split the SQL statement into tokens.
2 parents db57e3a + 02a5528 commit 06e0611

2 files changed

Lines changed: 56 additions & 57 deletions

File tree

src/PHPSQLParser/lexer/LexerSplitter.php

Lines changed: 55 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -56,41 +56,72 @@ class LexerSplitter {
5656
protected static $splitters = array("<=>", "\r\n", "!=", ">=", "<=", "<>", "<<", ">>", ":=", "\\", "&&", "||", ":=",
5757
"/*", "*/", "--", ">", "<", "|", "=", "^", "(", ")", "\t", "\n", "'", "\"", "`",
5858
",", "@", " ", "+", "-", "*", "/", ";");
59-
protected $tokenSize;
60-
protected $hashSet;
59+
60+
/**
61+
* @var string Delimited regex pattern (a single capture group of alternatives) matching any splitter; built once in the constructor.
62+
*/
63+
protected $splitterPattern;
6164

6265
/**
6366
* Constructor.
6467
*
6568
* Builds the regex pattern for the static splitter list once and stores it
* in $splitterPattern, so getSplittersRegexPattern() can return it directly.
6669
*/
6770
public function __construct() {
68-
$this->tokenSize = strlen(self::$splitters[0]); // should be the largest one
69-
$this->hashSet = array_flip(self::$splitters);
// Convert the splitter tokens to a single regex pattern up front.
71+
$this->splitterPattern = $this->convertSplittersToRegexPattern( self::$splitters );
7072
}
7173

72-
/**
73-
* Get the maximum length of a split token.
74-
*
75-
* The largest element must be on position 0 of the internal $_splitters array,
76-
* so the function returns the length of that token. It must be > 0.
77-
*
78-
* @return int The number of characters for the largest split token.
79-
*/
80-
public function getMaxLengthOfSplitter() {
81-
return $this->tokenSize;
74+
/**
 * Accessor for the precompiled splitter regex.
 *
 * @return string The pattern currently held in $splitterPattern.
 */
public function getSplittersRegexPattern() {
    $pattern = $this->splitterPattern;

    return $pattern;
}
8382

84-
/**
85-
* Looks into the internal split token array and compares the given token with
86-
* the array content. It returns true, if the token will be found, false otherwise.
87-
*
88-
* @param String $token a string, which could be a split token.
89-
*
90-
* @return boolean true, if the given string will be a split token, false otherwise
91-
*/
92-
public function isSplitter($token) {
93-
return isset($this->hashSet[$token]);
83+
/**
 * Convert an array of splitter tokens to a regex pattern string.
 *
 * Each token becomes one alternative inside a single capture group, so the
 * result can be handed to preg_split() with PREG_SPLIT_DELIM_CAPTURE to keep
 * the splitters in the token stream.
 *
 * Alternatives are emitted in the order given. PCRE tries alternatives left
 * to right, so a longer splitter must precede any splitter that is a prefix
 * of it (e.g. "<=" before "<").
 *
 * @param array $splitters List of literal splitter strings.
 *
 * @return string A "/"-delimited pattern of the form "/(a|b|c)/".
 */
public function convertSplittersToRegexPattern( $splitters ) {
    // Whitespace tokens are written as regex escapes instead of raw bytes.
    // Note that a literal space maps to \s, which in PCRE matches any
    // whitespace character, not only U+0020.
    $whitespace = array(
        "\r\n" => '\r\n',
        "\t"   => '\t',
        "\n"   => '\n',
        ' '    => '\s',
    );

    $regex_parts = array();
    foreach ( $splitters as $part ) {
        if ( isset( $whitespace[ $part ] ) ) {
            $regex_parts[] = $whitespace[ $part ];
            continue;
        }

        // Passing the delimiter makes preg_quote() escape "/" as well, so any
        // splitter containing a slash is safe, not just "/", "/*" and "*/".
        $regex_parts[] = preg_quote( $part, '/' );
    }

    return '/(' . implode( '|', $regex_parts ) . ')/';
}
95126
}
96127

src/PHPSQLParser/lexer/PHPSQLLexer.php

Lines changed: 1 addition & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -85,39 +85,7 @@ public function split($sql) {
8585
throw new InvalidParameterException($sql);
8686
}
8787

88-
$tokens = array();
89-
$token = "";
90-
91-
$splitLen = $this->splitters->getMaxLengthOfSplitter();
92-
$found = false;
93-
$len = strlen($sql);
94-
$pos = 0;
95-
96-
while ($pos < $len) {
97-
98-
for ($i = $splitLen; $i > 0; $i--) {
99-
$substr = substr($sql, $pos, $i);
100-
if ($this->splitters->isSplitter($substr)) {
101-
102-
if ($token !== "") {
103-
$tokens[] = $token;
104-
}
105-
106-
$tokens[] = $substr;
107-
$pos += $i;
108-
$token = "";
109-
110-
continue 2;
111-
}
112-
}
113-
114-
$token .= $sql[$pos];
115-
$pos++;
116-
}
117-
118-
if ($token !== "") {
119-
$tokens[] = $token;
120-
}
88+
$tokens = preg_split($this->splitters->getSplittersRegexPattern(), $sql, null, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);
12189

12290
$tokens = $this->concatEscapeSequences($tokens);
12391
$tokens = $this->balanceBackticks($tokens);

0 commit comments

Comments
 (0)