Fast Text Parser/Tokeniser
Back to snippets
This snippet came out of
this thread when Delfi challenged everyone to improve his text parser. The thread itself is worth a read as it includes timings and information about what optimisations were introduced and how they helped performance.
type
  // Dynamic array of tokens, as returned by MyExplode.
  TStringArray = array of string;

{ Splits src into tokens at every separator character.

  Separators are: space, ';', ',', '.', ':', CR (#$0D) and LF (#$0A).

  Every separator ends a token, so two adjacent separators produce an empty
  token between them, and a separator at the very end of src produces a
  trailing empty token. A non-empty src containing N separators therefore
  yields exactly N + 1 tokens.

  Scanning stops at the first #0 character; whatever follows the last
  separator found before that point is returned as the final token.

  @param src  Text to split. An empty string yields an empty array.
  @returns    The tokens in the order they appear in src. }
function MyExplode(const src: string): TStringArray;
var
  idx: Integer;     // 1-based index in src of the character under CharPtr
  count: Integer;   // number of tokens stored in Result so far
  CharPtr: PChar;   // read cursor walking through src
  toklen: Integer;  // currently allocated length of Result
  f: Integer;       // 1-based index in src where the current token starts
begin
  // Result is a managed type and may arrive holding a previous value, so it
  // must be cleared explicitly before the early exit for empty input.
  Result := nil;

  // An empty string is represented by a nil pointer.
  CharPtr := Pointer(src);
  if CharPtr = nil then
    Exit;

  idx := 1;
  f := 1;
  count := 0;

  // Start with room for a handful of tokens to avoid reallocating per token.
  toklen := 10;
  SetLength(Result, toklen);

  while CharPtr^ <> #0 do
  begin
    case CharPtr^ of
      ' ', ';', ',', '.', ':', #$0D, #$0A:
        begin
          // Grow by 50% when full so reallocations stay infrequent.
          if count >= toklen then
          begin
            toklen := toklen + (toklen div 2);
            SetLength(Result, toklen);
          end;
          // Token spans [f, idx - 1]; it is empty when idx = f.
          Result[count] := Copy(src, f, idx - f);
          Inc(count);
          // Next token begins right after this separator.
          f := idx + 1;
        end;
    end;
    Inc(CharPtr);
    Inc(idx);
  end;

  // Size the array to exactly the tokens found plus the final one, then store
  // everything after the last separator (possibly an empty string).
  SetLength(Result, count + 1);
  Result[count] := Copy(src, f, MaxInt);
end;
Code submitted to PGD by Arthurps, Saturday 6th June 2009.
This tokeniser breaks the incoming string at numerous separators, as defined in the IF under the comment 'Look for separators'. When you're done with the returned string array, simply use 'Finalize' to clean it up. Whilst the code doesn't handle wide chars, changing the incoming string parameter 'src' and the variables 'aChar' and 'CharPtr' to their wide equivalents should make it work just fine.