PDA

View Full Version : a text parser



JernejL
05-06-2009, 07:19 PM
A simple task: parse a line of text, separated by ",", "tab", "space", ";" into a array of strings, also multiple separator characters can separate values.

There's a utility function "StripAfter" which strips one-linecomments from a single line string (for stripping // comments from a string)

This is my implementation, feel free to use it, but can you do it better or faster?

The data is stored in a global variable, this is a class-free code, if you think using a class would be better give me your reasons and thoughts.



{ ************************************************** ********************* }
{ }
{ Crazy text parsing Unit }
{ }
{ ************************************************** ********************* }

unit textparser;

interface

uses sysutils;

procedure Explode(str: string; cleanup: boolean);
function Parsed(n: integer): string;
function ParsedInt(n: integer): integer;
function ParsedLW(n: integer): longword;
function ParsedBool(n: integer): boolean;
function ParsedFloat(n: integer): single;

function StripAfter(const Text, After: string): string;

var
Tokens: array of string;
Items: integer = 0;

implementation

procedure Explode(str: string; cleanup: boolean);
var
i: integer;
begin

if cleanup = True then
begin
// cleanup old data
for i := 0 to Items - 1 do
begin
Tokens[i] := '';
end;

Items := 0;
setlength(Tokens, Items + 1);

end;

// parse new string
for i := 1 to length(str) do
begin
if ((str[i] = ' ') or (str[i] = ';') or (str[i] = ',')) and (Tokens[Items] <> '') then
begin
Items := Items + 1;
setlength(Tokens, Items + 1);
end
else
Tokens[Items] := Tokens[Items] + trim(string(str[i]));
end;

end;

function Parsed(n: integer): string;
begin
if Items = -1 then
Result := ''
else
Result := Tokens[n];
end;

function ParsedInt(n: integer): integer;
begin
Result := StrToInt(Parsed(n));
end;

function ParsedLW(n: integer): longword;
begin
{$R-}
Result := StrToInt(Parsed(n));
{$R+}
end;

function ParsedBool(n: integer): boolean;
begin
Result := (Parsed(n) = '1') or (lowercase(Parsed(n)) = 'true') or (lowercase(Parsed(n)) = 'yes');
end;

function ParsedFloat(n: integer): single;
begin
Result := strtofloat(Parsed(n));
end;

function StripAfter(const Text, After: string): string;
var
p: integer;
begin
p := pos(After, Text);
if p <> 0 then
Result := copy(Text, 0, p - 1)
else
Result := Text;
end;

end.

AthenaOfDelphi
06-06-2009, 01:33 AM
Hi Delfi,

Interesting challenge... can you make it faster :-)

Here's my quick attempt.


type
TStringArray = array of string;

procedure explode(src:string;var tokens:TStringArray;cleanup:boolean);
var
idx : integer;
len : integer;
count : integer;
temp : string;
aChar : char;
toklen : integer;
begin
if cleanup then
begin
finalize(tokens);
end;

setlength(tokens,10);
count:=0;
idx:=1;
len:=length(src);
toklen:=length(tokens);
temp:='';

while (idx<=len) do
begin
aChar:=src[idx];
inc(idx);

if (aChar=' ') or (aChar=';') or
(aChar=',') or (aChar='.') or
(aChar=':') or (aChar=#$0d) or
(aChar=#$0a) then
begin
if (temp<>'') then
begin
if (count+1>toklen) then
begin
toklen:=toklen+(toklen div 2);
setlength(tokens,toklen);
end;
tokens[count]:=temp;
temp:='';
inc(count);
end;
end
else
begin
temp:=temp+aChar;
end;
end;

if (temp<>'') then
begin
if (count+1>toklen) then
begin
inc(toklen);
setlength(tokens,toklen);
end;
tokens[count]:=temp;
inc(count);
end;

if (toklen>count) then
begin
setlength(tokens,count);
end;
end;


Based on my timings, this is I believe a little under twice as fast as your explode routine.

So, why is it faster? Well, this is why I believe it is faster.

First up, I've tried to reduce the number of calls to functions that are relatively slow prefering instead to keep track of the values they would return myself. Thats why I have more variables involved and the code is somewhat longer.

Secondly, setlength is quite a costly routine. So, instead of calling it for each token, I set the token buffer large and then only call it when I run out of room, growing the list by half it's size each time I do (things like TList etc. do something similar I believe). This reduces the time wasted in costly routines.

And third, although I'm not sure how much difference this makes, instead of having an IF that compares string[ x ] against characters, I read the character I'm interested in, store it in a char and then compare that. I say I'm not sure how much difference it makes because I'm no assembly language expert (at least as far as x86 goes) but the code generated by my version looks a lot cleaner consisting only of cmp's and jnz's I believe.

In terms of timing, I timed just the explode routine using the high performance counters. Neither routine were using their own cleanup. Parsing the string "the quick brown fox jumps over the lazy dog", the timings were as follows:-


DescriptionMineYours
Minimum.028ms0.045ms
Maximum3.3ms2.4ms
Average0.032ms0.051ms


I've also just tried parsing a string with 225 tokens in it. Mine took, on average, 0.73ms with yours taking 1.4ms. All these timings were conducted with a loop which parsed the same string 10,000 times.

arthurprs
06-06-2009, 05:42 AM
Here is my version, based on AthenaOfDelphi one

function MyExplode(const src: string): TStringArray;
var
idx: Integer;
//len: Integer;
count: Integer;
CharPtr: PChar;
aChar : Char;
toklen: Integer;
f : Integer;
begin
CharPtr := Pointer(src);
if CharPtr = nil then Exit;
idx := 1;
f := 1;
//len := 0;

toklen := 10;
SetLength(Result, toklen);
count := 0;

while CharPtr^ <> #0 do
begin
aChar := CharPtr^;
Inc(CharPtr);

if (aChar = (' ')) or (aChar = (';')) or
(aChar = (',')) or (aChar = ('.')) or
(aChar = (':')) or (aChar = (#$0D)) or
(aChar = (#$0A)) then
begin
if (f <> 0) then
begin
if (count + 1 > toklen) then
begin
toklen := toklen + (toklen div 2);
SetLength(Result, toklen);
end;
Result[count] := Copy(src, f, idx - f);
f := idx + 1;
Inc(count);
end;
end;
Inc(idx);
//Inc(len);
end;

if (idx >= f) then // f < len
begin
if (count + 1 > toklen) then
begin
Inc(toklen);
SetLength(Result, toklen);
end;
Result[count] := Copy(src, f, MaxInt); // (len - f + 1)
Inc(count);
end;

if toklen > count then
SetLength(Result, count);
end;

- turned it into a function for cleaner code and to avoid passing a non empty tokens array to the procedure...
- changed src parameter to const
- used a PChar pointer and Inc() to navigate in src string
- removed length calculation
- concatenating a temp string is slow, so i used a "f" variable to track the locations and then Copy() to get the substrings

4 to 7 times faster than AthenaOfDelphi version

i don't think it can get much faster than this using only common optimizations

Arthur.

User137
06-06-2009, 08:42 AM
Would be nice to know which one is the fastest ;D

I got 2 functions, text parser with free separation character or string, and another function you can use to put any contained values to variables too:

function ReadStrings(const s,separator: string; var strings: array of string): integer;
var i,p,l,c2: integer; c,cs,cur: string;
begin
c2:=high(strings);
if c2<1 then begin
result:=0; exit;
end;
if c2>10 then c2:=10;
for i:=0 to c2 do strings[i]:='';
p:=0; l:=length(separator); i:=1; cur:='';
while i<=length(s) do begin
c:=copy(s,i,1); cs:=copy(s,i,l);
if cs=separator then begin
if cur<>'' then begin
strings[p]:=cur; cur:='';
inc(p);
if p>high(strings) then break;
strings[p]:='';
end;
i:=i+l-1;
end else cur:=cur+c;
inc(i);
end;
if cur<>'' then begin
strings[p]:=cur; inc(p);
end;
result:=p;
end;

type
TCustomRead = (crString,crInt,crSingle,crDouble,crByte,crWord,
crShortInt,crSmallInt,crCardinal,crBool,crInt64);

function ReadCustom(const s,separator: string; const arr: array of pointer;
const arrt: array of TCustomRead): integer;
var p,arrtl: integer; cur: string; defaultStr: boolean;
procedure SetValue;
begin
if arr[p]=nil then exit;
if defaultStr then string(arr[p]^):=cur
else case arrt[p mod arrtl] of
crString: string(arr[p]^):=cur;
crInt: integer(arr[p]^):=strtointdef(cur,0);
crSingle: single(arr[p]^):=strtofloat(cur);
crDouble: double(arr[p]^):=strtofloat(cur);
crByte: byte(arr[p]^):=strtointdef(cur,0);
crWord: word(arr[p]^):=strtointdef(cur,0);
crShortInt: shortint(arr[p]^):=strtointdef(cur,0);
crSmallInt: smallint(arr[p]^):=strtointdef(cur,0);
crCardinal: cardinal(arr[p]^):=strtointdef(cur,0);
crBool: boolean(arr[p]^):=(cur<>'0') and (lowercase(cur)<>'false');
crInt64: int64(arr[p]^):=strtoint(cur);
end;
end;
var ha,i,l: integer; c,cs: string;
begin
ha:=high(arr);
if ha<1 then begin
result:=0; exit;
end;
defaultStr:=high(arrt)<1; arrtl:=length(arrt);
p:=0; l:=length(separator); i:=1; cur:='';
while i<=length(s) do begin
c:=copy(s,i,1); cs:=copy(s,i,l);
if cs=separator then begin
if cur<>'' then begin
SetValue; cur:=''; inc(p);
if p>ha then break;
end;
inc(i,l-1);
end else cur:=cur+c;
inc(i);
end;
if cur<>'' then begin
SetValue; inc(p);
end;
result:=p;
end;

Edit: There's no button for pascal tags...

Brainer
06-06-2009, 08:55 AM
Here's my try. :)

procedure SplitString(const AOutput: TStrings; const AText, ASeparator: String);
var
S1, S2: String;
begin
AOutput.Clear();
S2 := AText + ASeparator;
repeat
S1 := Copy(S2, 0, Pos(ASeparator, S2) - 1);
AOutput.Add(S1);
Delete(S2, 1, Length(S1 + ASeparator));
until (S2 = '');
end;

AthenaOfDelphi
06-06-2009, 11:20 AM
Ok, I've plugged everyones attempt (so far) into my test harness and got some timings from them all.

The average results are as follows:-


UserAverage Time
arthurps0.011ms
Brainer0.023ms
AthenaOfDelphi0.029ms
Delfi0.049ms
User1370.057ms


The one thing I will say is that two of them don't operate in the same way. User137 and Brainer, your routines aren't capable of splitting using a variety of different characters at the same time, and Brainer, you return a string list as opposed to an array of strings.

But, none the less... nice job everyone :)

Howevere... at the moment, we have a clear winner... arthurps, your version of my routine is neat. As you've identified, I'm wasting time with string concatenation, a very wasteful operation as it reallocates memory for the result string. So, nice job.

Anyone else got anything faster?

User137
06-06-2009, 12:12 PM
The one thing I will say is that two of them don't operate in the same way. User137 and Brainer, your routines aren't capable of splitting using a variety of different characters at the same time...
There may be uses for all different functions, i came from idea that user defines separator and there should never be more than 1. I could for example use '<>' or tab as separator and there are times you don't want ',' or ' ' or anything certain to cut strings.

Thanks for timing test, there seems to be need for optimize.

JernejL
06-06-2009, 12:37 PM
Wow, i didnt realise mine was such rubbish and i didn't actually think there was a lot of room for optimization there, obviously i was very wrong and i'm still learning how to speed up various parts of my game.

arthurprs's code is really nice, it is similar to mine but he uses pointers which is clearly faster. I have some doubts it would work with widestring, which might be a important factor in the future, especially for translating games.

This string parser is a integral part of most games which parse text files and i use this in practically every project i make that reads plaintext files.

I am planning to write a proper token parser soon, which would parse JSON, i kinda dislike XML's syntax and json seems so well human readable and editeable, i'll try to use tricks i see in arthur's code in it.

AthenaOfDelphi
06-06-2009, 01:32 PM
The one thing I will say is that two of them don't operate in the same way. User137 and Brainer, your routines aren't capable of splitting using a variety of different characters at the same time...
There may be uses for all different functions, i came from idea that user defines separator and there should never be more than 1. I could for example use '<>' or tab as separator and there are times you don't want ',' or ' ' or anything certain to cut strings.

Thanks for timing test, there seems to be need for optimize.


Oh absolutely hon, I would go down the same line as you and have a single separator. I wasn't saying it was a bad thing, I was just pointing out the subtle differences between everyones code.

AthenaOfDelphi
06-06-2009, 01:46 PM
arthurprs's code is really nice, it is similar to mine but he uses pointers which is clearly faster. I have some doubts it would work with widestring, which might be a important factor in the future, especially for translating games.

This string parser is a integral part of most games which parse text files and i use this in practically every project i make that reads plaintext files.

I am planning to write a proper token parser soon, which would parse JSON, i kinda dislike XML's syntax and json seems so well human readable and editeable, i'll try to use tricks i see in arthur's code in it.


The main differences between your code and mine is the optimised sizing of the array. That makes quite a difference I believe as setlength is quite slow. And yeah, Arthur's, taken that and tweaked it some more.

In terms of wide character support, he's used inc, so providing the right variable types are used (widestring and, I think, PWideChar) it should still work.

As for writing a JSON parser... I don't want to discourage anyone from learning, but I also wouldn't encourage re-inventing the wheel (unless of course the wheel you were looking at was square). There is already a JSON parser for Delphi, so if it were me, I'd look at their code, see if could be optimised... and if not... work on something else :)

Anyhow... maybe when I get the competition site sorted we could have a regular... 'see who can do it fastest/smallest competition'... just a thought :D

Mirage
06-06-2009, 03:51 PM
Anyhow... maybe when I get the competition site sorted we could have a regular... 'see who can do it fastest/smallest competition'... just a thought

Seems that's a good idea.:)

arthurprs
06-06-2009, 04:14 PM
Anyhow... maybe when I get the competition site sorted we could have a regular... 'see who can do it fastest/smallest competition'... just a thought

Seems that's a good idea.:)


very good :)

JernejL
06-06-2009, 06:08 PM
As for writing a JSON parser... I don't want to discourage anyone from learning, but I also wouldn't encourage re-inventing the wheel (unless of course the wheel you were looking at was square). There is already a JSON parser for Delphi, so if it were me, I'd look at their code, see if could be optimised... and if not... work on something else :)


it must work in freepascal too, the code there uses variants and is "not lightweight enough".

AthenaOfDelphi
06-06-2009, 06:29 PM
it must work in freepascal too, the code there uses variants and is "not lightweight enough".


Then that sounds like a case of re-inventing the wheel... where the original is square :D

Good luck with that when you get round to it.

AthenaOfDelphi
06-06-2009, 10:15 PM
Given the usefulness of this little session, I've added it to the library in a newly created code snippets section here (http://www.pascalgamedevelopment.com/PGDLib/CodeSnippets)

JernejL
07-06-2009, 01:12 PM
AthenaOfDelphi's and consequently Arthurps's code for some reason separate also by the "." character, which makes it impossible to parse floating point values (think "123.45" ) and also "0D0A" newlines which isnt neccesary but they dont parse the tab #09.
This is easy to fix tho just by removing extra IF condition.

Arthurps's code also doesnt parse properly if you use several separators to separate values (think grid space / tab aligned values): it produces empty values in the array.

So correct characters for separating should be:
(aChar = (' ')) or (aChar = (';')) or
(aChar = (',')) or (aChar = (#09))

I didn't test User137 and Brainer's code since they use classes.

AthenaOfDelphi
07-06-2009, 01:19 PM
AthenaOfDelphi's and consequently Arthurps's code for some reason separate also by the "." character, which makes it impossible to parse floating point values (think "123.45" ).
This is easy to fix tho just by removing extra IF condition.

I didn't test User137 and Brainer's code since they use classes.



I just banged in a bunch of separators without really thinking about it too much. I normally use things like INI files, type files or streamed classes for data storage... depends on the application.

Of course, we perhaps shouldn't have included , either since that works as the decimal separator depending on location... just a thought :)

JernejL
07-06-2009, 01:28 PM
http://en.wikipedia.org/wiki/Decimal_separator

The list is faulty, i've been always taught to use decimal point and the list shows slovenia as using a decimal colon.. practically everyone uses decimal point these days.

arthurprs
07-06-2009, 04:34 PM
yes "dot" should be removed, but it's very easy to modify the tokens separators "ifs"

JernejL
07-06-2009, 08:53 PM
yes "dot" should be removed, but it's very easy to modify the tokens separators "ifs"


That's true and i changed it, and after that Athena's code works but your code has another issue - i quote myself:



Arthurps's code also doesnt parse properly if you use several separators to separate values (think grid space / tab aligned values): it produces empty values in the array.

arthurprs
08-06-2009, 12:51 AM
Arthurps's code also doesnt parse properly if you use several separators to separate values (think grid space / tab aligned values): it produces empty values in the array.



can you give me an example?

are you talking about ???
'lol,,great,xD' => ['lol', '', 'great', 'xD']

i think this is the right way, or not?

JernejL
08-06-2009, 04:32 PM
An example directly from my game config files:



debr debr null %1 0 0 0 0 100 8 0.02 0.07 1 0.3 0.3 5 null 0 0 0.1 2 10 null null 10
sprk sprk null %00011 0 0 0 0 100 8 0.02 0.07 1 0.1 0.1 3 null 0 0 0.1 0 0 null null 10
bomb bomb flam %000011001110101 0 0 100 100 3800 8 0.15 0.07 1 0.15 0.15 15 boom 0 1 0.1 0 0 laun null 20
blod blod null %1 0 0 0 0 300 8 0.001 0.3 0.1 0.5 0.5 0 null 0 0 0.1 0 0 null null 10
bult bult muzl %00011000111 0 0 40.4 5 1000 8 0.159 -0.03 1 0.2 0.2 3 syse 0 1 0.1 0 0 uzif null 20

User137
08-06-2009, 05:10 PM
Sometimes using TIniFile but yeah my 3D model format is still ascii however it's mostly readable with readln() with multiple params. But there's sometimes those special cases when even string parser comes to use... That is 1 of the reason i made mine so "customizable" way, the most common use is something unnormal or for backwards compatibility.

My config files look usually this :-[

???????¥?????™?ß?§?°?û?õ?ò?ï?í?è?ç?ä?á?Ñ?Ŭø¬º¬?¬? ¬¥¬?¬Ø¬?¬´¬®¬¶¬§¬¢ ?æ?ì?°Àú‚Äì‚Ä¢‚Äú‚Äò¬ê¬è¬ç?í‚Ä??†À܂İ‚Ć‚Ć‚Ķ‚Äû ?í?í‚Äö‚Äö‚Äö¬Å¬Å¬Å¬Å¬Å¬Å¬Å‚Äö‚Äö‚Äö?í?í‚Äû‚Ķ‚Ć‚ ƂİÀÜ?†‚Ä??í¬ç¬è¬ê‚Äò‚Äú‚Ä¢‚ÄìÀú?°?ì?æ ¬¢¬§¬¶¬®¬´¬?¬Ø¬?¬¥¬?¬?¬º¬ø?Å?Ñ?á?ä?ç?è?í?ï?ò?õ?û?° ?§?ß?™?????¥?????? ???æ¬ù‚Ä??°‚Ñ¢‚Äî‚Äì‚Äù‚Äú‚Äò¬ê???í?†À܂Ć‚Äû‚Äö‚Ç ¨~|zxvtrpomljigfecba`¬° ???æ¬ù‚Ä??°Àú‚Äî‚Ä¢‚Äù‚Äô¬ê???í?†À܂Ć‚Äû‚Äö‚Ǩ~|z xvtrpnlkihfecba`_¬¢¬° ???æ?ì‚Ä?‚Ñ¢Àú‚Äì‚Ä¢‚Äú‚Äò¬è¬ç‚Ä?‚Ä?‚Ä°‚Ķ‚Äö‚Ǩ~{ ywusqomkjhgedba`_^¬£¬¢¬° ??¬ù

arthurprs
08-06-2009, 08:16 PM
An example directly from my game config files:



debr debr null %1 0 0 0 0 100 8 0.02 0.07 1 0.3 0.3 5 null 0 0 0.1 2 10 null null 10
sprk sprk null %00011 0 0 0 0 100 8 0.02 0.07 1 0.1 0.1 3 null 0 0 0.1 0 0 null null 10
bomb bomb flam %000011001110101 0 0 100 100 3800 8 0.15 0.07 1 0.15 0.15 15 boom 0 1 0.1 0 0 laun null 20
blod blod null %1 0 0 0 0 300 8 0.001 0.3 0.1 0.5 0.5 0 null 0 0 0.1 0 0 null null 10
bult bult muzl %00011000111 0 0 40.4 5 1000 8 0.159 -0.03 1 0.2 0.2 3 syse 0 1 0.1 0 0 uzif null 20



uhm, the function definitely does not work for this kind of input

JernejL
08-06-2009, 08:49 PM
Arthur: offource it parses that line by line, the problem is it uses single space to create new item in array while it shouldnt. the original and athena's function parse it properly.

jdarling
09-06-2009, 03:31 AM
Ok, so I'll admit up front that this is cheating. But here is the lexer I use (as a starting point at least) for just about every compiler, lexer, scripter, etc that I work on. As it stands it pretty much handles everything you put in your requirements. Though, I have to admit that the output array isn't directly accessible :)

unit uLexerBase;

interface

const
ttUnknown = 0;
ttNumber = 1;
ttString = 2;
ttComment = 3;
ttWhite = 32;

type
TOnLexerError = procedure(msg : AnsiString; InFile : AnsiString; line, col : LongInt; var Continue : Boolean; Symbol : Pointer) of object;

PLexerClassifierSymbol = ^TLexerClassifierSymbol;
PLexerClassifierSymbolArray=^TLexerClassifierSymbo lArray;
TLexerClassifierSymbolArray = Array of PLexerClassifierSymbol;
TLexerClassifierSymbol = packed record
c : Char; // What character is this?
eow : boolean; // Is this the last character of a word?
data : LongInt; // Anything
below : TLexerClassifierSymbolArray; // Next characters in words.
end;

{ TLexerClassifier }

TLexerClassifier=class
private
fData : TLexerClassifierSymbol;
fSize: Integer;
function GetData: PLexerClassifierSymbol;
procedure Sort(WhatNode : PLexerClassifierSymbol; Start : Integer = -1; Stop : Integer = -1);
function AddChar(WhatChar : Char; ParentNode : PLexerClassifierSymbol; IsEnd : Boolean) : PLexerClassifierSymbol;
public
constructor Create;
destructor Destroy; override;

procedure Clear;

procedure Add(WhatSymbol : AnsiString; Value : LongInt);
function CharExists(WhatChar : Char; ParentNode : PLexerClassifierSymbol; out IsEnd : Boolean) : Boolean; overload;
function CharExists(WhatChar : Char; ParentNode : PLexerClassifierSymbol) : PLexerClassifierSymbol; overload;
function Exists(WhatSymbol : AnsiString; AllowPartial : Boolean = false) : Boolean;
function SymbolData(WhatSymbol : AnsiString) : LongInt;

property Size : Integer read fSize;
property Data : PLexerClassifierSymbol read GetData;
end;

PLexerToken = ^TLexerToken;
TLexerToken = packed record
line, col, srcpos: longint;
Token: ansistring;
TokenType: integer;
end;
TLexerTokenArray = array of TLexerToken;

{ TLexer }

TLexer=class
private
FFileName: AnsiString;
FOnError: TOnLexerError;
FSource : PChar;
FPos: longint;
FLine: longint;
FCol: longint;
FTokenCount: Longint;
FContinue : Boolean;
FTokens : TLexerTokenArray;
FClassifier : TLexerClassifier;
function GetSource: Pointer;
function GetToken(index: integer): PLexerToken;
function GetTokenCount: longint;
procedure SetFileName(const AValue: AnsiString);
procedure SetOnError(const AValue: TOnLexerError);
procedure SetSource(const AValue: Pointer);
protected
function NewToken: PLexerToken;
function CurrToken : PLexerToken;
function Curr: char;
function Prev: char;
function Next(numCharsToSkip: longint = 1): char;
function Peek: char;
function Match(const str: ansistring; IncIfMatched: boolean = True;
ThrowIfNotMatched: boolean = True): boolean;
procedure Throw(msg : AnsiString); overload;
procedure Throw(const msg : AnsiString; args : array of const); overload;

procedure SkipWhite; virtual;
procedure ScanSymol; virtual;
procedure ScanNumber; virtual;
procedure ScanString(const strDelmChar : Char); virtual;
procedure ScanCommentLine(const CommentStart : AnsiString); virtual;
procedure ScanMultiComment(const CommentClose : AnsiString); virtual;
public
constructor Create; virtual;
destructor Destroy; override;

procedure Reset; virtual;
procedure Step; virtual;
function Process : Boolean; virtual;

function EOF: boolean; virtual;
function BOF: boolean; virtual;

property Source: Pointer Read GetSource Write SetSource;
property OnError : TOnLexerError read FOnError write SetOnError;
property FileName: AnsiString read FFileName write SetFileName;

property Pos : LongInt read FPos;
property Line : LongInt read FLine;
property Col : LongInt read FCol;

property TokenCount: longint Read GetTokenCount;
property Token[index: integer]: PLexerToken Read GetToken; default;
end;

implementation

uses
SysUtils;

const
WHITE_SPACE = [#1..#32, #127, #255];
ALPHA = ['a'..'z', 'A'..'Z'];
NUM = ['0'..'9'];
NUM_EXT = NUM + ['.', 'e', 'E'];
ALPHANUM = ALPHA + NUM;
SYM = ALPHANUM + ['_'];

{ TLexerClassifier }

function TLexerClassifier.GetData: PLexerClassifierSymbol;
begin
result := @FData;
end;

procedure TLexerClassifier.Sort(WhatNode: PLexerClassifierSymbol; Start: Integer;
Stop: Integer);
procedure iSort(var r : TLexerClassifierSymbolArray; lo, up : integer );
var
i, j : Integer;
tempr: PLexerClassifierSymbol;
begin
while up>lo do
begin
i := lo;
j := up;
tempr := r[lo];
{*** Split file in two ***}
while i<j do
begin
while r[j]^.c > tempr^.c do
j := j-1; r[i] := r[j];
while (i<j) and (r[i]^.c<=tempr^.c) do
i := i+1;
r[j] := r[i];
end;
r[i] := tempr;
{*** Sort recursively ***}
iSort(r,lo,i-1);
lo := i+1
end
end;
begin
if Start = -1 then
Start := low(WhatNode^.below);
if Stop = -1 then
Stop := high(WhatNode^.below);
if (Start = Stop) or (Start=-1) or (Stop=-1) then
exit;
iSort(WhatNode^.below, Start, Stop);
end;

function TLexerClassifier.AddChar(WhatChar: Char; ParentNode: PLexerClassifierSymbol;
IsEnd: Boolean): PLexerClassifierSymbol;
begin
WhatChar := UpCase(WhatChar);
result := CharExists(WhatChar, ParentNode);
if assigned(result) then
result^.eow := result^.eow or IsEnd
else
begin
new(result);
result^.c := WhatChar;
result^.eow := IsEnd;
result^.data := ttUnknown;
SetLength(ParentNode^.below, length(ParentNode^.below)+1);
ParentNode^.below[length(ParentNode^.below)-1] := result;
Sort(ParentNode);
end;
end;

function TLexerClassifier.CharExists(WhatChar: Char;
ParentNode: PLexerClassifierSymbol; out IsEnd: Boolean): Boolean;
var
n : PLexerClassifierSymbol;
begin
IsEnd := false;
n := CharExists(WhatChar, ParentNode);
result := assigned(n);
if result then
IsEnd := n^.eow;
end;

function TLexerClassifier.CharExists(WhatChar: Char; ParentNode: PLexerClassifierSymbol
): PLexerClassifierSymbol;
var
f, l, j : Integer;
c : Char;
begin
result := nil;
f := 0;
l := length(ParentNode^.below)-1;
WhatChar := UpCase(WhatChar);
if l = -1 then
begin
result := nil;
exit;
end;
while (l-f) > 1 do
begin
j := (l+f) div 2;
c := ParentNode^.below[j]^.c;
if WhatChar <= c then
l := j
else
f := j;
end;
c := ParentNode^.below[l]^.c;
if c = WhatChar then
result := ParentNode^.below[l]
else if (l <> f) and (f>=0) then
begin
c := ParentNode^.below[f]^.c;
if c = WhatChar then
result := ParentNode^.below[f]
end;
end;

constructor TLexerClassifier.Create;
begin
end;

destructor TLexerClassifier.Destroy;
begin
Clear;
inherited Destroy;
end;

procedure TLexerClassifier.Clear;
var
i : Integer;
procedure ClearNode(WhatNode : PLexerClassifierSymbol);
var
c : Integer;
begin
for c := length(WhatNode^.below)-1 downto 0 do
ClearNode(WhatNode^.below[c]);
SetLength(WhatNode^.below, 0);
Freemem(WhatNode);
end;
begin
if Length(fData.below) = 0 then
exit;
for i := 0 to Length(fData.below)-1 do
ClearNode(fData.below[i]);
SetLength(fData.below, 0);
end;

procedure TLexerClassifier.Add(WhatSymbol: AnsiString; Value: LongInt);
var
n : PLexerClassifierSymbol;
pc : PChar;
begin
pc := PChar(WhatSymbol+#0);
n := @FData;
while pc^<>#0 do
begin
n := AddChar(pc^, n, (pc+1)^=#0);
inc(pc);
end;
n^.data := Value;
end;

function TLexerClassifier.Exists(WhatSymbol: AnsiString; AllowPartial : Boolean): Boolean;
var
n : PLexerClassifierSymbol;
pc : PChar;
begin
n := @FData;
pc := PChar(WhatSymbol+#0);
while (pc^<>#0) and assigned(n) do
begin
n := CharExists(pc^, n);
inc(pc);
end;
result := assigned(n) and (n^.eow or AllowPartial);
end;

function TLexerClassifier.SymbolData(WhatSymbol: AnsiString): LongInt;
var
n : PLexerClassifierSymbol;
pc : PChar;
begin
n := @FData;
pc := PChar(WhatSymbol+#0);
while (pc^<>#0) and assigned(n) do
begin
n := CharExists(pc^, n);
inc(pc);
end;
if assigned(n) and (n^.eow) then
result := n^.data
else
result := 0;
end;

{ TLexer }

function TLexer.GetSource: Pointer;
begin
result := FSource;
end;

function TLexer.Curr: char;
begin
result := (FSource + FPos)^;
end;

function TLexer.GetToken(index: integer): PLexerToken;
begin
Result := @FTokens[index];
end;

function TLexer.GetTokenCount: longint;
begin
Result := FTokenCount;
end;

procedure TLexer.ScanSymol;
var
n: PLexerClassifierSymbol;
c: Char;
begin
CurrToken^.TokenType := ttUnknown;
n := FClassifier.CharExists(Curr, FClassifier.Data);
while (Curr in SYM) do
begin
if (assigned(n)) then
begin
c := Next;
if(c in SYM) then
n := FClassifier.CharExists(c, n);
end
else
Next;
end;
if assigned(n) and (n^.eow) then
CurrToken^.TokenType := n^.Data;
end;

procedure TLexer.ScanNumber;
var
hasDot, hasE: boolean;
begin
hasDot := False;
hasE := False;
if(Curr = '-') then
Next;
while (Curr in NUM_EXT) do
begin
case Next of
'.':
begin
if (hasDot) then
Throw('Unexpected . in source');
hasDot := True;
end;
'e', 'E':
begin
if (hasE) then
Throw('Unexpected E in source');
hasE := True;
end;
end;
end;
CurrToken^.TokenType := ttNumber;
end;

procedure TLexer.ScanString(const strDelmChar : Char);
var
done: boolean;
begin
Match(strDelmChar);
done := False;
while ((not done) and (not EOF)) do
begin
if(Curr = strDelmChar) then
begin
if Peek <> strDelmChar then
begin
done := True;
Next;
end
else
Next(2);
end
else if (Curr in [#13, #10]) then
Throw('Unexpected end of line, expecting end of string')
else
Next;
end;
CurrToken^.TokenType:= ttString;
end;

procedure TLexer.ScanCommentLine(const CommentStart : AnsiString);
begin
Match(CommentStart);
while (not ((FSource + FPos + 1)^ in [#13, #10, #0])) do
Inc(FPos);
Inc(FPos);
CurrToken^.TokenType := ttComment;
end;

procedure TLexer.ScanMultiComment(const CommentClose: AnsiString);
begin
while (not match(CommentClose, false, false)) and (not EOF) do
next;
if match(CommentClose) then
CurrToken^.TokenType := ttComment;
end;

procedure TLexer.SkipWhite;
begin
CurrToken^.TokenType:= ttWhite;
while (Curr in WHITE_SPACE) do
Next;
end;

procedure TLexer.SetFileName(const AValue: AnsiString);
begin
if FFileName=AValue then exit;
FFileName:=AValue;
end;

procedure TLexer.SetOnError(const AValue: TOnLexerError);
begin
if FOnError=AValue then exit;
FOnError:=AValue;
end;

procedure TLexer.SetSource(const AValue: Pointer);
begin
FSource := AValue;
end;

procedure TLexer.Throw(msg: AnsiString);
var
post : AnsiString;
begin
msg := StringReplace(msg, #0, 'End of File', [rfReplaceAll]);
if assigned(FOnError) then
begin
FOnError(msg, FileName, CurrToken^.line, CurrToken^.col, FContinue, CurrToken);
if (not FContinue) then
raise exception.create(msg);
end
else
begin
if(FileName<>'')then
post := ' in file ' + post + '.'
else
post := '.';
raise Exception.CreateFmt('EXCEPTION: "%s" on line %d at col %d'+post, [msg, CurrToken^.line, CurrToken^.col]);
end;
end;

procedure TLexer.Throw(const msg: AnsiString; args: array of const);
begin
Throw(format(msg, args));
end;

function TLexer.Next(numCharsToSkip: longint): char;
var
c: char;
begin
if EOF then
begin
Result := #0;
exit;
end;
while (numCharsToSkip > 0) do
begin
Inc(FPos);
c := Curr;
case (c) of
#10:
begin
Inc(FLine);
FCol := 0;
if (c = #13) then
Inc(FPos);
end;
#13: ;
else
Inc(FCol);
end;
Dec(numCharsToSkip);
end;
Result := Curr;
end;

function TLexer.Peek: char;
begin
Result := (FSource + FPos + 1)^;
end;

function TLexer.Match(const str: ansistring; IncIfMatched: boolean;
ThrowIfNotMatched: boolean): boolean;
var
i: integer;
begin
Result := True;
i := 0;
while (Result and (i < length(str))) do
begin
Result := upcase(str[i + 1]) = upcase((FSource + FPos + i)^);
Inc(i);
end;
if ((not Result) and (ThrowIfNotMatched)) then
Throw('"%s" expected', [str]);
if Result and IncIfMatched then
Inc(FPos, length(str));
end;

constructor TLexer.Create;
begin
FClassifier := TLexerClassifier.Create;
end;

destructor TLexer.Destroy;
begin
FClassifier.Free;
inherited Destroy;
end;

function TLexer.Prev: char;
begin
Result := (FSource + FPos - 1)^;
end;

function TLexer.NewToken: PLexerToken;
begin
if (FTokenCount <= Length(FTokens)) then
SetLength(FTokens, FTokenCount + 1000);
Result := Token[FTokenCount];
Result^.col := FCol;
Result^.line := FLine;
if ((FLine = 1) and (FCol = 0)) then
Result^.srcpos := FPos
else
Result^.srcpos := FPos + 1;
Result^.Token := '';
Result^.TokenType := 0;
Inc(FTokenCount);
end;

function TLexer.CurrToken: PLexerToken;
begin
result := Token[FTokenCount-1];
end;

procedure TLexer.Reset;
begin
FContinue := true;
FLine := 1;
FCol := 0;
FPos := 0;
FTokenCount := 0;
SetLength(FTokens, 0);
end;

procedure TLexer.Step;
var
Start: longint;
tkn: PLexerToken;
begin
Start := FPos;
tkn := NewToken;
case Curr of
'a'..'z',
'A'..'Z',
'_' : ScanSymol;
'0'..'9': ScanNumber;
#1..#32,
#127,
#255 : SkipWhite;
#39,
'"' : ScanString(Curr);
else
tkn^.TokenType := ord(Curr);
Next;
end;
SetString(tkn^.Token, (FSource + Start), FPos - Start);
end;

function TLexer.Process : Boolean;
begin
result := true;
try
Reset;
while (not EOF) and FContinue do
Step;
except
on e:Exception do
begin
result := false;
if (FContinue = true) then
raise e;
end;
end;
end;

function TLexer.EOF: boolean;
begin
Result := Curr = #0;
end;

function TLexer.BOF: boolean;
begin
result := FPos <= 0;
end;

end.


I can make it do only what you asked, or write a faster routine that does it, but this is just too convenient to keep around. And yes, I did write every line of it LOL.

- Jeremy

arthurprs
09-06-2009, 05:17 PM
Arthur: offource it parses that line by line, the problem is it uses single space to create new item in array while it shouldnt. the original and athena's function parse it properly.


when i was modifying eAthena function
i noticed that 'a,,b' becomes ['a', 'b'] when i think the right is ['a', '', 'b'] (same as python and php results) so i modified it xx(

here is the functions,
MyExplode is the reviewed version of the original function
MyExplode2 my version that works like you want
function MyExplode(const src: string): TStringArray;
var
idx: Integer;
count: Integer;
CharPtr: PChar;
aChar: Char;
toklen: Integer;
f: Integer;
begin
CharPtr := Pointer(src);
if CharPtr = nil then Exit;
idx := 1;
f := 1;

toklen := 10;
SetLength(Result, toklen);
count := 0;

while (CharPtr^ <> #0) do
begin
aChar := CharPtr^;
Inc(CharPtr);

if (aChar = (' ')) or (aChar = (';')) or
(aChar = (',')) or (aChar = (#09)) then
begin
if (count + 1 > toklen) then
begin
toklen := toklen + (toklen div 2);
SetLength(Result, toklen);
end;
Result[count] := Copy(src, f, idx - f);
f := idx + 1;
Inc(count);
end;
Inc(idx);
end;

if (idx >= f) then
begin
if (count + 1 > toklen) then
begin
Inc(toklen);
SetLength(Result, toklen);
end;
Result[count] := Copy(src, f, MaxInt);
Inc(count);
end;

if (toklen > count) then
SetLength(Result, count);
end;


function MyExplode2(const src: string): TStringArray;
var
idx: Integer;
count: Integer;
CharPtr: PChar;
aChar: Char;
toklen: Integer;
f: Integer;
valid: Integer;
begin
CharPtr := Pointer(src);
if CharPtr = nil then Exit;
idx := 1;
f := 1;
valid := 0;

toklen := 10;
SetLength(Result, toklen);
count := 0;

while (CharPtr^ <> #0) do
begin
aChar := CharPtr^;
Inc(CharPtr);

if (aChar = (' ')) or (aChar = (';')) or
(aChar = (',')) or (aChar = (#09)) then
begin
if (valid <> 0) then
begin
if (count + 1 > toklen) then
begin
toklen := toklen + (toklen div 2);
SetLength(Result, toklen);
end;
Result[count] := Copy(src, f, idx - f);
f := idx + 1;
valid := 0;
Inc(count);
end
else
Inc(f);
end
else
Inc(valid);

Inc(idx);
end;

if (valid <> 0) then
begin
if (count + 1 > toklen) then
begin
Inc(toklen);
SetLength(Result, toklen);
end;
Result[count] := Copy(src, f, MaxInt);
Inc(count);
end;

if (toklen > count) then
SetLength(Result, count);
end;