unit HtmlReader;
interface
uses
Classes, DomCore;
type
TDelimiters = set of Byte;
TReaderState = (rsInitial, rsBeforeAttr, rsBeforeValue, rsInValue, rsInQuotedValue);
THtmlReader = class
private
FHtmlStr: TDomString;
FPosition: Integer;
FNodeType: Integer;
FPrefix: TDomString;
FLocalName: TDomString;
FNodeValue: TDomString;
FPublicID: TDomString;
FSystemID: TDomString;
FIsEmptyElement: Boolean;
FState: TReaderState;
FQuotation: Word;
FOnAttributeEnd: TNotifyEvent;
FOnAttributeStart: TNotifyEvent;
FOnCDataSection: TNotifyEvent;
FOnComment: TNotifyEvent;
FOnDocType: TNotifyEvent;
FOnElementEnd: TNotifyEvent;
FOnElementStart: TNotifyEvent;
FOnEndElement: TNotifyEvent;
FOnEntityReference: TNotifyEvent;
FOnNotation: TNotifyEvent;
FOnProcessingInstruction: TNotifyEvent;
FOnTextNode: TNotifyEvent;
function GetNodeName: TDomString;
function GetToken(Delimiters: TDelimiters): TDomString;
function IsAttrTextChar: Boolean;
function IsDigit(HexBase: Boolean): Boolean;
function IsEndEntityChar: Boolean;
function IsEntityChar: Boolean;
function IsEqualChar: Boolean;
function IsHexEntityChar: Boolean;
function IsNumericEntity: Boolean;
function IsQuotation: Boolean;
function IsSlashChar: Boolean;
function IsSpecialTagChar: Boolean;
function IsStartCharacterData: Boolean;
function IsStartComment: Boolean;
function IsStartDocumentType: Boolean;
function IsStartEntityChar: Boolean;
function IsStartMarkupChar: Boolean;
function IsStartTagChar: Boolean;
function Match(const Signature: TDomString; IgnoreCase: Boolean): Boolean;
function ReadAttrNode: Boolean;
function ReadAttrTextNode: Boolean;
function ReadCharacterData: Boolean;
function ReadComment: Boolean;
function ReadDocumentType: Boolean;
function ReadElementNode: Boolean;
function ReadEndElementNode: Boolean;
function ReadEntityNode: Boolean;
function ReadNamedEntityNode: Boolean;
function ReadNumericEntityNode: Boolean;
function ReadQuotedValue(var Value: TDomString): Boolean;
function ReadSpecialNode: Boolean;
function ReadTagNode: Boolean;
function ReadValueNode: Boolean;
function SkipTo(const Signature: TDomString): Boolean;
procedure FireEvent(Event: TNotifyEvent);
procedure ReadElementTail;
procedure ReadTextNode;
procedure SetHtmlStr(const Value: TDomString);
procedure SetNodeName(Value: TDomString);
procedure SkipWhiteSpaces;
public
constructor Create;
function read: Boolean;
property htmlStr: TDomString read FHtmlStr write SetHtmlStr;
property isEmptyElement: Boolean read FIsEmptyElement;
property localName: TDomString read FLocalName;
property nodeName: TDomString read GetNodeName;
property nodeType: Integer read FNodeType;
property position: Integer read FPosition;
property prefix: TDomString read FPrefix;
property publicID: TDomString read FPublicID;
property state: TReaderState read FState;
property systemID: TDomString read FSystemID;
property nodeValue: TDomString read FNodeValue;
property OnAttributeEnd: TNotifyEvent read FOnAttributeEnd write FOnAttributeEnd;
property OnAttributeStart: TNotifyEvent read FOnAttributeStart write FOnAttributeStart;
property OnCDataSection: TNotifyEvent read FOnCDataSection write FOnCDataSection;
property OnComment: TNotifyEvent read FOnComment write FOnComment;
property OnDocType: TNotifyEvent read FOnDocType write FOnDocType;
property OnElementEnd: TNotifyEvent read FOnElementEnd write FOnElementEnd;
property OnElementStart: TNotifyEvent read FOnElementStart write FOnElementStart;
property OnEndElement: TNotifyEvent read FOnEndElement write FOnEndElement;
property OnEntityReference: TNotifyEvent read FOnEntityReference write FOnEntityReference;
property OnNotation: TNotifyEvent read FOnNotation write FOnNotation;
property OnProcessingInstruction: TNotifyEvent read FOnProcessingInstruction write FOnProcessingInstruction;
property OnTextNode: TNotifyEvent read FOnTextNode write FOnTextNode;
end;
implementation
uses
SysUtils;
const
startTagChar = Ord('<');
endTagChar = Ord('>');
specialTagChar = Ord('!');
slashChar = Ord('/');
equalChar = Ord('=');
quotation = [Ord(''''), Ord('"')];
tagDelimiter = [slashChar, endTagChar];
tagNameDelimiter = whiteSpace + tagDelimiter;
attrNameDelimiter = tagNameDelimiter + [equalChar];
startEntity = Ord('&');
startMarkup = [startTagChar, startEntity];
endEntity = Ord(';');
notEntity = [endEntity] + startMarkup + whiteSpace;
notAttrText = whiteSpace + quotation + tagDelimiter;
numericEntity = Ord('#');
hexEntity = [Ord('x'), Ord('X')];
decDigit = [Ord('0')..Ord('9')];
hexDigit = [Ord('a')..Ord('f'), Ord('A')..Ord('F')];
DocTypeStartStr = 'DOCTYPE';
DocTypeEndStr = '>';
CDataStartStr = '[CDATA[';
CDataEndStr = ']]>';
CommentStartStr = '--';
CommentEndStr = '-->';
function DecValue(const Digit: WideChar): Word;
begin
Result := Ord(Digit) - Ord('0')
end;
function HexValue(const HexChar: WideChar): Word;
var
C: Char;
begin
if Ord(HexChar) in decDigit then
Result := Ord(HexChar) - Ord('0')
else
begin
C := UpCase(Chr(Ord(HexChar)));
Result := Ord(C) - Ord('A')
end
end;
constructor THtmlReader.Create;
begin
inherited Create;
FHtmlStr := HtmlStr;
FPosition := 1
end;
function THtmlReader.GetNodeName: TDomString;
begin
if FPrefix <> '' then
Result := FPrefix + ':' + FLocalName
else
Result := FLocalName
end;
function THtmlReader.GetToken(Delimiters: TDelimiters): TDomString;
var
Start: Integer;
begin
Start := FPosition;
while (FPosition <= Length(FHtmlStr)) and not (Ord(FHtmlStr[FPosition]) in Delimiters) do
Inc(FPosition);
Result := Copy(FHtmlStr, Start, FPosition - Start)
end;
function THtmlReader.IsAttrTextChar: Boolean;
var
WC: WideChar;
begin
WC := FHtmlStr[FPosition];
if FState = rsInQuotedValue then
Result := (Ord(WC) <> FQuotation) and (Ord(WC) <> startEntity)
else
Result := not (Ord(WC) in notAttrText)
end;
function THtmlReader.IsDigit(HexBase: Boolean): Boolean;
var
WC: WideChar;
begin
WC := FHtmlStr[FPosition];
Result := Ord(WC) in decDigit;
if not Result and HexBase then
Result := Ord(WC) in hexDigit
end;
function THtmlReader.IsEndEntityChar: Boolean;
var
WC: WideChar;
begin
WC := FHtmlStr[FPosition];
Result := Ord(WC) = endEntity
end;
function THtmlReader.IsEntityChar: Boolean;
var
WC: WideChar;
begin
WC := FHtmlStr[FPosition];
Result := not (Ord(WC) in notEntity)
end;
function THtmlReader.IsEqualChar: Boolean;
var
WC: WideChar;
begin
WC := FHtmlStr[FPosition];
Result := Ord(WC) = equalChar
end;
function THtmlReader.IsHexEntityChar: Boolean;
var
WC: WideChar;
begin
WC := FHtmlStr[FPosition];
Result := Ord(WC) in hexEntity
end;
function THtmlReader.IsNumericEntity: Boolean;
var
WC: WideChar;
begin
WC := FHtmlStr[FPosition];
Result := Ord(WC) = numericEntity
end;
function THtmlReader.IsQuotation: Boolean;
var
WC: WideChar;
begin
WC := FHtmlStr[FPosition];
if FQuotation = 0 then
Result := Ord(WC) in quotation
else
Result := Ord(WC) = FQuotation
end;
function THtmlReader.IsSlashChar: Boolean;
var
WC: WideChar;
begin
WC := FHtmlStr[FPosition];
Result := Ord(WC) = slashChar
end;
function THtmlReader.IsSpecialTagChar: Boolean;
var
WC: WideChar;
begin
WC := FHtmlStr[FPosition];
Result := Ord(WC) = specialTagChar
end;
function THtmlReader.IsStartCharacterData: Boolean;
begin
Result := Match(CDataStartStr, false)
end;
function THtmlReader.IsStartComment: Boolean;
begin
Result := Match(CommentStartStr, false)
end;
function THtmlReader.IsStartDocumentType: Boolean;
begin
Result := Match(DocTypeStartStr, true)
end;
function THtmlReader.IsStartEntityChar: Boolean;
var
WC: WideChar;
begin
WC := FHtmlStr[FPosition];
Result := Ord(WC) = startEntity
end;
function THtmlReader.IsStartMarkupChar: Boolean;
var
WC: WideChar;
begin
WC := FHtmlStr[FPosition];
Result := Ord(WC) in startMarkup
end;
function THtmlReader.IsStartTagChar: Boolean;
var
WC: WideChar;
begin
WC := FHtmlStr[FPosition];
Result := Ord(WC) = startTagChar
end;
function THtmlReader.Match(const Signature: TDomString; IgnoreCase: Boolean): Boolean;
var
I, J: Integer;
W1, W2: WideChar;
begin
Result := false;
for I := 1 to Length(Signature) do
begin
J := FPosition + I - 1;
if (J < 1) or (J > Length(FHtmlStr)) then
Exit;
W1 := Signature[I];
W2 := FHtmlStr[J];
if (W1 <> W2) and (not IgnoreCase or (UpperCase(W1) <> UpperCase(W2))) then
Exit
end;
Result := true
end;
function THtmlReader.ReadAttrNode: Boolean;
var
AttrName: TDomString;
begin
Result := false;
SkipWhiteSpaces;
AttrName := LowerCase(GetToken(attrNameDelimiter));
if AttrName = '' then
Exit;
SetNodeName(AttrName);
FireEvent(FOnAttributeStart);
FState := rsBeforeValue;
FQuotation := 0;
Result := true
end;
function THtmlReader.ReadAttrTextNode: Boolean;
var
Start: Integer;
begin
Result := false;
Start := FPosition;
while (FPosition <= Length(FHtmlStr)) and IsAttrTextChar do
Inc(FPosition);
if FPosition = Start then
Exit;
FNodeType := TEXT_NODE;
FNodeValue:= Copy(FHtmlStr, Start, FPosition - Start);
FireEvent(FOnTextNode);
Result := true
end;
function THtmlReader.ReadCharacterData: Boolean;
var
StartPos: Integer;
begin
Inc(FPosition, Length(CDataStartStr));
StartPos := FPosition;
Result := SkipTo(CDataEndStr);
if Result then
begin
FNodeType := CDATA_SECTION_NODE;
FNodeValue := Copy(FHtmlStr, StartPos, FPosition - StartPos - Length(CDataEndStr));
FireEvent(FOnCDataSection)
end
end;
function THtmlReader.ReadComment: Boolean;
var
StartPos: Integer;
begin
Inc(FPosition, Length(CommentStartStr));
StartPos := FPosition;
Result := SkipTo(CommentEndStr);
if Result then
begin
FNodeType := COMMENT_NODE;
FNodeValue := Copy(FHtmlStr, StartPos, FPosition - StartPos - Length(CommentEndStr));
FireEvent(FOnComment)
end
end;
function THtmlReader.ReadDocumentType: Boolean;
var
Name: TDomString;
begin
Result := false;
Inc(FPosition, Length(DocTypeStartStr));
SkipWhiteSpaces;
Name := GetToken(tagNameDelimiter);
if Name = '' then
Exit;
SetNodeName(Name);
SkipWhiteSpaces;
GetToken(tagNameDelimiter);
SkipWhiteSpaces;
if not ReadQuotedValue(FPublicID) then
Exit;
SkipWhiteSpaces;
if FHtmlStr[FPosition] = '"' then
begin
if not ReadQuotedValue(FSystemID) then
Exit
end;
Result := SkipTo(DocTypeEndStr)
end;
function THtmlReader.ReadElementNode: Boolean;
var
TagName: TDomString;
begin
Result := false;
if FPosition < Length(FHtmlStr) then
begin
TagName := LowerCase(GetToken(tagNameDelimiter));
if TagName = '' then
Exit;
FNodeType := ELEMENT_NODE;
SetNodeName(TagName);
FState := rsBeforeAttr;
FireEvent(FOnElementStart);
Result := true
end
end;
function THtmlReader.ReadEndElementNode: Boolean;
var
TagName: TDomString;
begin
Result := false;
Inc(FPosition);
if FPosition > Length(FHtmlStr) then
Exit;
TagName := LowerCase(GetToken(tagNameDelimiter));
if TagName = '' then
Exit;
Result := SkipTo(WideChar(endTagChar));
if Result then
begin
FNodeType := END_ELEMENT_NODE;
SetNodeName(TagName);
FireEvent(FOnEndElement);
Result := true
end
end;
function THtmlReader.ReadEntityNode: Boolean;
var
CurrPos: Integer;
begin
Result := false;
CurrPos := FPosition;
Inc(FPosition);
if FPosition > Length(FHtmlStr) then
Exit;
if IsNumericEntity then
begin
Inc(FPosition);
Result := ReadNumericEntityNode
end
else
Result := ReadNamedEntityNode;
if Result then
begin
FNodeType := ENTITY_REFERENCE_NODE;
//FireEvent(FOnEntityReference); VVV - remove, entity node is added in ReadXXXEntityNode
end
else
FPosition := CurrPos
end;
function THtmlReader.ReadNamedEntityNode: Boolean;
var
Start: Integer;
begin
Result := false;
if FPosition > Length(FHtmlStr) then
Exit;
Start := FPosition;
while (FPosition <= Length(FHtmlStr)) and IsEntityChar do
Inc(FPosition);
if (FPosition > Length(FHtmlStr)) or not IsEndEntityChar then
Exit;
FNodeType := ENTITY_REFERENCE_NODE;
SetNodeName(Copy(FHtmlStr, Start, FPosition - Start));
Inc(FPosition);
FireEvent(FOnEntityReference);
Result := true
end;
function THtmlReader.ReadNumericEntityNode: Boolean;
var
Value: Word;
HexBase: Boolean;
begin
Result := false;
if FPosition > Length(FHtmlStr) then
Exit;
HexBase := IsHexEntityChar;
if HexBase then
Inc(FPosition);
Value := 0;
while (FPosition <= Length(FHtmlStr)) and IsDigit(HexBase) do
begin
try
if HexBase then
Value := Value * 16 + HexValue(FHtmlStr[FPosition])
else
Value := Value * 10 + DecValue(FHtmlStr[FPosition])
except
Exit
end;
Inc(FPosition)
end;
if (FPosition > Length(FHtmlStr)) or not IsEndEntityChar then
Exit;
Inc(FPosition);
FNodeType := TEXT_NODE;
//FNodeValue := WideChar(Value);
FNodeValue := System.UTF8Encode(WideChar(Value)); // !!! Ben degistirdim. UTF8' e zorladim.
FireEvent(FOnTextNode);
Result := true
end;
function THtmlReader.ReadQuotedValue(var Value: TDomString): Boolean;
var
QuotedChar: WideChar;
Start: Integer;
begin
QuotedChar := FHtmlStr[FPosition];
Inc(FPosition);
Start := FPosition;
Result := SkipTo(QuotedChar);
if Result then
Value := Copy(FHtmlStr, Start, FPosition - Start)
end;
function THtmlReader.ReadSpecialNode: Boolean;
begin
Result := false;
Inc(FPosition);
if FPosition > Length(FHtmlStr) then
Exit;
if IsStartDocumentType then
Result := ReadDocumentType
else
if IsStartCharacterData then
Result := ReadCharacterData
else
if IsStartComment then
Result := ReadComment
end;
function THtmlReader.ReadTagNode: Boolean;
var
CurrPos: Integer;
begin
Result := false;
CurrPos := FPosition;
Inc(FPosition);
if FPosition > Length(FHtmlStr) then
Exit;
if IsSlashChar then
Result := ReadEndElementNode
else
if IsSpecialTagChar then
Result := ReadSpecialNode
else
Result := ReadElementNode;
if not Result then
FPosition := CurrPos
end;
function THtmlReader.SkipTo(const Signature: TDomString): Boolean;
begin
while FPosition <= Length(FHtmlStr) do
begin
if Match(Signature, false) then
begin
Inc(FPosition, Length(Signature));
Result := true;
Exit
end;
Inc(FPosition)
end;
Result := false
end;
procedure THtmlReader.FireEvent(Event: TNotifyEvent);
begin
if Assigned(Event) then
Event(Self)
end;
function THtmlReader.read: Boolean;
begin
FNodeType := NONE;
FPrefix := '';
FLocalName := '';
FNodeValue := '';
FPublicID := '';
FSystemID := '';
FIsEmptyElement := false;
Result := false;
if FPosition > Length(FHtmlStr) then
Exit;
Result := true;
if FState in [rsBeforeValue, rsInValue, rsInQuotedValue] then
begin
if ReadValueNode then
Exit;
if FState = rsInQuotedValue then
Inc(FPosition);
FNodeType := ATTRIBUTE_NODE;
FireEvent(FOnAttributeEnd);
FState := rsBeforeAttr
end
else
if FState = rsBeforeAttr then
begin
if ReadAttrNode then
Exit;
ReadElementTail;
FState := rsInitial;
end
else
if IsStartTagChar then
begin
if ReadTagNode then
Exit;
Inc(FPosition);
FNodeType := ENTITY_REFERENCE_NODE;
SetNodeName('lt');
FireEvent(FOnEntityReference);
end
else
if IsStartEntityChar then
begin
if ReadEntityNode then
Exit;
Inc(FPosition);
FNodeType := ENTITY_REFERENCE_NODE;
SetNodeName('amp');
FireEvent(FOnEntityReference)
end
else
ReadTextNode
end;
procedure THtmlReader.ReadTextNode;
var
Start: Integer;
begin
Start := FPosition;
repeat
Inc(FPosition)
until (FPosition > Length(FHtmlStr)) or IsStartMarkupChar;
FNodeType := TEXT_NODE;
FNodeValue:= Copy(FHtmlStr, Start, FPosition - Start);
FireEvent(FOnTextNode)
end;
function THtmlReader.ReadValueNode: Boolean;
begin
Result := false;
if FState = rsBeforeValue then
begin
SkipWhiteSpaces;
if FPosition > Length(FHtmlStr) then
Exit;
if not IsEqualChar then
Exit;
Inc(FPosition);
SkipWhiteSpaces;
if FPosition > Length(FHtmlStr) then
Exit;
if IsQuotation then
begin
FQuotation := Ord(FHtmlStr[FPosition]);
Inc(FPosition);
FState := rsInQuotedValue
end
else
FState := rsInValue
end;
if FPosition > Length(FHtmlStr) then
Exit;
if IsStartEntityChar then
begin
Result := true;
if ReadEntityNode then
Exit;
Inc(FPosition);
FNodeType := ENTITY_REFERENCE_NODE;
SetNodeName('amp');
FireEvent(FOnEntityReference)
end
else
Result := ReadAttrTextNode
end;
procedure THtmlReader.ReadElementTail;
begin
SkipWhiteSpaces;
if (FPosition <= Length(FHtmlStr)) and IsSlashChar then
begin
FIsEmptyElement := true;
Inc(FPosition)
end;
SkipTo(WideChar(endTagChar));
FNodeType := ELEMENT_NODE;
FireEvent(FOnElementEnd)
end;
procedure THtmlReader.SetHtmlStr(const Value: TDomString);
begin
FHtmlStr := Value;
FPosition := 1
end;
procedure THtmlReader.SetNodeName(Value: TDomString);
var
I: Integer;
begin
I := Pos(':', Value);
if I > 0 then
begin
FPrefix := Copy(Value, 1, I - 1);
FLocalName := Copy(Value, I + 1, Length(Value) - I)
end
else
begin
FPrefix := '';
FLocalName := Value
end
end;
procedure THtmlReader.SkipWhiteSpaces;
begin
while (FPosition <= Length(FHtmlStr)) and (Ord(FHtmlStr[FPosition]) in whiteSpace) do
Inc(FPosition)
end;
end.