unit HtmlParser; interface uses DomCore, HtmlReader, HtmlTags; type THtmlParser = class private FHtmlDocument: TDocument; FHtmlReader: THtmlReader; FCurrentNode: TNode; FCurrentTag: THtmlTag; function FindDefParent: TElement; function FindParent: TElement; function FindParentElement(tagList: THtmlTagSet): TElement; function FindTableParent: TElement; function FindThisElement: TElement; function GetMainElement(const tagName: TDomString): TElement; procedure ProcessAttributeEnd(Sender: TObject); procedure ProcessAttributeStart(Sender: TObject); procedure ProcessCDataSection(Sender: TObject); procedure ProcessComment(Sender: TObject); procedure ProcessDocType(Sender: TObject); procedure ProcessElementEnd(Sender: TObject); procedure ProcessElementStart(Sender: TObject); procedure ProcessEndElement(Sender: TObject); procedure ProcessEntityReference(Sender: TObject); procedure ProcessTextNode(Sender: TObject); public constructor Create; destructor Destroy; override; function parseString(const htmlStr: TDomString): TDocument; property HtmlDocument: TDocument read FHtmlDocument; end; implementation const htmlTagName = 'html'; headTagName = 'head'; bodyTagName = 'body'; constructor THtmlParser.Create; begin inherited Create; FHtmlReader := THtmlReader.Create; with FHtmlReader do begin OnAttributeEnd := ProcessAttributeEnd; OnAttributeStart := ProcessAttributeStart; OnCDataSection := ProcessCDataSection; OnComment := ProcessComment; OnDocType := ProcessDocType; OnElementEnd := ProcessElementEnd; OnElementStart := ProcessElementStart; OnEndElement := ProcessEndElement; OnEntityReference := ProcessEntityReference; //OnNotation := ProcessNotation; //OnProcessingInstruction := ProcessProcessingInstruction; OnTextNode := ProcessTextNode; end end; destructor THtmlParser.Destroy; begin FHtmlReader.Free; inherited Destroy end; function THtmlParser.FindDefParent: TElement; begin if FCurrentTag.Number in [HEAD_TAG, BODY_TAG] then Result := FHtmlDocument.appendChild(FHtmlDocument.createElement(htmlTagName)) as TElement else if FCurrentTag.Number in HeadTags then Result := GetMainElement(headTagName) else Result := GetMainElement(bodyTagName) end; function THtmlParser.FindParent: TElement; begin if (FCurrentTag.Number = P_TAG) or (FCurrentTag.Number in BlockTags) then Result := FindParentElement(BlockParentTags) else if FCurrentTag.Number = LI_TAG then Result := FindParentElement(ListItemParentTags) else if FCurrentTag.Number in [DD_TAG, DT_TAG] then Result := FindParentElement(DefItemParentTags) else if FCurrentTag.Number in [TD_TAG, TH_TAG] then Result := FindParentElement(CellParentTags) else if FCurrentTag.Number = TR_TAG then Result := FindParentElement(RowParentTags) else if FCurrentTag.Number = COL_TAG then Result := FindParentElement(ColParentTags) else if FCurrentTag.Number in [COLGROUP_TAG, THEAD_TAG, TFOOT_TAG, TBODY_TAG] then Result := FindParentElement(TableSectionParentTags) else if FCurrentTag.Number = TABLE_TAG then Result := FindTableParent else if FCurrentTag.Number = OPTION_TAG then Result := FindParentElement(OptionParentTags) else if FCurrentTag.Number in [HEAD_TAG, BODY_TAG] then Result := FHtmlDocument.documentElement as TElement else Result := nil; if Result = nil then Result := FindDefParent end; function THtmlParser.FindParentElement(tagList: THtmlTagSet): TElement; var Node: TNode; HtmlTag: THtmlTag; begin Node := FCurrentNode; while Node.nodeType = ELEMENT_NODE do begin HtmlTag := HtmlTagList.GetTagByName(Node.nodeName); if HtmlTag.Number in tagList then begin Result := Node as TElement; Exit end; Node := Node.parentNode end; Result := nil end; function THtmlParser.FindTableParent: TElement; var Node: TNode; HtmlTag: THtmlTag; begin Node := FCurrentNode; while Node.nodeType = ELEMENT_NODE do begin HtmlTag := HtmlTagList.GetTagByName(Node.nodeName); if (HtmlTag.Number = TD_TAG) or (HtmlTag.Number in BlockTags) then begin Result := Node as TElement; Exit end; Node := Node.parentNode end; Result := GetMainElement(bodyTagName) end; function THtmlParser.FindThisElement: TElement; var Node: TNode; begin Node := FCurrentNode; while Node.nodeType = ELEMENT_NODE do begin Result := Node as TElement; if Result.tagName = FHtmlReader.nodeName then Exit; Node := Node.parentNode end; Result := nil end; function THtmlParser.GetMainElement(const tagName: TDomString): TElement; var child: TNode; I: Integer; begin if FHtmlDocument.documentElement = nil then FHtmlDocument.appendChild(FHtmlDocument.createElement(htmlTagName)); for I := 0 to FHtmlDocument.documentElement.childNodes.length - 1 do begin child := FHtmlDocument.documentElement.childNodes.item(I); if (child.nodeType = ELEMENT_NODE) and (child.nodeName = tagName) then begin Result := child as TElement; Exit end end; Result := FHtmlDocument.createElement(tagName); FHtmlDocument.documentElement.appendChild(Result) end; procedure THtmlParser.ProcessAttributeEnd(Sender: TObject); begin FCurrentNode := (FCurrentNode as TAttr).ownerElement end; procedure THtmlParser.ProcessAttributeStart(Sender: TObject); var Attr: TAttr; begin Attr := FHtmlDocument.createAttribute((Sender as THtmlReader).nodeName); (FCurrentNode as TElement).setAttributeNode(Attr); FCurrentNode := Attr end; procedure THtmlParser.ProcessCDataSection(Sender: TObject); var CDataSection: TCDataSection; begin CDataSection := FHtmlDocument.createCDATASection(FHtmlReader.nodeValue); FCurrentNode.appendChild(CDataSection) end; procedure THtmlParser.ProcessComment(Sender: TObject); var Comment: TComment; begin Comment := FHtmlDocument.createComment(FHtmlReader.nodeValue); FCurrentNode.appendChild(Comment) end; procedure THtmlParser.ProcessDocType(Sender: TObject); begin with FHtmlReader do FHtmlDocument.docType := DomImplementation.createDocumentType(nodeName, publicID, systemID); end; procedure THtmlParser.ProcessElementEnd(Sender: TObject); begin if FHtmlReader.isEmptyElement or (FCurrentTag.Number in EmptyTags) then FCurrentNode := FCurrentNode.parentNode; FCurrentTag := nil end; procedure THtmlParser.ProcessElementStart(Sender: TObject); var Element: TElement; Parent: TNode; begin FCurrentTag := HtmlTagList.GetTagByName(FHtmlReader.nodeName); if FCurrentTag.Number in NeedFindParentTags + BlockTags then begin Parent := FindParent; if not Assigned(Parent) then raise DomException.Create(HIERARCHY_REQUEST_ERR); FCurrentNode := Parent end; Element := FHtmlDocument.createElement(FHtmlReader.nodeName); FCurrentNode.appendChild(Element); FCurrentNode := Element end; procedure THtmlParser.ProcessEndElement(Sender: TObject); var Element: TElement; begin Element := FindThisElement; if Assigned(Element) then FCurrentNode := Element.parentNode { else if IsBlockTagName(FHtmlReader.nodeName) then raise DomException.Create(HIERARCHY_REQUEST_ERR)} end; procedure THtmlParser.ProcessEntityReference(Sender: TObject); var EntityReference: TEntityReference; begin EntityReference := FHtmlDocument.createEntityReference(FHtmlReader.nodeName); FCurrentNode.appendChild(EntityReference) end; procedure THtmlParser.ProcessTextNode(Sender: TObject); var TextNode: TTextNode; begin TextNode := FHtmlDocument.createTextNode(FHtmlReader.nodeValue); FCurrentNode.appendChild(TextNode) end; function THtmlParser.parseString(const htmlStr: TDomString): TDocument; begin FHtmlReader.htmlStr := htmlStr; FHtmlDocument := DomImplementation.createEmptyDocument(nil); FCurrentNode := FHtmlDocument; try while FHtmlReader.Read do; except // TODO: Add event ? end; Result := FHtmlDocument end; end.