NOTE: The behaviour might change in future versions as it is not clear what "wild HTML the real world uses" really implies.
It can be used to parse a wild HTML document and output it as a valid XHTML document (well, if you are lucky):
echo loadHtml("mydirty.html")
Every tag in the resulting tree is in lower case.
Note: The resulting XmlNode already uses the clientData field, so that field cannot be used by clients of this library.
Example: Transforming hyperlinks
This code demonstrates how you can iterate over all the tags in an HTML file and write back the modified version. In this case we look for hyperlinks ending with the extension .rst and convert them to .html.
import htmlparser import xmltree # To use '$' for XmlNode import strtabs # To access XmlAttributes import os # To use splitFile import strutils # To use cmpIgnoreCase proc transformHyperlinks() = let html = loadHtml("input.html") for a in html.findAll("a"): if a.attrs.hasKey "href": let (dir, filename, ext) = splitFile(a.attrs["href"]) if cmpIgnoreCase(ext, ".rst") == 0: a.attrs["href"] = dir / filename & ".html" writeFile("output.html", $html)
Types
HtmlTag = enum tagUnknown, tagA, tagAbbr, tagAcronym, tagAddress, tagApplet, tagArea, tagArticle, tagAside, tagAudio, tagB, tagBase, tagBdi, tagBdo, tagBasefont, tagBig, tagBlockquote, tagBody, tagBr, tagButton, tagCanvas, tagCaption, tagCenter, tagCite, tagCode, tagCol, tagColgroup, tagCommand, tagDatalist, tagDd, tagDel, tagDetails, tagDfn, tagDialog, tagDiv, tagDir, tagDl, tagDt, tagEm, tagEmbed, tagFieldset, tagFigcaption, tagFigure, tagFont, tagFooter, tagForm, tagFrame, tagFrameset, tagH1, tagH2, tagH3, tagH4, tagH5, tagH6, tagHead, tagHeader, tagHgroup, tagHtml, tagHr, tagI, tagIframe, tagImg, tagInput, tagIns, tagIsindex, tagKbd, tagKeygen, tagLabel, tagLegend, tagLi, tagLink, tagMap, tagMark, tagMenu, tagMeta, tagMeter, tagNav, tagNobr, tagNoframes, tagNoscript, tagObject, tagOl, tagOptgroup, tagOption, tagOutput, tagP, tagParam, tagPre, tagProgress, tagQ, tagRp, tagRt, tagRuby, tagS, tagSamp, tagScript, tagSection, tagSelect, tagSmall, tagSource, tagSpan, tagStrike, tagStrong, tagStyle, tagSub, tagSummary, tagSup, tagTable, tagTbody, tagTd, tagTextarea, tagTfoot, tagTh, tagThead, tagTime, tagTitle, tagTr, tagTrack, tagTt, tagU, tagUl, tagVar, tagVideo, tagWbr
- List of all supported HTML tags; the order will always be alphabetical. Source Edit
Consts
tagToStr = ["a", "abbr", "acronym", "address", "applet", "area", "article", "aside", "audio", "b", "base", "basefont", "bdi", "bdo", "big", "blockquote", "body", "br", "button", "canvas", "caption", "center", "cite", "code", "col", "colgroup", "command", "datalist", "dd", "del", "details", "dfn", "dialog", "div", "dir", "dl", "dt", "em", "embed", "fieldset", "figcaption", "figure", "font", "footer", "form", "frame", "frameset", "h1", "h2", "h3", "h4", "h5", "h6", "head", "header", "hgroup", "html", "hr", "i", "iframe", "img", "input", "ins", "isindex", "kbd", "keygen", "label", "legend", "li", "link", "map", "mark", "menu", "meta", "meter", "nav", "nobr", "noframes", "noscript", "object", "ol", "optgroup", "option", "output", "p", "param", "pre", "progress", "q", "rp", "rt", "ruby", "s", "samp", "script", "section", "select", "small", "source", "span", "strike", "strong", "style", "sub", "summary", "sup", "table", "tbody", "td", "textarea", "tfoot", "th", "thead", "time", "title", "tr", "track", "tt", "u", "ul", "var", "video", "wbr"]
- Source Edit
InlineTags = {tagA, tagAbbr, tagAcronym, tagApplet, tagB, tagBasefont, tagBdo, tagBig, tagBr, tagButton, tagCite, tagCode, tagDel, tagDfn, tagEm, tagFont, tagI, tagImg, tagIns, tagInput, tagIframe, tagKbd, tagLabel, tagMap, tagObject, tagQ, tagSamp, tagScript, tagSelect, tagSmall, tagSpan, tagStrong, tagSub, tagSup, tagTextarea, tagTt, tagVar, tagApplet, tagBasefont, tagFont, tagIframe, tagU, tagS, tagStrike, tagWbr}
- Source Edit
BlockTags = {tagAddress, tagBlockquote, tagCenter, tagDel, tagDir, tagDiv, tagDl, tagFieldset, tagForm, tagH1, tagH2, tagH3, tagH4, tagH5, tagH6, tagHr, tagIns, tagIsindex, tagMenu, tagNoframes, tagNoscript, tagOl, tagP, tagPre, tagTable, tagUl, tagCenter, tagDir, tagIsindex, tagMenu, tagNoframes}
- Source Edit
SingleTags = {tagArea, tagBase, tagBasefont, tagBr, tagCol, tagFrame, tagHr, tagImg, tagIsindex, tagLink, tagMeta, tagParam, tagWbr}
- Source Edit
Procs
proc htmlTag(n: XmlNode): HtmlTag {...}{.raises: [], tags: [].}
- Gets n's tag as a HtmlTag. Source Edit
proc htmlTag(s: string): HtmlTag {...}{.raises: [], tags: [].}
- Converts s to a HtmlTag. If s is not an HTML tag, tagUnknown is returned. Source Edit
proc runeToEntity(rune: Rune): string {...}{.raises: [], tags: [].}
-
Converts a Rune to its numeric HTML entity equivalent.
Example:
import unicode doAssert runeToEntity(Rune(0)) == "" doAssert runeToEntity(Rune(-1)) == "" doAssert runeToEntity("Ü".runeAt(0)) == "#220" doAssert runeToEntity("∈".runeAt(0)) == "#8712"
Source Edit proc entityToRune(entity: string): Rune {...}{.raises: [], tags: [].}
-
Converts an HTML entity name like &Uuml; or values like &#220; or &#x000DC; to its UTF-8 equivalent. Rune(0) is returned if the entity name is unknown.
Example:
import unicode doAssert entityToRune("") == Rune(0) doAssert entityToRune("a") == Rune(0) doAssert entityToRune("gt") == ">".runeAt(0) doAssert entityToRune("Uuml") == "Ü".runeAt(0) doAssert entityToRune("quest") == "?".runeAt(0) doAssert entityToRune("#x0003F") == "?".runeAt(0)
Source Edit proc entityToUtf8(entity: string): string {...}{.raises: [], tags: [].}
-
Converts an HTML entity name like &Uuml; or values like &#220; or &#x000DC; to its UTF-8 equivalent. "" is returned if the entity name is unknown. The HTML parser already converts entities to UTF-8.
Example:
const sigma = "Σ" doAssert entityToUtf8("") == "" doAssert entityToUtf8("a") == "" doAssert entityToUtf8("gt") == ">" doAssert entityToUtf8("Uuml") == "Ü" doAssert entityToUtf8("quest") == "?" doAssert entityToUtf8("#63") == "?" doAssert entityToUtf8("Sigma") == sigma doAssert entityToUtf8("#931") == sigma doAssert entityToUtf8("#0931") == sigma doAssert entityToUtf8("#x3A3") == sigma doAssert entityToUtf8("#x03A3") == sigma doAssert entityToUtf8("#x3a3") == sigma doAssert entityToUtf8("#X3a3") == sigma
Source Edit proc parseHtml(s: Stream; filename: string; errors: var seq[string]): XmlNode {...}{. raises: [IOError, OSError, ValueError, Exception], tags: [ReadIOEffect, RootEffect, WriteIOEffect].}
- Parses the HTML from stream s and returns a XmlNode. Every parsing error that occurs is added to the errors sequence. Source Edit
proc parseHtml(s: Stream): XmlNode {...}{.raises: [IOError, OSError, ValueError, Exception], tags: [ReadIOEffect, RootEffect, WriteIOEffect].}
- Parses the HTML from stream s and returns a XmlNode. All parsing errors are ignored. Source Edit
proc parseHtml(html: string): XmlNode {...}{.raises: [IOError, OSError, ValueError, Exception], tags: [ReadIOEffect, RootEffect, WriteIOEffect].}
- Parses the HTML from string html and returns a XmlNode. All parsing errors are ignored. Source Edit
proc loadHtml(path: string; errors: var seq[string]): XmlNode {...}{. raises: [IOError, OSError, ValueError, Exception], tags: [ReadIOEffect, RootEffect, WriteIOEffect].}
- Loads and parses HTML from the file specified by path, and returns a XmlNode. Every parsing error that occurs is added to the errors sequence. Source Edit
proc loadHtml(path: string): XmlNode {...}{.raises: [IOError, OSError, ValueError, Exception], tags: [ReadIOEffect, RootEffect, WriteIOEffect].}
- Loads and parses HTML from file specified by path, and returns a XmlNode. All parsing errors are ignored. Source Edit