htmlparser

NOTE: The behaviour might change in future versions as it is not clear what "wild HTML the real world uses" really implies.

It can be used to parse a wild HTML document and output it as a valid XHTML document (well, if you are lucky):

echo loadHtml("mydirty.html")

Every tag in the resulting tree is in lower case.

Note: The resulting XmlNode already uses the clientData field, so it cannot be used by clients of this library.

Example: Transforming hyperlinks

This code demonstrates how you can iterate over all the tags in an HTML file and write back the modified version. In this case we look for hyperlinks ending with the extension .rst and convert them to .html.

import htmlparser
import xmltree  # To use '$' for XmlNode
import strtabs  # To access XmlAttributes
import os       # To use splitFile
import strutils # To use cmpIgnoreCase

proc transformHyperlinks() =
  ## Rewrites every hyperlink in "input.html" whose target ends in `.rst`
  ## so that it ends in `.html` instead, then writes the modified document
  ## to "output.html".
  let html = loadHtml("input.html")

  for a in html.findAll("a"):
    # `attrs` is nil for an element that was parsed without any attributes
    # (for example a bare `<a>`), so check for nil before querying the table.
    if a.attrs.isNil or not a.attrs.hasKey("href"):
      continue
    let (dir, filename, ext) = splitFile(a.attrs["href"])
    # The extension is compared case-insensitively, so `.RST` matches too.
    if cmpIgnoreCase(ext, ".rst") == 0:
      a.attrs["href"] = dir / filename & ".html"

  writeFile("output.html", $html)

Types

HtmlTag = enum
  tagUnknown, tagA, tagAbbr, tagAcronym, tagAddress, tagApplet, tagArea,
  tagArticle, tagAside, tagAudio, tagB, tagBase, tagBdi, tagBdo, tagBasefont,
  tagBig, tagBlockquote, tagBody, tagBr, tagButton, tagCanvas, tagCaption,
  tagCenter, tagCite, tagCode, tagCol, tagColgroup, tagCommand, tagDatalist,
  tagDd, tagDel, tagDetails, tagDfn, tagDialog, tagDiv, tagDir, tagDl, tagDt,
  tagEm, tagEmbed, tagFieldset, tagFigcaption, tagFigure, tagFont, tagFooter,
  tagForm, tagFrame, tagFrameset, tagH1, tagH2, tagH3, tagH4, tagH5, tagH6,
  tagHead, tagHeader, tagHgroup, tagHtml, tagHr, tagI, tagIframe, tagImg,
  tagInput, tagIns, tagIsindex, tagKbd, tagKeygen, tagLabel, tagLegend, tagLi,
  tagLink, tagMap, tagMark, tagMenu, tagMeta, tagMeter, tagNav, tagNobr,
  tagNoframes, tagNoscript, tagObject, tagOl, tagOptgroup, tagOption, tagOutput,
  tagP, tagParam, tagPre, tagProgress, tagQ, tagRp, tagRt, tagRuby, tagS,
  tagSamp, tagScript, tagSection, tagSelect, tagSmall, tagSource, tagSpan,
  tagStrike, tagStrong, tagStyle, tagSub, tagSummary, tagSup, tagTable,
  tagTbody, tagTd, tagTextarea, tagTfoot, tagTh, tagThead, tagTime, tagTitle,
  tagTr, tagTrack, tagTt, tagU, tagUl, tagVar, tagVideo, tagWbr
List of all supported HTML tags; the order will always be alphabetical.   Source Edit

Consts

tagToStr = ["a", "abbr", "acronym", "address", "applet", "area", "article",
            "aside", "audio", "b", "base", "basefont", "bdi", "bdo", "big",
            "blockquote", "body", "br", "button", "canvas", "caption", "center",
            "cite", "code", "col", "colgroup", "command", "datalist", "dd",
            "del", "details", "dfn", "dialog", "div", "dir", "dl", "dt", "em",
            "embed", "fieldset", "figcaption", "figure", "font", "footer",
            "form", "frame", "frameset", "h1", "h2", "h3", "h4", "h5", "h6",
            "head", "header", "hgroup", "html", "hr", "i", "iframe", "img",
            "input", "ins", "isindex", "kbd", "keygen", "label", "legend", "li",
            "link", "map", "mark", "menu", "meta", "meter", "nav", "nobr",
            "noframes", "noscript", "object", "ol", "optgroup", "option",
            "output", "p", "param", "pre", "progress", "q", "rp", "rt", "ruby",
            "s", "samp", "script", "section", "select", "small", "source",
            "span", "strike", "strong", "style", "sub", "summary", "sup",
            "table", "tbody", "td", "textarea", "tfoot", "th", "thead", "time",
            "title", "tr", "track", "tt", "u", "ul", "var", "video", "wbr"]
  Source Edit
InlineTags = {tagA, tagAbbr, tagAcronym, tagApplet, tagB, tagBasefont, tagBdo,
              tagBig, tagBr, tagButton, tagCite, tagCode, tagDel, tagDfn, tagEm,
              tagFont, tagI, tagImg, tagIns, tagInput, tagIframe, tagKbd,
              tagLabel, tagMap, tagObject, tagQ, tagSamp, tagScript, tagSelect,
              tagSmall, tagSpan, tagStrong, tagSub, tagSup, tagTextarea, tagTt,
              tagVar, tagApplet, tagBasefont, tagFont, tagIframe, tagU, tagS,
              tagStrike, tagWbr}
  Source Edit
BlockTags = {tagAddress, tagBlockquote, tagCenter, tagDel, tagDir, tagDiv,
             tagDl, tagFieldset, tagForm, tagH1, tagH2, tagH3, tagH4, tagH5,
             tagH6, tagHr, tagIns, tagIsindex, tagMenu, tagNoframes,
             tagNoscript, tagOl, tagP, tagPre, tagTable, tagUl, tagCenter,
             tagDir, tagIsindex, tagMenu, tagNoframes}
  Source Edit
SingleTags = {tagArea, tagBase, tagBasefont, tagBr, tagCol, tagFrame, tagHr,
              tagImg, tagIsindex, tagLink, tagMeta, tagParam, tagWbr}
  Source Edit

Procs

proc htmlTag(n: XmlNode): HtmlTag {...}{.raises: [], tags: [].}
Gets n's tag as a HtmlTag.   Source Edit
proc htmlTag(s: string): HtmlTag {...}{.raises: [], tags: [].}
Converts s to a HtmlTag. If s is no HTML tag, tagUnknown is returned.   Source Edit
proc runeToEntity(rune: Rune): string {...}{.raises: [], tags: [].}
Converts a Rune to its numeric HTML entity equivalent.

Example:

import unicode
doAssert runeToEntity(Rune(0)) == ""
doAssert runeToEntity(Rune(-1)) == ""
doAssert runeToEntity("Ü".runeAt(0)) == "#220"
doAssert runeToEntity("∈".runeAt(0)) == "#8712"
  Source Edit
proc entityToRune(entity: string): Rune {...}{.raises: [], tags: [].}
Converts an HTML entity name like &Uuml; or values like &#220; or &#x000DC; to its UTF-8 equivalent. Rune(0) is returned if the entity name is unknown.

Example:

import unicode
doAssert entityToRune("") == Rune(0)
doAssert entityToRune("a") == Rune(0)
doAssert entityToRune("gt") == ">".runeAt(0)
doAssert entityToRune("Uuml") == "Ü".runeAt(0)
doAssert entityToRune("quest") == "?".runeAt(0)
doAssert entityToRune("#x0003F") == "?".runeAt(0)
  Source Edit
proc entityToUtf8(entity: string): string {...}{.raises: [], tags: [].}
Converts an HTML entity name like &Uuml; or values like &#220; or &#x000DC; to its UTF-8 equivalent. "" is returned if the entity name is unknown. The HTML parser already converts entities to UTF-8.

Example:

const sigma = "Σ"
doAssert entityToUtf8("") == ""
doAssert entityToUtf8("a") == ""
doAssert entityToUtf8("gt") == ">"
doAssert entityToUtf8("Uuml") == "Ü"
doAssert entityToUtf8("quest") == "?"
doAssert entityToUtf8("#63") == "?"
doAssert entityToUtf8("Sigma") == sigma
doAssert entityToUtf8("#931") == sigma
doAssert entityToUtf8("#0931") == sigma
doAssert entityToUtf8("#x3A3") == sigma
doAssert entityToUtf8("#x03A3") == sigma
doAssert entityToUtf8("#x3a3") == sigma
doAssert entityToUtf8("#X3a3") == sigma
  Source Edit
proc parseHtml(s: Stream; filename: string; errors: var seq[string]): XmlNode {...}{.
    raises: [IOError, OSError, ValueError, Exception],
    tags: [ReadIOEffect, RootEffect, WriteIOEffect].}
Parses the HTML from stream s and returns a XmlNode. Every parsing error that occurs is added to the errors sequence.   Source Edit
proc parseHtml(s: Stream): XmlNode {...}{.raises: [IOError, OSError, ValueError,
    Exception], tags: [ReadIOEffect, RootEffect, WriteIOEffect].}
Parses the HTML from stream s and returns a XmlNode. All parsing errors are ignored.   Source Edit
proc parseHtml(html: string): XmlNode {...}{.raises: [IOError, OSError, ValueError,
    Exception], tags: [ReadIOEffect, RootEffect, WriteIOEffect].}
Parses the HTML from string html and returns a XmlNode. All parsing errors are ignored.   Source Edit
proc loadHtml(path: string; errors: var seq[string]): XmlNode {...}{.
    raises: [IOError, OSError, ValueError, Exception],
    tags: [ReadIOEffect, RootEffect, WriteIOEffect].}
Loads and parses HTML from file specified by path, and returns a XmlNode. Every parsing error that occurs is added to the errors sequence.   Source Edit
proc loadHtml(path: string): XmlNode {...}{.raises: [IOError, OSError, ValueError,
    Exception], tags: [ReadIOEffect, RootEffect, WriteIOEffect].}
Loads and parses HTML from file specified by path, and returns a XmlNode. All parsing errors are ignored.   Source Edit