处理AI胡乱生成的乱摊子

This commit is contained in:
2025-09-07 20:31:38 +08:00
parent c377a0784d
commit ba513e0827
202 changed files with 11751 additions and 67183 deletions

View File

@@ -104,7 +104,7 @@ tokenization, and tokenization and tree construction stages of the WHATWG HTML
parsing specification respectively. While the tokenizer parses and normalizes
individual HTML tokens, only the parser constructs the DOM tree from the
tokenized HTML, as described in the tree construction stage of the
specification, dynamically modifying or extending the docuemnt's DOM tree.
specification, dynamically modifying or extending the document's DOM tree.
If your use case requires semantically well-formed HTML documents, as defined by
the WHATWG specification, the parser should be used rather than the tokenizer.

View File

@@ -194,9 +194,8 @@ func render1(w writer, n *Node) error {
}
}
// Render any child nodes.
switch n.Data {
case "iframe", "noembed", "noframes", "noscript", "plaintext", "script", "style", "xmp":
// Render any child nodes
if childTextNodesAreLiteral(n) {
for c := n.FirstChild; c != nil; c = c.NextSibling {
if c.Type == TextNode {
if _, err := w.WriteString(c.Data); err != nil {
@@ -213,7 +212,7 @@ func render1(w writer, n *Node) error {
// last element in the file, with no closing tag.
return plaintextAbort
}
default:
} else {
for c := n.FirstChild; c != nil; c = c.NextSibling {
if err := render1(w, c); err != nil {
return err
@@ -231,6 +230,27 @@ func render1(w writer, n *Node) error {
return w.WriteByte('>')
}
func childTextNodesAreLiteral(n *Node) bool {
// Per WHATWG HTML 13.3, if the parent of the current node is a style,
// script, xmp, iframe, noembed, noframes, or plaintext element, and the
// current node is a text node, append the value of the node's data
// literally. The specification is not explicit about it, but we only
// enforce this if we are in the HTML namespace (i.e. when the namespace is
// "").
// NOTE: we also always include noscript elements, although the
// specification states that they should only be rendered as such if
// scripting is enabled for the node (which is not something we track).
if n.Namespace != "" {
return false
}
switch n.Data {
case "iframe", "noembed", "noframes", "noscript", "plaintext", "script", "style", "xmp":
return true
default:
return false
}
}
// writeQuoted writes s to w surrounded by quotes. Normally it will use double
// quotes, but if s contains a double quote, it will use single quotes.
// It is used for writing the identifiers in a doctype declaration.

View File

@@ -910,10 +910,16 @@ func (z *Tokenizer) readTagAttrKey() {
return
}
switch c {
case ' ', '\n', '\r', '\t', '\f', '/':
z.pendingAttr[0].end = z.raw.end - 1
return
case '=', '>':
case '=':
if z.pendingAttr[0].start+1 == z.raw.end {
// WHATWG 13.2.5.32, if we see an equals sign before the attribute name
// begins, we treat it as a character in the attribute name and continue.
continue
}
fallthrough
case ' ', '\n', '\r', '\t', '\f', '/', '>':
// WHATWG 13.2.5.33 Attribute name state
// We need to reconsume the char in the after attribute name state to support the / character
z.raw.end--
z.pendingAttr[0].end = z.raw.end
return
@@ -932,6 +938,11 @@ func (z *Tokenizer) readTagAttrVal() {
if z.err != nil {
return
}
if c == '/' {
// WHATWG 13.2.5.34 After attribute name state
// U+002F SOLIDUS (/) - Switch to the self-closing start tag state.
return
}
if c != '=' {
z.raw.end--
return