Class LagartoParser

java.lang.Object
jodd.lagarto.LagartoParser

public class LagartoParser extends Object
HTML/XML content parser/tokenizer that uses TagVisitor for callbacks. Follows the HTML5 tokenization specification, as described by WHATWG. Differences from the specification:
  • text is emitted as a block of text, and not character by character.
  • tag name case (and the letter case of other entities) is not changed, but case-sensitive information exists for matching.
  • the whole tokenization process is implemented here, without going into tree building. This applies to switching to the RAWTEXT state.
  • script tag is emitted separately
  • conditional comments added
  • xml states and callbacks added
  • Field Details

    • visitor

      protected TagVisitor visitor
    • tag

      protected ParsedTag tag
    • doctype

      protected ParsedDoctype doctype
    • in

      protected final CharsInput in
    • config

      protected final LagartoParserConfig config
    • parsing

      protected boolean parsing
    • DATA_STATE

      protected State DATA_STATE
      Data state.
    • TAG_OPEN

      protected State TAG_OPEN
    • END_TAG_OPEN

      protected State END_TAG_OPEN
    • TAG_NAME

      protected State TAG_NAME
    • BEFORE_ATTRIBUTE_NAME

      protected State BEFORE_ATTRIBUTE_NAME
    • ATTRIBUTE_NAME

      protected State ATTRIBUTE_NAME
    • AFTER_ATTRIBUTE_NAME

      protected State AFTER_ATTRIBUTE_NAME
    • BEFORE_ATTRIBUTE_VALUE

      protected State BEFORE_ATTRIBUTE_VALUE
    • ATTR_VALUE_UNQUOTED

      protected State ATTR_VALUE_UNQUOTED
    • ATTR_VALUE_SINGLE_QUOTED

      protected State ATTR_VALUE_SINGLE_QUOTED
    • ATTR_VALUE_DOUBLE_QUOTED

      protected State ATTR_VALUE_DOUBLE_QUOTED
    • AFTER_ATTRIBUTE_VALUE_QUOTED

      protected State AFTER_ATTRIBUTE_VALUE_QUOTED
    • SELF_CLOSING_START_TAG

      protected State SELF_CLOSING_START_TAG
    • BOGUS_COMMENT

      protected State BOGUS_COMMENT
    • MARKUP_DECLARATION_OPEN

      protected State MARKUP_DECLARATION_OPEN
    • rawTextStart

      protected int rawTextStart
    • rawTextEnd

      protected int rawTextEnd
    • rawTagName

      protected char[] rawTagName
    • RAWTEXT

      protected State RAWTEXT
    • RAWTEXT_LESS_THAN_SIGN

      protected State RAWTEXT_LESS_THAN_SIGN
    • RAWTEXT_END_TAG_OPEN

      protected State RAWTEXT_END_TAG_OPEN
    • RAWTEXT_END_TAG_NAME

      protected State RAWTEXT_END_TAG_NAME
    • rcdataTagStart

      protected int rcdataTagStart
    • rcdataTagName

      protected char[] rcdataTagName
    • RCDATA

      protected State RCDATA
    • RCDATA_LESS_THAN_SIGN

      protected State RCDATA_LESS_THAN_SIGN
    • RCDATA_END_TAG_OPEN

      protected State RCDATA_END_TAG_OPEN
    • RCDATA_END_TAG_NAME

      protected State RCDATA_END_TAG_NAME
    • commentStart

      protected int commentStart
    • COMMENT_START

      protected State COMMENT_START
    • COMMENT_START_DASH

      protected State COMMENT_START_DASH
    • COMMENT

      protected State COMMENT
    • COMMENT_END_DASH

      protected State COMMENT_END_DASH
    • COMMENT_END

      protected State COMMENT_END
    • COMMENT_END_BANG

      protected State COMMENT_END_BANG
    • DOCTYPE

      protected State DOCTYPE
    • BEFORE_DOCTYPE_NAME

      protected State BEFORE_DOCTYPE_NAME
    • DOCTYPE_NAME

      protected State DOCTYPE_NAME
    • AFTER_DOCUMENT_NAME

      protected State AFTER_DOCUMENT_NAME
    • doctypeIdNameStart

      protected int doctypeIdNameStart
    • AFTER_DOCTYPE_PUBLIC_KEYWORD

      protected State AFTER_DOCTYPE_PUBLIC_KEYWORD
    • BEFORE_DOCTYPE_PUBLIC_IDENTIFIER

      protected State BEFORE_DOCTYPE_PUBLIC_IDENTIFIER
    • DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED

      protected State DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED
    • DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED

      protected State DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED
    • AFTER_DOCTYPE_PUBLIC_IDENTIFIER

      protected State AFTER_DOCTYPE_PUBLIC_IDENTIFIER
    • BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS

      protected State BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS
    • BOGUS_DOCTYPE

      protected State BOGUS_DOCTYPE
    • AFTER_DOCTYPE_SYSTEM_KEYWORD

      protected State AFTER_DOCTYPE_SYSTEM_KEYWORD
    • BEFORE_DOCTYPE_SYSTEM_IDENTIFIER

      protected State BEFORE_DOCTYPE_SYSTEM_IDENTIFIER
    • DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED

      protected State DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED
    • DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED

      protected State DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED
    • AFTER_DOCTYPE_SYSTEM_IDENTIFIER

      protected State AFTER_DOCTYPE_SYSTEM_IDENTIFIER
    • scriptStartNdx

      protected int scriptStartNdx
    • scriptEndNdx

      protected int scriptEndNdx
    • scriptEndTagName

      protected int scriptEndTagName
    • SCRIPT_DATA

      protected State SCRIPT_DATA
    • SCRIPT_DATA_LESS_THAN_SIGN

      protected State SCRIPT_DATA_LESS_THAN_SIGN
    • SCRIPT_DATA_END_TAG_OPEN

      protected State SCRIPT_DATA_END_TAG_OPEN
    • SCRIPT_DATA_END_TAG_NAME

      protected State SCRIPT_DATA_END_TAG_NAME
    • scriptEscape

      protected LagartoParser.ScriptEscape scriptEscape
    • xmlDeclaration

      protected LagartoParser.XmlDeclaration xmlDeclaration
    • text

      protected char[] text
    • textLen

      protected int textLen
    • attrStartNdx

      protected int attrStartNdx
    • attrEndNdx

      protected int attrEndNdx
    • conditionalCommentStarted

      private boolean conditionalCommentStarted
    • state

      protected State state
    • TAG_WHITESPACES

      private static final char[] TAG_WHITESPACES
    • TAG_WHITESPACES_OR_END

      private static final char[] TAG_WHITESPACES_OR_END
    • CONTINUE_CHARS

      private static final char[] CONTINUE_CHARS
    • ATTR_INVALID_1

      private static final char[] ATTR_INVALID_1
    • ATTR_INVALID_2

      private static final char[] ATTR_INVALID_2
    • ATTR_INVALID_3

      private static final char[] ATTR_INVALID_3
    • ATTR_INVALID_4

      private static final char[] ATTR_INVALID_4
    • COMMENT_DASH

      private static final char[] COMMENT_DASH
    • T_DOCTYPE

      private static final char[] T_DOCTYPE
    • T_SCRIPT

      private static final char[] T_SCRIPT
    • T_XMP

      private static final char[] T_XMP
    • T_STYLE

      private static final char[] T_STYLE
    • T_IFRAME

      private static final char[] T_IFRAME
    • T_NOFRAMES

      private static final char[] T_NOFRAMES
    • T_NOEMBED

      private static final char[] T_NOEMBED
    • T_NOSCRIPT

      private static final char[] T_NOSCRIPT
    • T_TEXTAREA

      private static final char[] T_TEXTAREA
    • T_TITLE

      private static final char[] T_TITLE
    • A_PUBLIC

      private static final char[] A_PUBLIC
    • A_SYSTEM

      private static final char[] A_SYSTEM
    • CDATA

      private static final char[] CDATA
    • CDATA_END

      private static final char[] CDATA_END
    • XML

      private static final char[] XML
    • XML_VERSION

      private static final char[] XML_VERSION
    • XML_ENCODING

      private static final char[] XML_ENCODING
    • XML_STANDALONE

      private static final char[] XML_STANDALONE
    • CC_IF

      private static final char[] CC_IF
    • CC_ENDIF

      private static final char[] CC_ENDIF
    • CC_ENDIF2

      private static final char[] CC_ENDIF2
    • CC_END

      private static final char[] CC_END
    • RAWTEXT_TAGS

      private static final char[][] RAWTEXT_TAGS
    • RCDATA_TAGS

      private static final char[][] RCDATA_TAGS
    • REPLACEMENT_CHAR

      private static final char REPLACEMENT_CHAR
      See Also:
    • INVALID_CHARS

      private static final char[] INVALID_CHARS
    • _ENDIF

      private static final CharSequence _ENDIF
  • Constructor Details

    • LagartoParser

      public LagartoParser(LagartoParserConfig parserConfig, char[] input)
      Creates a parser on a char array.
    • LagartoParser

      public LagartoParser(char[] input)
      Creates a parser on a char array.
    • LagartoParser

      public LagartoParser(LagartoParserConfig parserConfig, CharSequence input)
      Creates a parser on a char sequence.
    • LagartoParser

      public LagartoParser(CharSequence input)
      Creates a parser on a char sequence.
  • Method Details

    • initialize

      protected void initialize()
      Initializes parser.
    • getConfig

      public LagartoParserConfig getConfig()
      Returns configuration of the parser.
    • configure

      public LagartoParser configure(Consumer<LagartoParserConfig> configConsumer)
      Configures the parser.
    • parse

      public void parse(TagVisitor visitor)
      Parses content and emits events to the provided TagVisitor.
    • consumeCharacterReference

      protected void consumeCharacterReference(char allowedChar)
    • consumeCharacterReference

      protected void consumeCharacterReference()
    • _consumeCharacterReference

      private void _consumeCharacterReference()
    • _consumeAttrCharacterReference

      private void _consumeAttrCharacterReference()
    • _consumeNumber

      private void _consumeNumber(int unconsumeNdx)
    • ensureCapacity

      private void ensureCapacity()
    • ensureCapacity

      private void ensureCapacity(int growth)
    • textEmitChar

      protected void textEmitChar(char c)
      Emits a character into the local text buffer.
    • textStart

      protected void textStart()
      Resets text buffer.
    • textEmitChars

      protected void textEmitChars(int from, int to)
    • textEmitChars

      protected void textEmitChars(char[] buffer)
    • textWrap

      protected CharSequence textWrap()
    • _addAttribute

      private void _addAttribute()
    • _addAttributeWithValue

      private void _addAttributeWithValue()
    • _addAttribute

      private void _addAttribute(CharSequence attrName, CharSequence attrValue)
    • emitTag

      protected void emitTag()
    • emitComment

      protected void emitComment(int from, int to)
      Emits a comment. Also checks for conditional comments!
    • emitText

      protected void emitText()
      Emits text if there is some content.
    • emitScript

      protected void emitScript(int from, int to)
    • emitDoctype

      protected void emitDoctype()
    • emitXml

      protected void emitXml()
    • emitCData

      protected void emitCData(CharSequence charSequence)
    • errorEOF

      protected void errorEOF()
    • errorInvalidToken

      protected void errorInvalidToken()
    • errorCharReference

      protected void errorCharReference()
    • _error

      protected void _error(String message)
      Prepares error message and reports it to the visitor.
    • isAppropriateTagName

      private boolean isAppropriateTagName(char[] lowerCaseNameToMatch, int from, int to)
    • matchTagName

      private boolean matchTagName(char[] tagNameLowercase)
    • switchTypeToSelfClosing

      private void switchTypeToSelfClosing()