|
|||||||||||
PREV CLASS NEXT CLASS | FRAMES NO FRAMES | ||||||||||
SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD |
java.lang.Object | +--javax.swing.text.html.parser.Parser
Untamed: A simple DTD-driven HTML parser. The parser reads an HTML file from an InputStream and calls various methods (which should be overridden in a subclass) when tags and data are encountered.
Unfortunately there are many badly implemented HTML parsers out there, and as a result there are many badly formatted HTML files. This parser attempts to parse most HTML files. This means that the implementation sometimes deviates from the SGML specification in favor of HTML.
The parser treats \r and \r\n as \n. Newlines after start tags and before end tags are ignored just as specified in the SGML/HTML specification.
The HTML spec does not specify how spaces are to be coalesced very well. Specifically, the following scenarios are not discussed (note that a space should be used here, but I am using &nbsp; to force the space to be displayed):
'<b>blah <i> <strike> foo' which can be treated as: '<b>blah <i><strike>foo'
as well as: '<p><a href="xx"> <em>Using</em></a></p>' which appears to be treated as: '<p><a href="xx"><em>Using</em></a></p>'
If strict
is false, when a tag that breaks flow,
(TagElement.breaksFlow
) or trailing whitespace is
encountered, all whitespace will be ignored until a non-whitespace
character is encountered. This appears to give behavior closer to
the popular browsers.
DTD
,
TagElement
,
SimpleAttributeSet
Field Summary | |
private SimpleAttributeSet |
attributes
|
private char[] |
buf
|
private int |
ch
|
private int |
crCount
Number of \r's encountered. |
private int |
crlfCount
Number of \r\n's encountered. |
private int |
currentBlockStartPos
The start position of the current block. |
private int |
currentPosition
|
protected DTD |
dtd
|
private boolean |
ignoreSpace
The HTML spec does not specify how spaces are coalesced very well. |
private Reader |
in
|
private TagElement |
last
|
private int |
lastBlockStartPos
Start position of the last block. |
private TagElement |
lastFormSent
|
private int |
len
|
private int |
lfCount
Number of \n's encountered. |
private int |
ln
|
private int |
pos
|
private Element |
recent
|
private boolean |
seenBody
|
private boolean |
seenHead
|
private boolean |
seenHtml
|
private boolean |
skipTag
|
private boolean |
space
|
private TagStack |
stack
|
private char[] |
str
|
protected boolean |
strict
This flag determines whether or not the Parser will be strict in enforcing SGML compatibility. |
private int |
strpos
|
private char[] |
text
|
private int |
textpos
|
Fields inherited from interface javax.swing.text.html.parser.DTDConstants |
ANY, CDATA, CONREF, CURRENT, DEFAULT, EMPTY, ENDTAG, ENTITIES, ENTITY, FIXED, GENERAL, ID, IDREF, IDREFS, IMPLIED, MD, MODEL, MS, NAME, NAMES, NMTOKEN, NMTOKENS, NOTATION, NUMBER, NUMBERS, NUTOKEN, NUTOKENS, PARAMETER, PI, PUBLIC, RCDATA, REQUIRED, SDATA, STARTTAG, SYSTEM |
Constructor Summary | |
Parser(DTD dtd)
Enabled: |
Method Summary | |
(package private) void |
addString(int c)
Add a char to the string buffer. |
protected void |
endTag(boolean omitted)
Handle an end tag. |
protected void |
error(String err)
|
protected void |
error(String err,
String arg1)
|
protected void |
error(String err,
String arg1,
String arg2)
|
protected void |
error(String err,
String arg1,
String arg2,
String arg3)
Invoke the error handler. |
(package private) void |
errorContext()
Error context. |
protected void |
flushAttributes()
|
protected SimpleAttributeSet |
getAttributes()
|
(package private) int |
getBlockStartPosition()
Returns the start position of the current block. |
(package private) char[] |
getChars(int pos)
|
(package private) char[] |
getChars(int pos,
int endPos)
|
protected int |
getCurrentLine()
|
protected int |
getCurrentPos()
|
(package private) String |
getEndOfLineString()
Returns the end of line string. |
(package private) String |
getString(int pos)
Get the string that's been accumulated. |
protected void |
handleComment(char[] text)
Called when an HTML comment is encountered. |
protected void |
handleEmptyTag(TagElement tag)
Called when an empty tag is encountered. |
protected void |
handleEndTag(TagElement tag)
Called when an end tag is encountered. |
protected void |
handleEOFInComment()
|
protected void |
handleError(int ln,
String msg)
An error has occurred. |
protected void |
handleStartTag(TagElement tag)
Called when a start tag is encountered. |
protected void |
handleText(char[] text)
Called when PCDATA is encountered. |
(package private) void |
handleText(TagElement tag)
Output text. |
protected void |
handleTitle(char[] text)
Called when an HTML title tag is encountered. |
(package private) boolean |
ignoreElement(Element elem)
|
(package private) boolean |
legalElementContext(Element elem)
Create a legal context for an element. |
(package private) void |
legalTagContext(TagElement tag)
Create a legal context for a tag. |
protected TagElement |
makeTag(Element elem)
|
protected TagElement |
makeTag(Element elem,
boolean fictional)
Makes a TagElement. |
protected void |
markFirstTime(Element elem)
Marks the first time a tag has been seen in a document. |
void |
parse(Reader in)
Enabled: Parse an HTML stream, given a DTD. |
(package private) void |
parseAttributeSpecificationList(Element elem)
Parse attribute specification List. |
(package private) String |
parseAttributeValue(boolean lower)
Parse attribute value. |
(package private) void |
parseComment()
Parse a comment. |
(package private) void |
parseContent()
Parse Content. |
String |
parseDTDMarkup()
Enabled: Parses the Document Type Declaration markup declaration. |
private char[] |
parseEntityReference()
Parse an entity reference. |
(package private) boolean |
parseIdentifier(boolean lower)
Parse identifier. |
(package private) void |
parseInvalidTag()
Parse an invalid tag. |
(package private) void |
parseLiteral(boolean replace)
Parse literal content. |
protected boolean |
parseMarkupDeclarations(StringBuffer strBuff)
Parse markup declarations. |
(package private) void |
parseTag()
Parse a start or end tag. |
private int |
readCh()
|
(package private) void |
resetStrBuffer()
|
(package private) void |
skipSpace()
Skip space. |
protected void |
startTag(TagElement tag)
Handle a start tag. |
(package private) int |
strIndexOf(char target)
|
Methods inherited from class java.lang.Object |
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait |
Field Detail |
private char[] text
private int textpos
private TagElement last
private boolean space
private char[] str
private int strpos
protected DTD dtd
private int ch
private int ln
private Reader in
private Element recent
private TagStack stack
private boolean skipTag
private TagElement lastFormSent
private SimpleAttributeSet attributes
private boolean seenHtml
private boolean seenHead
private boolean seenBody
private boolean ignoreSpace
The problematic scenarios are: '<b>blah <i> <strike> foo' which can be treated as: '<b>blah <i><strike>foo' as well as: '<p><a href="xx"> <em>Using</em></a></p>' which appears to be treated as: '<p><a href="xx"><em>Using</em></a></p>'
When a tag that breaks flow, or trailing whitespace, is encountered, ignoreSpace is set to true. From then on, all whitespace will be ignored. ignoreSpace will be set back to false the first time a non-whitespace character is encountered. This appears to give behavior closer to the popular browsers.
protected boolean strict
private int crlfCount
private int crCount
private int lfCount
private int currentBlockStartPos
private int lastBlockStartPos
private char[] buf
private int pos
private int len
private int currentPosition
Constructor Detail |
public Parser(DTD dtd)
Method Detail |
protected int getCurrentLine()
int getBlockStartPosition()
protected TagElement makeTag(Element elem, boolean fictional)
protected TagElement makeTag(Element elem)
protected SimpleAttributeSet getAttributes()
protected void flushAttributes()
protected void handleText(char[] text)
protected void handleTitle(char[] text)
protected void handleComment(char[] text)
protected void handleEOFInComment()
protected void handleEmptyTag(TagElement tag) throws ChangedCharSetException
ChangedCharSetException
protected void handleStartTag(TagElement tag)
protected void handleEndTag(TagElement tag)
protected void handleError(int ln, String msg)
void handleText(TagElement tag)
protected void error(String err, String arg1, String arg2, String arg3)
protected void error(String err, String arg1, String arg2)
protected void error(String err, String arg1)
protected void error(String err)
protected void startTag(TagElement tag) throws ChangedCharSetException
ChangedCharSetException
protected void endTag(boolean omitted)
boolean ignoreElement(Element elem)
protected void markFirstTime(Element elem)
boolean legalElementContext(Element elem) throws ChangedCharSetException
ChangedCharSetException
void legalTagContext(TagElement tag) throws ChangedCharSetException
ChangedCharSetException
void errorContext() throws ChangedCharSetException
ChangedCharSetException
void addString(int c)
String getString(int pos)
char[] getChars(int pos)
char[] getChars(int pos, int endPos)
void resetStrBuffer()
int strIndexOf(char target)
void skipSpace() throws IOException
IOException
boolean parseIdentifier(boolean lower) throws IOException
IOException
private char[] parseEntityReference() throws IOException
IOException
void parseComment() throws IOException
IOException
void parseLiteral(boolean replace) throws IOException
IOException
String parseAttributeValue(boolean lower) throws IOException
IOException
void parseAttributeSpecificationList(Element elem) throws IOException
IOException
public String parseDTDMarkup() throws IOException
IOException
protected boolean parseMarkupDeclarations(StringBuffer strBuff) throws IOException
IOException
void parseInvalidTag() throws IOException
IOException
void parseTag() throws IOException
IOException
void parseContent() throws IOException
IOException
String getEndOfLineString()
public void parse(Reader in) throws IOException
IOException
private final int readCh() throws IOException
IOException
protected int getCurrentPos()
|
|||||||||||
PREV CLASS NEXT CLASS | FRAMES NO FRAMES | ||||||||||
SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD |