uk.ac.bham.gloss
Class StreamTokenizer

java.lang.Object
  extended by uk.ac.bham.gloss.StreamTokenizer
All Implemented Interfaces:
Locator, Tokenizer

public class StreamTokenizer
extends java.lang.Object
implements Tokenizer

The StreamTokenizer class represents an object that reads a stream of data and converts it into tokens. This stream is parsed using Java's regular expression methods. A typical method call to get a token will list the types of tokens allowed at that point. Since this depends on the current mode, this list of types may vary. If a token is rejected, the input characters that form it may be put back on the input stream; a subsequent call to get a token may extract a different token from the same characters if the types allowed are different. This class also handles comments (starting with ; and continuing to the end of the line) and line continuations (\ immediately followed by newline, though the character \ can be changed). Copyright Richard Kaye 2007-8 for GLOSS, http://gloss.bham.ac.uk Usage permitted according to the GPL. No warranty.


Field Summary
protected  int attrvaltsREI
           
protected static java.lang.String b64digitre
          b64digitre matches a base64 digit
protected  int b64dtsREI
           
protected static java.lang.String b64re
          b64re matches a base64 constant starting with = and ending with = or == for padding if required.
protected  int chardtsREI
           
protected static java.lang.String charre
          charre matches a character constant which is delimited by '...' and contains either a single character or a \c escape combination.
protected  int commentnlREI
           
protected  int cref1REI
           
protected  int cref2REI
           
protected  int eltnametsREI
           
protected  int entrefREI
           
protected  int fpdatatsREI
           
protected static java.lang.String fpre
          fpre matches an (arb precision) fp constant
protected  int hexdatatsREI
           
protected static java.lang.String hexre
          hexre matches an (arb precision) signed hex integer, almost according to OpenMath syntax, except that a leading 0 is required (after any - sign and before the x).
protected  int intdatatsREI
           
protected static java.lang.String intre
          intre matches an (arb precision) signed integer written to base 10, according to syntax defined in OpenMath.
protected  int leadcommREI
           
protected  int leadlabcommREI
           
protected  int leadspcommREI
           
protected  int leadspREI
           
protected static java.lang.String namere
          namere matches a fully qualified element name.
protected  int nlcoREI
           
protected static java.lang.String nsre
          nsre matches a non-space unicode character in UTF-16
protected  int nsreREI
           
protected  int parreftsREI
           
protected  int parvaltsREI
           
protected  int peltvaltsREI
           
protected  int pideftsREI
           
protected static java.lang.String pnamere
          pnamere matches a resolved parameter name.
protected  int puncvaltsREI
           
protected  int slabtabREI
           
protected  int spcommentnlREI
           
protected  int speqspREI
           
protected  int spnlREI
           
protected  int sptabREI
           
protected  int stringdtsREI
           
protected static java.lang.String stringre1
          stringre1 matches a string delimited with "
protected static java.lang.String stringre2
           
protected static java.lang.String tsre
          tsre matches trailing space
protected static java.lang.String ucre
          ucre matches a unicode character in UTF-16
protected  int ucreREI
           
protected static java.lang.String uqnre
          uqnre matches a namespace prefix.
protected  int uridtsREI
           
protected static java.lang.String urire
          urire matches a uri constant which can be almost anything that uses the safe characters a-z A-Z 0-9 $ - _ .
protected static java.lang.String valuere
          valuere matches either stringre1, stringre2 or a string not starting with " or ' and not containing whitespace
 
Constructor Summary
StreamTokenizer(java.lang.String documentname, java.io.InputStreamReader isr, Logger l)
          Constructor: use null for documentname if it is not available.
 
Method Summary
protected  int addRE(java.lang.String re)
          Adds an RE "re" to the library, precompiling it for future use.
 int col()
          Returns the column number of the next character to be read.
protected  void displayState(java.lang.String message)
          Internal method that displays the current state of the Tokenizer.
 java.lang.String doc()
          Returns the document name or URL.
protected  Token getATTR(int depth, int thislinen, int thiscoln)
           
protected  Token getB64(int depth, int thislinen, int thiscoln)
           
protected  Token getCHAR(int depth, int thislinen, int thiscoln)
           
protected  Token getCREF(int depth, int thislinen, int thiscoln)
           
protected  Token getELT(int depth, int thislinen, int thiscoln)
           
protected  Token getEOS(int depth, int thislinen, int thiscoln)
           
protected  Token getEREF(int depth, int thislinen, int thiscoln)
           
protected  Token getFP(int depth, int thislinen, int thiscoln)
           
protected  Token getHEX(int depth, int thislinen, int thiscoln)
           
protected  Token getINT(int depth, int thislinen, int thiscoln)
           
protected  Token getLABEL(int depth, int thislinen, int thiscoln)
           
 char getLineContinuationChar()
          Returns the line continuation char, or '\0' if there is none set.
protected  Token getNS(int depth, int thislinen, int thiscoln)
           
protected  Token getPDEF(int depth, int thislinen, int thiscoln)
           
protected  Token getPELT(int depth, int thislinen, int thiscoln)
           
protected  Token getPI(int depth, int thislinen, int thiscoln)
           
protected  Token getPREF(int depth, int thislinen, int thiscoln)
           
protected  Token getPUNC(java.lang.String tdata, int depth, int thislinen, int thiscoln)
           
protected  java.lang.String getRegex(int reIndx)
          Internal private method that gets a string matching a regex at the beginning of unread input (inbuff, at inindx) returning it (or returning null if no such match).
protected  Token getSTR(int depth, int thislinen, int thiscoln)
           
 Token getToken(java.lang.String accept)
          Default form of getToken in which the "disregardComments" and "disregardWhitespace" parameters are true provided UC is not one of the acceptable types, and false otherwise.
 Token getToken(java.lang.String accept, boolean disregardComments, boolean disregardWhitespace, boolean allowLinecontinuation, boolean allowLabel)
          Gets the next token, where the argument "allowedtypes" is a bitwise OR (|) of Token.type_TYPE values of allowed token types.
protected  Token getUC(int depth, int thislinen, int thiscoln)
           
protected  Token getURI(int depth, int thislinen, int thiscoln)
           
 int lastLineNumber()
          Returns the number of the last line read from the input stream, 0 if no line has been read.
 int line()
          Returns the line number of the next character to be read.
 Logger logger()
          accessor: get the logger object
protected  boolean matches(java.lang.String s, int reIndx)
          Internal private method that tests if a string matches an RE.
protected  boolean notReadEnough(int startindx, boolean atStart, boolean disregardComments, boolean disregardWhitespace, boolean allowLabel)
          Method to determine if we have read in enough data to guarantee a token.
protected  java.lang.String replaceAll(java.lang.String s, int reIndx, java.lang.String replacement)
          Internal private method that performs a replaceAll operation on string s against an re.
protected  java.lang.String replaceFirst(java.lang.String s, int reIndx, java.lang.String replacement)
          Internal private method that performs a replaceFirst operation on string s against an re.
 void setLineContinuationChar(char c)
          Sets the line continuation char, set to '\0' if none is required.
 void setLogger(Logger l)
          set the logger object
protected  java.lang.String trimLeadingSpace(java.lang.String s)
          Returns the string with leading whitespace (tab, space, nl) removed
protected  java.lang.String trimTrailingSpace(java.lang.String s)
          Returns the string with trailing whitespace (tab, space, nl) removed
 void ungetToken()
          Restores the state of this Tokenizer to just before the last getToken operation.
 
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
 

Field Detail

uqnre

protected static final java.lang.String uqnre
uqnre matches a namespace prefix. TO DO: only normal letters are allowed at present; fix this to allow general unicode letters

See Also:
Constant Field Values

namere

protected static final java.lang.String namere
namere matches a fully qualified element name. TO DO: only normal letters are allowed at present; fix this to allow general unicode letters

See Also:
Constant Field Values

tsre

protected static final java.lang.String tsre
tsre matches trailing space

See Also:
Constant Field Values

stringre1

protected static final java.lang.String stringre1
stringre1 matches a string delimited with "

See Also:
Constant Field Values

stringre2

protected static final java.lang.String stringre2
See Also:
Constant Field Values

valuere

protected static final java.lang.String valuere
valuere matches either stringre1, stringre2 or a string not starting with " or ' and not containing whitespace

See Also:
Constant Field Values

ucre

protected static final java.lang.String ucre
ucre matches a unicode character in UTF-16

See Also:
Constant Field Values

nsre

protected static final java.lang.String nsre
nsre matches a non-space unicode character in UTF-16

See Also:
Constant Field Values

charre

protected static final java.lang.String charre
charre matches a character constant which is delimited by '...' and contains either a single character or a \c escape combination. The combinations \', \\, \n, \t, \r are currently defined. Otherwise \c resolves to c.

See Also:
Constant Field Values

fpre

protected static final java.lang.String fpre
fpre matches an (arb precision) fp constant

See Also:
Constant Field Values

hexre

protected static final java.lang.String hexre
hexre matches an (arb precision) signed hex integer, almost according to OpenMath syntax, except that a leading 0 is required (after any - sign and before the x).

See Also:
Constant Field Values

intre

protected static final java.lang.String intre
intre matches an (arb precision) signed integer written to base 10, according to syntax defined in OpenMath. Note that leading + signs are NOT allowed.

See Also:
Constant Field Values

b64digitre

protected static final java.lang.String b64digitre
b64digitre matches a base64 digit

See Also:
Constant Field Values

b64re

protected static final java.lang.String b64re
b64re matches a base64 constant starting with = and ending with = or == for padding if required. The empty base64 constant "=" is also allowed.

See Also:
Constant Field Values

urire

protected static final java.lang.String urire
urire matches a uri constant which can be almost anything that uses the safe characters a-z A-Z 0-9 $ - _ . + ! * \' ( ) , / & ? # : ; = @ or { } only

See Also:
Constant Field Values

pnamere

protected static final java.lang.String pnamere
pnamere matches a resolved parameter name. TO DO: only normal letters are allowed at present

See Also:
Constant Field Values

commentnlREI

protected int commentnlREI

spcommentnlREI

protected int spcommentnlREI

spnlREI

protected int spnlREI

entrefREI

protected int entrefREI

parvaltsREI

protected int parvaltsREI

stringdtsREI

protected int stringdtsREI

nsreREI

protected int nsreREI

ucreREI

protected int ucreREI

chardtsREI

protected int chardtsREI

b64dtsREI

protected int b64dtsREI

uridtsREI

protected int uridtsREI

attrvaltsREI

protected int attrvaltsREI

peltvaltsREI

protected int peltvaltsREI

eltnametsREI

protected int eltnametsREI

fpdatatsREI

protected int fpdatatsREI

hexdatatsREI

protected int hexdatatsREI

intdatatsREI

protected int intdatatsREI

cref1REI

protected int cref1REI

cref2REI

protected int cref2REI

nlcoREI

protected int nlcoREI

slabtabREI

protected int slabtabREI

leadspREI

protected int leadspREI

leadcommREI

protected int leadcommREI

leadspcommREI

protected int leadspcommREI

leadlabcommREI

protected int leadlabcommREI

sptabREI

protected int sptabREI

speqspREI

protected int speqspREI

puncvaltsREI

protected int puncvaltsREI

parreftsREI

protected int parreftsREI

pideftsREI

protected int pideftsREI
Constructor Detail

StreamTokenizer

public StreamTokenizer(java.lang.String documentname,
                       java.io.InputStreamReader isr,
                       Logger l)
Constructor: use null for documentname if it is not available.

Method Detail

logger

public Logger logger()
accessor: get the logger object


setLogger

public void setLogger(Logger l)
set the logger object


addRE

protected int addRE(java.lang.String re)
Adds an RE "re" to the library, precompiling it for future use. Returns its index in the reLibrary vector.


setLineContinuationChar

public void setLineContinuationChar(char c)
Sets the line continuation char, set to '\0' if none is required.


getLineContinuationChar

public char getLineContinuationChar()
Returns the line continuation char, or '\0' if there is none set.


lastLineNumber

public int lastLineNumber()
Returns the number of the last line read from the input stream, 0 if no line has been read.


line

public int line()
Returns the line number of the next character to be read.

Specified by:
line in interface Locator

col

public int col()
Returns the column number of the next character to be read.

Specified by:
col in interface Locator

doc

public java.lang.String doc()
Returns the document name or URL.

Specified by:
doc in interface Locator

ungetToken

public void ungetToken()
                throws GlossException
Restores the state of this Tokenizer to just before the last getToken operation. Only one getToken() can be undone using this method, otherwise this method throws an Exception.

Specified by:
ungetToken in interface Tokenizer
Throws:
GlossException

displayState

protected void displayState(java.lang.String message)
Internal method that displays the current state of the Tokenizer.


trimTrailingSpace

protected java.lang.String trimTrailingSpace(java.lang.String s)
Returns the string with trailing whitespace (tab, space, nl) removed


trimLeadingSpace

protected java.lang.String trimLeadingSpace(java.lang.String s)
Returns the string with leading whitespace (tab, space, nl) removed


notReadEnough

protected boolean notReadEnough(int startindx,
                                boolean atStart,
                                boolean disregardComments,
                                boolean disregardWhitespace,
                                boolean allowLabel)
Method to determine if we have read in enough data to guarantee a token. We read from index startindx in inbuff and, if atStart is true, we assume this is the start of a line.


replaceFirst

protected java.lang.String replaceFirst(java.lang.String s,
                                        int reIndx,
                                        java.lang.String replacement)
Internal private method that performs a replaceFirst operation on string s against an re. The argument reIndx is an index number returned from addRE(re).


replaceAll

protected java.lang.String replaceAll(java.lang.String s,
                                      int reIndx,
                                      java.lang.String replacement)
Internal private method that performs a replaceAll operation on string s against an re. The argument reIndx is an index number returned from addRE(re).


matches

protected boolean matches(java.lang.String s,
                          int reIndx)
Internal private method that tests if a string matches an RE. The argument reIndx is an index number returned from addRE(re).


getRegex

protected java.lang.String getRegex(int reIndx)
Internal private method that gets a string matching a regex at the beginning of unread input (inbuff, at inindx) returning it (or returning null if no such match). This method also updates coln, linen and inindx. The argument reIndx is an index number returned from addRE(re) and the regular expression re must start with "^".


getToken

public Token getToken(java.lang.String accept)
               throws GlossException
Default form of getToken in which the "disregardComments" and "disregardWhitespace" parameters are true provided UC is not one of the acceptable types, and false otherwise.

Specified by:
getToken in interface Tokenizer
Throws:
GlossException

getToken

public Token getToken(java.lang.String accept,
                      boolean disregardComments,
                      boolean disregardWhitespace,
                      boolean allowLinecontinuation,
                      boolean allowLabel)
               throws GlossException,
                      java.io.IOException
Gets the next token, where the argument "allowedtypes" is a bitwise OR (|) of Token.type_TYPE values of allowed token types. The argument punctvals is a string of all allowed punctuation at this point, separated by |, eg "[|]|{|}". If allowedTypes includes Token.PUNCTYPE and punctvals is non-null then the punctuation token must be one of the ones listed. Up to one ungetToken operation will be possible after a successful return from this method. Returns null if no such token can be found. The depth of the next token is the column number (coln) of the first nonblank character of the token, provided it is the first nonblank token on the current line and the tokeniser is accepting "structured tokens"; it is 10000000+coln otherwise. Special rules apply to labels---see below. The EOS token represents end-of-stream and must be explicitly allowed. If a label token is allowed here and returned, we must be at the beginning of a line (having removed blank lines and comment-only lines) and the line must match "^ *"+namere+"\t", where namere is the usual regular expression for an XML name. There must also be at least one non-label in the input text: the depth of this label is defined to be the same as the depth of the next nonlabel token. Punctuation combinations must be specified in the argument (in which case the only rules are that a valid punctuation combination is not equal to another typename and may not contain |). The empty punctuation combination is allowed. Punctuation combinations are matched left-to-right so use "[[|[" rather than "[|[[" as "[[" will never be matched in the latter case. These are matched before any other tokens apart from labels and EOS.
Attributes match "@"xmlname where the xmlname agrees with usual XML syntax. Pseudo-elements match "!"xmlname where the xmlname agrees with usual XML syntax. A pi matches "?"xmlname where xmlname is as above. An element name matches the usual XML regular expression for names. An entity reference matches &xmlname; A parameter definition matches ${({uri})?name}[ \t]*=[ \t]*value where name is an unqualified XML name and value is "..." or '...' A parameter reference matches ${({uri})?name}|$[a-z\$] where name is as above. A string constant matches \"[^\"\n]*\" A character constant matches \'[^\\\n\r\']|\\[^\n\r]\' A base 64 constant looks like =asgfgjah8s== padded so that there are 4n characters after the first =. (The special case = is allowed.) A URI constant is prefixed with ~ to distinguish it from other tokens, as in ~uri The uri itself can (at present at least) be more or less anything at all, and the only restriction is that it cannot contain whitespace. A floating point constant obeys the usual syntax rules and may have an exponent. It may not contain whitespace. A hex constant looks like 0xA34F. Upper or lower case may be used for x, a-f and these may be mixed. It may not contain whitespace. It represents an arbitrary precision unsigned integer. An integer constant matches [+-][0-9]+ and represents an arbitrary precision signed integer. A character reference matches &#[0-9]+; or &#[xX][0-9a-fA-F]+; the name is the #[0-9]+ or #[xX][0-9a-fA-F]+, the value is the character with that unicode number and the full data is a string name=value. Unicode characters of more than 16 bits are supported. A ns matches [^ \t\n] (any non-whitespace character). The character (with that unicode number, in the first two cases) is returned as a uc token. A uc matches . (including whitespace, new line, etc.). The character (with that unicode number, in the first two cases) is returned. The "disregardComments" flag allows comments ( ;... ) to be ignored.
The "disregardWhitespace" flag allows leading whitespace to be ignored. The combination disregardComments=true and disregardWhitespace==false doesn't really make sense, so its behaviour is not guaranteed. The "allowLinecontinuation" flag allows the line continuation character to merge two lines.

Throws:
GlossException
java.io.IOException

getLABEL

protected Token getLABEL(int depth,
                         int thislinen,
                         int thiscoln)
                  throws GlossException
Throws:
GlossException

getPUNC

protected Token getPUNC(java.lang.String tdata,
                        int depth,
                        int thislinen,
                        int thiscoln)

getEOS

protected Token getEOS(int depth,
                       int thislinen,
                       int thiscoln)
                throws GlossException
Throws:
GlossException

getELT

protected Token getELT(int depth,
                       int thislinen,
                       int thiscoln)

getATTR

protected Token getATTR(int depth,
                        int thislinen,
                        int thiscoln)

getPELT

protected Token getPELT(int depth,
                        int thislinen,
                        int thiscoln)

getEREF

protected Token getEREF(int depth,
                        int thislinen,
                        int thiscoln)

getCREF

protected Token getCREF(int depth,
                        int thislinen,
                        int thiscoln)

getPREF

protected Token getPREF(int depth,
                        int thislinen,
                        int thiscoln)

getPDEF

protected Token getPDEF(int depth,
                        int thislinen,
                        int thiscoln)

getPI

protected Token getPI(int depth,
                      int thislinen,
                      int thiscoln)

getSTR

protected Token getSTR(int depth,
                       int thislinen,
                       int thiscoln)

getCHAR

protected Token getCHAR(int depth,
                        int thislinen,
                        int thiscoln)

getB64

protected Token getB64(int depth,
                       int thislinen,
                       int thiscoln)

getURI

protected Token getURI(int depth,
                       int thislinen,
                       int thiscoln)

getHEX

protected Token getHEX(int depth,
                       int thislinen,
                       int thiscoln)

getFP

protected Token getFP(int depth,
                      int thislinen,
                      int thiscoln)

getINT

protected Token getINT(int depth,
                       int thislinen,
                       int thiscoln)

getNS

protected Token getNS(int depth,
                      int thislinen,
                      int thiscoln)

getUC

protected Token getUC(int depth,
                      int thislinen,
                      int thiscoln)