Initial revision

This commit is contained in:
Hans Lambermont
2002-10-12 11:37:38 +00:00
commit 12315f4d0e
1699 changed files with 444708 additions and 0 deletions

View File

@@ -0,0 +1,39 @@
""" Constants for sets (of characters)
(c) Copyright Marc-Andre Lemburg; All Rights Reserved.
See the documentation for further information on copyrights,
or contact the author (mal@lemburg.com).
"""
import string
# Simple character strings
#
# Every constant below is a plain string of single characters with codes
# in the range 0..255.
a2z = 'abcdefghijklmnopqrstuvwxyz'
A2Z = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
# German special letters, spelled as octal escapes (Latin-1 codes) so the
# values no longer depend on how this source file is (re-)encoded.  The
# codes are exactly the high-bit members recorded in german_alpha_set:
#   umlaute: 228 a-umlaut, 246 o-umlaut, 252 u-umlaut, 223 sharp s
#   Umlaute: 196 A-umlaut, 214 O-umlaut, 220 U-umlaut
# NOTE(review): the order inside each string is presumed; membership is
# what german_alpha_set pins down.
umlaute = '\344\366\374\337'
Umlaute = '\304\326\334'
alpha = A2Z + a2z
german_alpha = A2Z + a2z + umlaute + Umlaute
number = '0123456789'
alphanumeric = alpha + number
white = ' \t\v'
newline = '\r\n'
formfeed = '\f'
whitespace = white + newline + formfeed
# All 256 character codes in ascending order (0..255).  The name shadows
# the builtin any() inside this module; it is kept because it is part of
# the module's public interface.
any = ''.join(map(chr, range(256)))
# Precompiled as sets, e.g. a2z_set = set(a2z)
#
# Each set is a 32 character string used as a 256 bit map: the character
# with code c is a member when bit (c & 7) of the byte at index (c >> 3)
# is set (least significant bit first).  The values are spelled as runs so
# that the populated bytes stand out.
a2z_set = '\000' * 12 + '\376\377\377\007' + '\000' * 16
A2Z_set = '\000' * 8 + '\376\377\377\007' + '\000' * 20
alpha_set = '\000' * 8 + '\376\377\377\007' * 2 + '\000' * 16
german_alpha_set = ('\000' * 8 + '\376\377\377\007' * 2 + '\000' * 8
                    + '\020\000@\220\020\000@\020')
number_set = '\000' * 6 + '\377\003' + '\000' * 24
alphanumeric_set = ('\000' * 6 + '\377\003' + '\376\377\377\007' * 2
                    + '\000' * 16)
# white is ' \t\v': byte 1 carries tab (bit 1) and vertical tab (bit 3),
# byte 4 carries the space.  The previous value had only the tab bit set
# in byte 1 and so disagreed with the white string.
white_set = '\000\012\000\000\001' + '\000' * 27
newline_set = '\000$' + '\000' * 30
# whitespace is ' \t\v\r\n\f': byte 1 carries codes 9..13 (bits 1..5).
# The previous value omitted vertical tab and form feed, which left those
# two characters in neither whitespace_set nor nonwhitespace_set.
whitespace_set = '\000\076\000\000\001' + '\000' * 27
# Bitwise complement of whitespace_set.
nonwhitespace_set = '\377\301\377\377\376' + '\377' * 27
any_set = '\377' * 32
# Clean up
del string

View File

@@ -0,0 +1,348 @@
""" Constants for writing tag tables
The documentation in this file is obsoleted by the HTML docs in
the Doc/ subdirectory of the package. Constants defined here must
match those in mxTextTools/mxte.h.
(c) Copyright Marc-Andre Lemburg; All Rights Reserved.
See the documentation for further information on copyrights,
or contact the author (mal@lemburg.com).
"""
#########################################################################
# This file contains the definitions and constants used by the tagging
# engine:
#
# 1. Matching Tables
# 2. Commands & Constants
# 3. Matching Functions
# 4. Callable tagobjects
# 5. Calling the engine & Taglists
#
#########################################################################
# 1. Matching Tables:
#
# these are tuples of tuples, each entry having the following meaning:
#
# syntax: (tag, cmd, chars|table|fct [,jne] [,je=1])
# tag = object used to mark this section, if it matches
# cmd = command (see below)
# chars = match one or more of these characters
# table = table to use for matching characters
# fct = function to call (see below)
# jne = if the current character doesn't match, jump this
# many table entries relative to the current entry
# je = if we have a match make a relative jump of this length
#
# * a table matches a string iff the end of the table is reached
# (that is: an index is requested that is beyond the end-of-table)
# * a table is not matched if a tag is not matched and no jne is given;
# if it is matched then processing simply moves on to the next entry
# * marking is done by adding the matching slice in the string
# together with the marking object to the taglist; if the object is
# None, then it will not be appended to the taglist
# * if the flag CallTag is set in cmd, then instead of appending
# matches to the taglist, the tagobj will be called (see below)
#
# TIP: if you are getting an error 'call of a non-function' while
# writing a table definition, you probably have a missing ','
# somewhere in the tuple !
#
# For examples see the tag*.py - files that came with this engine.
#
#########################################################################
# 2. Commands & Constants
#
#
#
# some useful constants for writing matching tables
#
# Readability helpers for table entries.  To and Here are both plain None;
# they only exist so that an entry reads naturally.
To = None           # placeholder match argument, good for cmd=Jump
Here = None         # placeholder match argument, good for cmd=Fail and EOF
# Jump distances large enough to leave the table in either direction.
MatchOk = 20000     # somewhere beyond the end of the tag table...
MatchFail = -20000  # somewhere before the start of the tag table...
ToEOF = -1          # match argument for cmd=Move: move to the EOF
ThisTable = 999     # to recursively match using the current table;
                    # can be passed as argument to Table and SubTable
                    # instead of a tuple
#
# commands and flags passed in cmd (see below)
#
# note: I might add some further commands to this list, if needed
# (the numbers will then probably change, but not the
# names)
#
# convention: a command "matches" if and only if it moves the
# current position by at least one character; a command "reads"
# the characters, if they match ok
#
# notations:
#
# x refers to the current position in the string
# len refers to the string length or what the function tag() is told to
# believe it to be (i.e. the engine only looks at the slice text[x:len])
# text refers to the text string
# jne is the optional relative jump distance in case the command
# did not match, i.e. x before and after applying the command
# are the same (if not given the current table is considered
# not to match)
# je is the optional relative jump distance in case the command
# did match (it defaults to +1)
#
# commands
#
# Fail and Jump share the code 0; _module_init() reports it as 'Fail/Jump'.
Fail = 0       # this will always fail (position remains unchanged)
Jump = 0       # jump to jne (position remains unchanged)
# match & read chars
AllIn = 11     # all chars in match (at least one)
AllNotIn = 12  # all chars not in match (at least one)
Is = 13        # current char must be == match (matches one char)
IsIn = 14      # current char must be in match (matches one char)
IsNot = 15     # current char must be != match (matches one char)
# NOTE(review): IsNotIn carries the same code as IsNot -- presumably
# intended, since "not in" a one character match string is the same test
# as "!="; confirm against mxTextTools/mxte.h.
IsNotIn = 15   # current char must not be in match (matches one char)
# NOTE(review): the next two are undocumented here; presumably the
# counterparts of AllIn / IsIn that take a precompiled character set
# string as match -- confirm against mxTextTools/mxte.h.
AllInSet = 31
IsInSet = 32
# match & read for whole words
Word = 21          # the next chars must be those in match
WordStart = 22     # all chars up to the first occ. of match (at least one)
WordEnd = 23       # same as WordStart, except that the text pointer
                   # is moved behind the match
NoWord = WordStart # alias of WordStart: all chars up to the first occ.
                   # of match (at least one)
# match using search objects BMS or FS
sWordStart = 111   # all chars up to the first occ. of match (may be 0 chars)
sWordEnd = 112     # same as WordStart, except that the text pointer
                   # is moved behind the match
sFindWord = 113    # find match and process the found slice only (ignoring
                   # the chars that lead up to the match); positions
                   # the text pointer right after the match like WordEnd
# functions & tables
Call = 201            # call match(text,x,len) as function (the protocol is
                      # described in section 3, "Matching Functions")
CallArg = 202         # match has to be a 2-tuple (fct,arg), then
                      # fct(text,x,len,arg) is called; the return value is
                      # taken as new x; it is considered matching if the
                      # new x is different from the x before the call --
                      # like always
                      # (note: arg has to be *one* object, e.g. a tuple)
Table = 203           # match using table (given in match)
SubTable = 207        # match using sub table (given in match); the sub
                      # table uses the same taglist as the calling table
TableInList = 204     # same as Table, but match is a tuple (list,index)
                      # and the table list[index] is used as matching
                      # table
SubTableInList = 208
                      # same as TableInList, but the sub table
                      # uses the same taglist as the calling table
# specials
EOF = 1   # current position must be EOF, i.e. >= len(string)
Skip = 2  # skip match (must be an integer) chars; note: this cmd
          # always matches ok, so jne doesn't have any meaning in
          # this context
Move = 3  # move the current text position to match (if negative,
          # the text length + 1 (!) is added, thus -1 moves to the
          # EOF, -2 to the last char and so on); note: this cmd
          # always matches ok, so jne doesn't have any meaning in
          # this context
# loops
Loop = 205          # loop-construct
#
#   (tagobj,Loop,Count,jne,je) - sets/decrements the
#   loop variable for the current table according to the
#   following rules:
#
#   1. the first time the engine passes this entry it
#      sets the loop variable to Count and continues
#      without reading any character, but saving the
#      current position in text
#   2. the next time, it decrements the loop variable
#      and checks if it is < 0:
#      (a) if it is, then the tagobj is added to the
#          taglist with the slice (saved position, current
#          position) and processing continues at entry
#          current + jne
#      (b) else, processing continues at entry current + je
#
#   Note: if you jump out of the loop while the loop
#         variable is still > 0, then you *must*
#         reset the loop mechanism with
#         (None,LoopControl,Reset)
#   Note: you can skip the remaining loops by calling
#         (None,LoopControl,Break) and jumping back
#         to the Loop-entry; this sets the loop
#         variable to 0
#   Note: tables cannot have nested loops within their
#         context; you can have nested loops in nested
#         tables though (there is one loop var per
#         tag()-call which takes place every time
#         a table match is done)
#
LoopControl = 206   # controls the loop variable (always succeeds, i.e.
                    # jne has no meaning);
                    # match may be one of:
Break = 0           # * sets the loop variable to 0, thereby allowing
                    #   to skip the remaining loops
Reset = -1          # * resets the loop mechanism (see the first note
                    #   in the Loop description)
#
# See tagLoop.py for some examples.
##########################################################################
#
# Flags (to be '+'ed with the above command code)
#
CallTag = 256         # call tagobj(taglist,text,l,r,subtags)
                      # upon successfully matching the slice [l:r] in text
                      # * taglist is the current list of tags found (may
                      #   be None)
                      # * subtags is a sub-list, passed when a subtable was
                      #   used to do the matching -- it is None otherwise !
#
# example entry with CallTag-flag set:
#
#   (found_a_tag,CallTag+Table,tagtable)
#   -- if tagtable matches the current text position,
#      found_a_tag(taglist,text,l,r,newtaglist) is called and
#      the match is *not* appended to the taglist by the tagging
#      engine (the function would have to do this, in case it is needed)
AppendToTagobj = 512  # this appends the slice found to the tagobj, assuming
                      # that it is a Python list:
                      # does a tagobj.append((None,l,r,subtags)) call
# Alias for backward compatibility.
AppendToTag = AppendToTagobj
AppendTagobj = 1024   # don't append (tagobj,l,r,subtags) to the taglist,
                      # but only tagobj itself; the information in
                      # l,r,subtags is lost, yet this can be used to write
                      # tag tables whose output can be used directly by
                      # tag.join()
AppendMatch = 2048    # append the match to the taglist instead of
                      # the tag object; this produces non-standard
                      # taglists !
#########################################################################
# 3. Matching Functions
#
# syntax:
#
# fct(s,x,len_s)
# where s = string we are working on
# x = current index in s where we want to match something
# len_s = 'length' of s, this is how far the search may be
# conducted in s, not necessarily the true length of s
#
# * the function has to return the index of the char right after
# the matched string, e.g.
#
# 'xyzabc' ---> 'xyz' matches ---> return x+3
#
# * if the string doesn't match simply return x; in other words:
# the function has to return the matching slice's right index
# * you can use this to match e.g. 10 characters of a certain kind,
# or any word out of a given list, etc.
# * note: you cannot give the function additional parameters from within
# the matching table, so it has to know everything it needs to
# know a priori; use dynamic programming !
#
# some examples (not needed, since all are implemented by commands)
#
#
#def matchword(x):
# s = """
#def a(s,x,len_text):
# y = x+%i
# if s[x:y] == %s: return y
# return x
#"""
# exec s % (len(x),repr(x))
# return a
#
#def rejectword(x):
# s = """
#def a(s,x,len_text):
# while x < len(s) and s[x:x+%i] != %s:
# x = x + 1
# return x
#"""
# exec s % (len(x),repr(x))
# return a
#
#def HTML_Comment(s,x,len_text):
# while x < len_text and s[x:x+3] != '-->':
# x = x + 1
# return x
#
#
#########################################################################
# 4. Callable tagobjects
#
# a sample callable tagobj:
#
#
#def test(taglist,text,l,r,newtaglist):
#
# print 'found',repr(text[l:r])[:40],(l,r)
#
#
#########################################################################
# 5. Calling the engine & Taglists
#
# The function
# tag(text,table,start=0,len_text=len(text),taglistinit=[])
# found in mxTextTools:
#
# This function does all the matching according to the above rules.
# You give it a text string and a tag table and it will
# start processing the string starting from 'start' (which defaults to 0)
# and continue working until it reaches the 'EOF', i.e. len_text (which
# defaults to the text length). It thus tags the slice text[start:len_text].
#
# The function will create a list of found tags in the following
# format (which I call taglist):
#
# (tagobj,l,r,subtaglist)
#
# where: tagobj = specified tag object taken from the table
# [l:r] = slice that matched the tag in text
# subtaglist = if matching was done using a subtable
# this is the taglist it produced; in all other
# cases this will be None
#
# * if you pass None as taglistinit, then no taglist will be created,
# i.e. only CallTag commands will have any effect. (This saves
# temporary memory for big files)
# * the function returns a tuple:
# (success, taglist, nextindex)
# where: success = 0/1
# taglist = the produced list or None
# nextindex = the index+1 of the last char that matched
# (in case of failure, this points to the beginning
# of the substring that caused the problem)
#
### Module init.
def _module_init():
    """Build the module-level ``id2cmd`` reverse lookup table.

    Scans this module's global namespace and records, for every name bound
    to a plain integer, the mapping ``value -> name`` in the global dict
    ``id2cmd``.  The value 0 is always reported as ``'Fail/Jump'``.

    Every integer global is picked up, not only command codes.  When
    several names share one value (e.g. ``IsNot``/``IsNotIn`` or
    ``WordStart``/``NoWord``), the name visited last by the iteration is
    the one that ends up in the table.
    """
    global id2cmd
    id2cmd = {}
    # Iterate over a snapshot so the loop never walks a live view of the
    # namespace it is running in.
    for name, value in list(globals().items()):
        # Exact type test on purpose: it mirrors the former comparison
        # against types.IntType (gone in Python 3, where the module failed
        # at import time) and keeps bool and other int subclasses out.
        if type(value) is not int:
            continue
        if value == 0:
            id2cmd[0] = 'Fail/Jump'
        else:
            id2cmd[value] = name


_module_init()

View File

@@ -0,0 +1 @@