Initial revision
This commit is contained in:
39
intern/python/modules/TextTools/Constants/Sets.py
Normal file
39
intern/python/modules/TextTools/Constants/Sets.py
Normal file
@@ -0,0 +1,39 @@
|
||||
""" Constants for sets (of characters)
|
||||
|
||||
(c) Copyright Marc-Andre Lemburg; All Rights Reserved.
|
||||
See the documentation for further information on copyrights,
|
||||
or contact the author (mal@lemburg.com).
|
||||
"""
|
||||
import string
|
||||
|
||||
# Simple character strings
|
||||
|
||||
# Plain ASCII letters.
a2z = 'abcdefghijklmnopqrstuvwxyz'
A2Z = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'

# German umlauts and sharp s as Latin-1 code points. They are spelled as
# octal escapes so the values do not depend on the encoding of this source
# file; they are exactly the upper-half characters flagged in
# german_alpha_set.
umlaute = '\344\366\374\337'    # a-umlaut, o-umlaut, u-umlaut, sharp s
Umlaute = '\304\326\334'        # A-umlaut, O-umlaut, U-umlaut

# Letter and digit collections built from the pieces above.
alpha = A2Z + a2z
german_alpha = A2Z + a2z + umlaute + Umlaute
number = '0123456789'
alphanumeric = alpha + number

# Whitespace families.
white = ' \t\v'
newline = '\r\n'
formfeed = '\f'
whitespace = white + newline + formfeed

# All 256 single-byte characters in ascending code order.
# NOTE: this name shadows the any() builtin inside this module; it is kept
# because it is part of the module's public interface.
any = ''.join(map(chr, range(256)))
|
||||
|
||||
# Precompiled as sets, e.g. a2z_set = set(a2z)
|
||||
# Every set is a 32-character bitmap covering the 256 single-byte
# characters: the character with code c is a member when bit (c & 7) of the
# bitmap character at index (c >> 3) is set.

# Letters: '\376\377\377\007' flags 26 consecutive codes starting one past
# the beginning of a 32-code row ('A'..'Z' in row 64, 'a'..'z' in row 96).
a2z_set = '\000' * 12 + '\376\377\377\007' + '\000' * 16
A2Z_set = '\000' * 8 + '\376\377\377\007' + '\000' * 20
alpha_set = '\000' * 8 + '\376\377\377\007' * 2 + '\000' * 16

# alpha_set plus the Latin-1 codes 196, 214, 220, 223, 228, 246 and 252.
german_alpha_set = ('\000' * 8 + '\376\377\377\007' * 2 + '\000' * 8
                    + '\020\000@\220\020\000@\020')

# Digits '0'..'9' (codes 48-57).
number_set = '\000' * 6 + '\377\003' + '\000' * 24
alphanumeric_set = ('\000' * 6 + '\377\003' + '\376\377\377\007' * 2
                    + '\000' * 16)

# Bitmap character 1 covers codes 8-15. '\012' flags \t (9) and \v (11) so
# that the set agrees with white = ' \t\v'; the space (32) is bit 0 of
# bitmap character 4.
white_set = '\000\012\000\000\001' + '\000' * 27

# '$' (0x24) flags \n (10) and \r (13).
newline_set = '\000$' + '\000' * 30

# '>' (0x3e) flags codes 9-13 (\t \n \v \f \r) so that the set agrees with
# whitespace = white + newline + formfeed and is the exact complement of
# nonwhitespace_set.
whitespace_set = '\000>\000\000\001' + '\000' * 27
nonwhitespace_set = '\377\301\377\377\376' + '\377' * 27

# Every character is a member.
any_set = '\377' * 32
|
||||
|
||||
# Clean up
|
||||
del string
|
348
intern/python/modules/TextTools/Constants/TagTables.py
Normal file
348
intern/python/modules/TextTools/Constants/TagTables.py
Normal file
@@ -0,0 +1,348 @@
|
||||
""" Constants for writing tag tables
|
||||
|
||||
The documentation in this file is obsoleted by the HTML docs in
|
||||
the Doc/ subdirectory of the package. Constants defined here must
|
||||
match those in mxTextTools/mxte.h.
|
||||
|
||||
(c) Copyright Marc-Andre Lemburg; All Rights Reserved.
|
||||
See the documentation for further information on copyrights,
|
||||
or contact the author (mal@lemburg.com).
|
||||
"""
|
||||
#########################################################################
|
||||
# This file contains the definitions and constants used by the tagging
|
||||
# engine:
|
||||
#
|
||||
# 1. Matching Tables
|
||||
# 2. Commands & Constants
|
||||
# 3. Matching Functions
|
||||
# 4. Callable tagobjects
|
||||
# 5. Calling the engine & Taglists
|
||||
#
|
||||
|
||||
#########################################################################
|
||||
# 1. Matching Tables:
|
||||
#
|
||||
# these are tuples of tuples, each entry having the following meaning:
|
||||
#
|
||||
# syntax: (tag, cmd, chars|table|fct [,jne] [,je=1])
|
||||
# tag = object used to mark this section, if it matches
|
||||
# cmd = command (see below)
|
||||
# chars = match one or more of these characters
|
||||
# table = table to use for matching characters
|
||||
# fct = function to call (see below)
|
||||
# jne = if the current character doesn't match, jump this
|
||||
# many table entries relative to the current entry
|
||||
# je = if we have a match make a relative jump of this length
|
||||
#
|
||||
# * a table matches a string iff the end of the table is reached
|
||||
# (that is: an index is requested that is beyond the end-of-table)
|
||||
# * a table is not matched if a tag is not matched and no jne is given;
|
||||
# if it is matched then processing simply moves on to the next entry
|
||||
# * marking is done by adding the matching slice in the string
|
||||
# together with the marking object to the tag list; if the object is
|
||||
# None, then it will not be appended to the taglist
|
||||
# * if the flag CallTag is set in cmd, then instead of appending
|
||||
# matches to the taglist, the tagobj will be called (see below)
|
||||
#
|
||||
# TIP: if you are getting an error 'call of a non-function' while
|
||||
# writing a table definition, you probably have a missing ','
|
||||
# somewhere in the tuple !
|
||||
#
|
||||
# For examples see the tag*.py - files that came with this engine.
|
||||
#
|
||||
|
||||
#########################################################################
|
||||
# 2. Commands & Constants
|
||||
#
|
||||
#
|
||||
|
||||
#
|
||||
# some useful constants for writing matching tables
|
||||
#
|
||||
|
||||
# Readable fillers for the match slot of a table entry:
#   To   -- good for cmd=Jump
#   Here -- good for cmd=Fail and cmd=EOF
To = Here = None

# Values that lie somewhere beyond the end / beyond the start of the
# tag table.
MatchOk = 20000
MatchFail = -20000

# Good for cmd=Move.
ToEOF = -1

# Hand this to Table or SubTable in place of a tuple in order to match
# recursively using the current table.
ThisTable = 999
|
||||
|
||||
#
|
||||
# commands and flags passed in cmd (see below)
|
||||
#
|
||||
# note: I might add some further commands to this list, if needed
|
||||
# (the numbers will then probably change, but not the
|
||||
# names)
|
||||
#
|
||||
# convention: a command "matches", if and only if it moves the
|
||||
# current position at least one character; a command "reads"
|
||||
# the characters, if they match ok
|
||||
#
|
||||
# notations:
|
||||
#
|
||||
# x refers to the current position in the string
|
||||
# len refers to the string length or what the function tag() is told to
|
||||
# believe it to be (i.e. the engine only looks at the slice text[x:len])
|
||||
# text refers to the text string
|
||||
# jne is the optional relative jump distance in case the command
|
||||
# did not match, i.e. x before and after applying the command
|
||||
# are the same (if not given the current table is considered
|
||||
# not to match)
|
||||
# je is the optional relative jump distance in case the command
|
||||
# did match (it defaults to +1)
|
||||
#
|
||||
|
||||
# commands
|
||||
# Control commands; both share the code 0 and leave the text position
# unchanged:
#   Fail -- always fails
#   Jump -- jumps to jne
Fail = Jump = 0

# Commands that match & read characters.
# "All..." commands consume a run of at least one character:
AllIn = 11              # every char of the run is in match
AllNotIn = 12           # no char of the run is in match
# "Is..." commands consume exactly one character:
Is = 13                 # the current char must be == match
IsIn = 14               # the current char must be in match
# IsNot   -- the current char must be != match
# IsNotIn -- the current char must not be in match
# (both names carry the same code)
IsNot = IsNotIn = 15

# Set based commands; this file does not describe them further.
# NOTE(review): presumably the AllIn / IsIn counterparts taking a
# precompiled character set as match -- confirm against mxte.h.
AllInSet = 31
IsInSet = 32
|
||||
|
||||
# match & read for whole words
|
||||
# Whole-word commands (match & read).
#   Word      -- the next chars must be those in match
#   WordStart -- reads all chars up to the first occurrence of match
#                (at least one char)
#   WordEnd   -- same as WordStart, except that the text pointer is moved
#                behind the match
Word = 21
WordStart = 22
WordEnd = 23
# NoWord is another name for WordStart.
NoWord = WordStart

# The same idea using search objects (BMS or FS) as match.
#   sWordStart -- reads all chars up to the first occurrence of match
#                 (this may be 0 chars)
#   sWordEnd   -- same, except that the text pointer is moved behind the
#                 match (the original note compares it to WordStart;
#                 presumably sWordStart is meant)
#   sFindWord  -- finds match and processes the found slice only, ignoring
#                 the chars leading up to it; the text pointer ends up
#                 right after the match, like WordEnd
sWordStart = 111
sWordEnd = 112
sFindWord = 113
|
||||
|
||||
# functions & tables
|
||||
# Commands that delegate to functions.
#
# Call: match is called as a function, match(text,x,len); see the section
# on matching functions.
Call = 201
# CallArg: match has to be a 2-tuple (fct,arg) and fct(text,x,len,arg) is
# called. The return value is taken as the new x, and -- like always -- the
# command counts as matching if the new x differs from the x before the
# call. Note that arg has to be *one* object, e.g. a tuple.
CallArg = 202

# Commands that delegate to other tag tables.
#
# Table: match using the table given in match.
Table = 203
# TableInList: same as Table, but match is a tuple (list,index) and the
# table list[index] is used as matching table.
TableInList = 204
# SubTable: match using the sub table given in match; the sub table uses
# the same taglist as the calling table.
SubTable = 207
# SubTableInList: same as TableInList, but the sub table uses the same
# taglist as the calling table.
SubTableInList = 208
|
||||
|
||||
# specials
|
||||
# Special commands.
#
# EOF: the current position must be EOF, e.g. >= len(string).
EOF = 1
# Skip: skips match chars (match must be an integer). This command always
# matches ok, so jne has no meaning here.
Skip = 2
# Move: moves the current text position to match. For a negative match the
# text length + 1 (!) is added, so -1 moves to the EOF, -2 to the last char
# and so on. This command always matches ok, so jne has no meaning here.
Move = 3
|
||||
|
||||
# loops
|
||||
# Loop construct, written as (tagobj,Loop,Count,jne,je).
#
# The entry sets/decrements the loop variable of the current table:
#
#   1. The first time the engine passes the entry, it sets the loop
#      variable to Count and continues without reading any character,
#      but it saves the current position in text.
#   2. On every later pass it decrements the loop variable and checks
#      whether it is < 0:
#      (a) if it is, the tagobj is added to the taglist with the slice
#          (saved position, current position) and processing continues
#          at entry current + jne
#      (b) otherwise processing continues at entry current + je
#
# Notes:
#   * If you jump out of the loop while the loop variable is still > 0,
#     you *must* reset the loop mechanism with (None,LoopControl,Reset).
#   * You can skip the remaining loops by calling (None,LoopControl,Break)
#     and jumping back to the Loop entry; this sets the loop variable to 0.
#   * Tables cannot have nested loops within their context. Nested loops
#     in nested tables are possible though: there is one loop variable per
#     tag()-call, and such a call takes place every time a table match is
#     done.
Loop = 205

# LoopControl manipulates the loop variable. It always succeeds, i.e. jne
# has no meaning. Its match may be one of:
#   Break -- sets the loop variable to 0, thereby allowing to skip the
#            remaining loops
#   Reset -- resets the loop mechanism (see the first note above)
LoopControl = 206
Break = 0
Reset = -1

# tagLoop.py contains some examples.
|
||||
|
||||
##########################################################################
|
||||
#
|
||||
# Flags (to be '+'ed with the above command code)
|
||||
#
|
||||
# CallTag: upon successfully matching the slice [l:r] in text the engine
# calls tagobj(taglist,text,l,r,subtags), where
#   * taglist is the current list of tags found (may be None)
#   * subtags is a sub-list, passed when a subtable was used to do the
#     matching -- it is None otherwise!
#
# Example entry with the CallTag flag set:
#
#   (found_a_tag,CallTag+Table,tagtable)
#
# If tagtable matches the current text position,
# found_a_tag(taglist,text,l,r,newtaglist) is called and the match is
# *not* appended to the taglist by the tagging engine (the function would
# have to do this itself, in case it is needed).
CallTag = 256

# AppendToTagobj: appends the slice found to the tagobj, assuming that it
# is a Python list, by doing a tagobj.append((None,l,r,subtags)) call.
AppendToTagobj = 512
# Alias kept for backward compatibility.
AppendToTag = AppendToTagobj

# AppendTagobj: instead of (tagobj,l,r,subtags) only tagobj itself is
# appended to the taglist. The information in l,r,subtags is lost, yet this
# makes it possible to write tag tables whose output can be used directly
# by tag.join().
AppendTagobj = 1024

# AppendMatch: appends the match to the taglist instead of the tag object;
# this produces non-standard taglists!
AppendMatch = 2048
|
||||
|
||||
#########################################################################
|
||||
# 3. Matching Functions
|
||||
#
|
||||
# syntax:
|
||||
#
|
||||
# fct(s,x,len_s)
|
||||
# where s = string we are working on
|
||||
# x = current index in s where we want to match something
|
||||
# len_s = 'length' of s, this is how far the search may be
|
||||
# conducted in s, not necessarily the true length of s
|
||||
#
|
||||
# * the function has to return the index of the char right after
|
||||
# the matched string, e.g.
|
||||
#
|
||||
# 'xyzabc' ---> 'xyz' matches ---> return x+3
|
||||
#
|
||||
# * if the string doesn't match simply return x; in other words:
|
||||
# the function has to return the matching slice's right index
|
||||
# * you can use this to match e.g. 10 characters of a certain kind,
|
||||
# or any word out of a given list, etc.
|
||||
# * note: you cannot give the function additional parameters from within
|
||||
# the matching table, so it has to know everything it needs to
|
||||
# know a priori; use dynamic programming !
|
||||
#
|
||||
# some examples (not needed, since all are implemented by commands)
|
||||
#
|
||||
#
|
||||
#def matchword(x):
|
||||
# s = """
|
||||
#def a(s,x,len_text):
|
||||
# y = x+%i
|
||||
# if s[x:y] == %s: return y
|
||||
# return x
|
||||
#"""
|
||||
# exec s % (len(x),repr(x))
|
||||
# return a
|
||||
#
|
||||
#def rejectword(x):
|
||||
# s = """
|
||||
#def a(s,x,len_text):
|
||||
# while x < len(s) and s[x:x+%i] != %s:
|
||||
# x = x + 1
|
||||
# return x
|
||||
#"""
|
||||
# exec s % (len(x),repr(x))
|
||||
# return a
|
||||
#
|
||||
#def HTML_Comment(s,x,len_text):
|
||||
# while x < len_text and s[x:x+3] != '-->':
|
||||
# x = x + 1
|
||||
# return x
|
||||
#
|
||||
#
|
||||
|
||||
#########################################################################
|
||||
# 4. Callable tagobjects
|
||||
#
|
||||
# a sample callable tagobj:
|
||||
#
|
||||
#
|
||||
#def test(taglist,text,l,r,newtaglist):
|
||||
#
|
||||
# print 'found',repr(text[l:r])[:40],(l,r)
|
||||
#
|
||||
#
|
||||
|
||||
#########################################################################
|
||||
# 5. Calling the engine & Taglists
|
||||
#
|
||||
# The function
|
||||
# tag(text,table,start=0,len_text=len(text),taglistinit=[])
|
||||
# found in mxTextTools:
|
||||
#
|
||||
# This function does all the matching according to the above rules.
|
||||
# You give it a text string and a tag table and it will
|
||||
# start processing the string starting from 'start' (which defaults to 0)
|
||||
# and continue working until it reaches the 'EOF', i.e. len_text (which
|
||||
# defaults to the text length). It thus tags the slice text[start:len_text].
|
||||
#
|
||||
# The function will create a list of found tags in the following
|
||||
# format (which I call taglist):
|
||||
#
|
||||
# (tagobj,l,r,subtaglist)
|
||||
#
|
||||
# where: tagobj = specified tag object taken from the table
|
||||
# [l:r] = slice that matched the tag in text
|
||||
# subtaglist = if matching was done using a subtable
|
||||
# this is the taglist it produced; in all other
|
||||
# cases this will be None
|
||||
#
|
||||
# * if you pass None as taglistinit, then no taglist will be created,
|
||||
# i.e. only CallTag commands will have any effect. (This saves
|
||||
# temporary memory for big files)
|
||||
# * the function returns a tuple:
|
||||
# (success, taglist, nextindex)
|
||||
# where: success = 0/1
|
||||
# taglist = the produced list or None
|
||||
# nextindex = the index+1 of the last char that matched
|
||||
# (in case of failure, this points to the beginning
|
||||
# of the substring that caused the problem)
|
||||
#
|
||||
|
||||
### Module init.
|
||||
|
||||
def _module_init():
    """Build the module-level ``id2cmd`` reverse lookup table.

    Every module global whose value is a plain integer is recorded as
    ``id2cmd[value] = name``; the value 0 is always recorded under the
    fixed label 'Fail/Jump'. When several names carry the same non-zero
    value, the name visited last wins.

    Side effects:
        Creates (or replaces) the module global ``id2cmd``.
    """
    global id2cmd

    id2cmd = {}
    # Iterate over a snapshot so the loop never walks a live dict view.
    for name, value in list(globals().items()):
        # Exact type test on purpose: only plain integers qualify, not
        # bools. The builtin int is the very object Python 2 exposes as
        # types.IntType, and unlike that attribute it also exists on
        # Python 3.
        if type(value) is not int:
            continue
        if value == 0:
            id2cmd[0] = 'Fail/Jump'
        else:
            id2cmd[value] = name


_module_init()
|
1
intern/python/modules/TextTools/Constants/__init__.py
Normal file
1
intern/python/modules/TextTools/Constants/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
|
Reference in New Issue
Block a user