Initial revision

This commit is contained in:
Hans Lambermont
2002-10-12 11:37:38 +00:00
commit 12315f4d0e
1699 changed files with 444708 additions and 0 deletions

View File

@@ -0,0 +1,39 @@
""" Constants for sets (of characters)
(c) Copyright Marc-Andre Lemburg; All Rights Reserved.
See the documentation for further information on copyrights,
or contact the author (mal@lemburg.com).
"""
import string
# Simple character strings
a2z = 'abcdefghijklmnopqrstuvwxyz'
A2Z = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
# German umlauts and sharp s, written as octal escapes of their Latin-1
# codes so that the file stays pure ASCII and survives re-encoding; these
# are exactly the non-ASCII characters whose bits are set in
# german_alpha_set.
umlaute = '\344\366\374\337'    # a-umlaut, o-umlaut, u-umlaut, sharp s
Umlaute = '\304\326\334'        # A-umlaut, O-umlaut, U-umlaut
alpha = A2Z + a2z
german_alpha = A2Z + a2z + umlaute + Umlaute
number = '0123456789'
alphanumeric = alpha + number
white = ' \t\v'
newline = '\r\n'
formfeed = '\f'
whitespace = white + newline + formfeed
any = '\000\001\002\003\004\005\006\007\010\011\012\013\014\015\016\017\020\021\022\023\024\025\026\027\030\031\032\033\034\035\036\037 !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\177\200\201\202\203\204\205\206\207\210\211\212\213\214\215\216\217\220\221\222\223\224\225\226\227\230\231\232\233\234\235\236\237\240\241\242\243\244\245\246\247\250\251\252\253\254\255\256\257\260\261\262\263\264\265\266\267\270\271\272\273\274\275\276\277\300\301\302\303\304\305\306\307\310\311\312\313\314\315\316\317\320\321\322\323\324\325\326\327\330\331\332\333\334\335\336\337\340\341\342\343\344\345\346\347\350\351\352\353\354\355\356\357\360\361\362\363\364\365\366\367\370\371\372\373\374\375\376\377'
# Precompiled as sets, e.g. a2z_set = set(a2z)
#
# Each set is a 32 character string used as a 256 bit map: the character
# with code c is a member iff bit (c & 7) of the byte at index (c >> 3)
# is set.  Runs of identical bytes are written as multiplications so the
# total length of 32 is easy to check.
a2z_set = 12*'\000' + '\376\377\377\007' + 16*'\000'
A2Z_set = 8*'\000' + '\376\377\377\007' + 20*'\000'
alpha_set = 8*'\000' + '\376\377\377\007\376\377\377\007' + 16*'\000'
german_alpha_set = (8*'\000' + '\376\377\377\007\376\377\377\007'
                    + 8*'\000' + '\020\000@\220\020\000@\020')
number_set = 6*'\000' + '\377\003' + 24*'\000'
alphanumeric_set = (6*'\000' + '\377\003'
                    + '\376\377\377\007\376\377\377\007' + 16*'\000')
# white_set and whitespace_set used to lack '\v' (and '\f'), although the
# strings white/whitespace contain them and nonwhitespace_set excludes
# them; the bits are set here so that whitespace_set and
# nonwhitespace_set are exact complements again.
white_set = '\000\012\000\000\001' + 27*'\000'
newline_set = '\000$' + 30*'\000'
whitespace_set = '\000\076\000\000\001' + 27*'\000'
nonwhitespace_set = '\377\301\377\377\376' + 27*'\377'
any_set = 32*'\377'
# Clean up
del string

View File

@@ -0,0 +1,348 @@
""" Constants for writing tag tables
The documentation in this file is obsoleted by the HTML docs in
the Doc/ subdirectory of the package. Constants defined here must
match those in mxTextTools/mxte.h.
(c) Copyright Marc-Andre Lemburg; All Rights Reserved.
See the documentation for further information on copyrights,
or contact the author (mal@lemburg.com).
"""
#########################################################################
# This file contains the definitions and constants used by the tagging
# engine:
#
# 1. Matching Tables
# 2. Commands & Constants
# 3. Matching Functions
# 4. Callable tagobjects
# 5. Calling the engine & Taglists
#
#########################################################################
# 1. Matching Tables:
#
# these are tuples of tuples, each entry having the following meaning:
#
# syntax: (tag, cmd, chars|table|fct [,jne] [,je=1])
# tag = object used to mark this section, if it matches
# cmd = command (see below)
# chars = match one or more of these characters
# table = table to use for matching characters
# fct = function to call (see below)
# jne = if the current character doesn't match, jump this
# many table entries relative to the current entry
# je = if we have a match make a relative jump of this length
#
# * a table matches a string iff the end of the table is reached
# (that is: an index is requested that is beyond the end-of-table)
# * a table is not matched if a tag is not matched and no jne is given;
# if it is matched then processing simply moves on to the next entry
# * marking is done by adding the matching slice in the string
# together with the marking object to the tag list; if the object is
# None, then it will not be appended to the taglist list
# * if the flag CallTag is set in cmd, then instead of appending
# matches to the taglist, the tagobj will be called (see below)
#
# TIP: if you are getting an error 'call of a non-function' while
# writing a table definition, you probably have a missing ','
# somewhere in the tuple !
#
# For examples see the tag*.py - files that came with this engine.
#
#########################################################################
# 2. Commands & Constants
#
#
#
# some useful constants for writing matching tables
#
To = None # good for cmd=Jump
Here = None # good for cmd=Fail and EOF
MatchOk = 20000 # somewhere beyond the end of the tag table...
MatchFail = -20000 # somewhere beyond the start of the tag table...
ToEOF = -1 # good for cmd=Move
ThisTable = 999 # to recursively match using the current table;
# can be passed as argument to Table and SubTable
# instead of a tuple
#
# commands and flags passed in cmd (see below)
#
# note: I might add some further commands to this list, if needed
# (the numbers will then probably change, but not the
# names)
#
# convention: a command "matches", if and only if it moves the
# current position at least one character; a command "reads"
# the characters, if they match ok
#
# notations:
#
# x refers to the current position in the string
# len refers to the string length or what the function tag() is told to
# believe it to be (i.e. the engine only looks at the slice text[x:len])
# text refers to the text string
# jne is the optional relative jump distance in case the command
# did not match, i.e. x before and after applying the command
# are the same (if not given the current table is considered
# not to match)
# je is the optional relative jump distance in case the command
# did match (it defaults to +1)
#
# commands
Fail = 0 # this will always fail (position remains unchanged)
Jump = 0 # jump to jne (position remains unchanged)
# match & read chars
AllIn = 11 # all chars in match (at least one)
AllNotIn = 12 # all chars not in match (at least one)
Is = 13 # current char must be == match (matches one char)
IsIn = 14 # current char must be in match (matches one char)
IsNot = 15 # current char must be != match (matches one char)
           # NOTE: IsNot and IsNotIn share the command code 15
IsNotIn = 15 # current char must not be in match (matches one char)
# The two commands below are used in this package with precompiled
# character sets as match (e.g. whitespace_set or set('\r\n',0));
# presumably they are the set based counterparts of AllIn and IsIn --
# TODO confirm against mxTextTools/mxte.h
AllInSet = 31
IsInSet = 32
# match & read for whole words
Word = 21 # the next chars must be those in match
WordStart = 22 # all chars up to the first occ. of match (at least one)
WordEnd = 23 # same as WordStart, except that the text pointer
             # is moved behind the match
NoWord = WordStart # all chars up to the first occ. of match (at least one)
# match using search objects BMS or FS
sWordStart = 111 # all chars up to the first occ. of match (may be 0 chars)
sWordEnd = 112 # same as WordStart, except that the text pointer
               # is moved behind the match
sFindWord = 113 # find match and process the found slice only (ignoring
                # the chars that lead up to the match); positions
                # the text pointer right after the match like WordEnd
# functions & tables
Call = 201 # call match(text,x,len) as function (see above)
CallArg = 202 # match has to be a 2-tuple (fct,arg), then
# fct(text,x,len,arg) is called; the return value is taken
# as new x; it is considered matching if the new x is
# different than the x before the call -- like always
# (note: arg has to be *one* object, e.g. a tuple)
Table = 203 # match using table (given in match)
SubTable = 207 # match using sub table (given in match); the sub table
# uses the same taglist as the calling table
TableInList = 204 # same as Table, but match is a tuple (list,index)
# and the table list[index] is used as matching
# table
SubTableInList = 208
# same as TableInList, but the sub table
# uses the same taglist as the calling table
# specials
EOF = 1 # current position must be EOF, e.g. >= len(string)
Skip = 2 # skip match (must be an integer) chars; note: this cmd
# always matches ok, so jne doesn't have any meaning in
# this context
Move = 3 # move the current text position to match (if negative,
# the text length + 1 (!) is added, thus -1 moves to the
# EOF, -2 to the last char and so on); note: this cmd
# always matches ok, so jne doesn't have any meaning in
# this context
# loops
Loop = 205 # loop-construct
#
# (tagobj,Loop,Count,jne,je) - sets/decrements the
# loop variable for current table according to the
# following rules:
# 1. the first time the engine passes this entry
# sets the loop variable to Count and continues
# without reading any character, but saving the
# current position in text
# 2. the next time, it decrements the loop variable
# and checks if it is < 0:
# (a) if it is, then the tagobj is added to the
# taglist with the slice (saved position, current
# position) and processing continues at entry
# current + jne
# (b) else, processing continues at entry current + je
# Note: if you jump out of the loop while the loop
# variable is still > 0, then you *must*
# reset the loop mechanism with
# (None,LoopControl,Reset)
# Note: you can skip the remaining loops by calling
# (None,LoopControl,Break) and jumping back
# to the Loop-entry; this sets the loop
# variable to 0
# Note: tables cannot have nested loops within their
# context; you can have nested loops in nested
# tables though (there is one loop var per
# tag()-call which takes place every time
# a table match is done)
#
LoopControl = 206 # controls the loop variable (always succeeds, i.e.
# jne has no meaning);
# match may be one of:
Break = 0 # * sets the loop variable to 0, thereby allowing
# to skip the remaining loops
Reset = -1 # * resets the loop mechanism (see note above)
#
# See tagLoop.py for some examples.
##########################################################################
#
# Flags (to be '+'ed with the above command code)
#
CallTag = 256 # call tagobj(taglist,text,l,r,subtags)
# upon successfully matching the slice [l:r] in text
# * taglist is the current list of tags found (may be None)
# * subtags is a sub-list, passed when a subtable was used
# to do the matching -- it is None otherwise !)
#
# example entry with CallTag-flag set:
#
# (found_a_tag,CallTag+Table,tagtable)
# -- if tagtable matches the current text position,
# found_a_tag(taglist,text,l,r,newtaglist) is called and
# the match is *not* appended to the taglist by the tagging
# engine (the function would have to do this, in case it is needed)
AppendToTagobj = 512 # this appends the slice found to the tagobj, assuming
# that it is a Python list:
# does a tagobj.append((None,l,r,subtags)) call
# Alias for b/w comp.
AppendToTag = AppendToTagobj
AppendTagobj = 1024 # don't append (tagobj,l,r,subtags) to the taglist,
# but only tagobj itself; the information in l,r,subtags
# is lost, yet this can be used to write tag tables
# whose output can be used directly by tag.join()
AppendMatch = 2048 # append the match to the taglist instead of
# the tag object; this produces non-standard
# taglists !
#########################################################################
# 3. Matching Functions
#
# syntax:
#
# fct(s,x,len_s)
# where s = string we are working on
# x = current index in s where we want to match something
# len_s = 'length' of s, this is how far the search may be
# conducted in s, not necessarily the true length of s
#
# * the function has to return the index of the char right after
# matched string, e.g.
#
# 'xyzabc' ---> 'xyz' matches ---> return x+3
#
# * if the string doesn't match simply return x; in other words:
# the function has to return the matching slice's right index
# * you can use this to match e.g. 10 characters of a certain kind,
# or any word out of a given list, etc.
# * note: you cannot give the function additional parameters from within
# the matching table, so it has to know everything it needs to
# know a priori; use dynamic programming !
#
# some examples (not needed, since all are implemented by commands)
#
#
#def matchword(x):
# s = """
#def a(s,x,len_text):
# y = x+%i
# if s[x:y] == %s: return y
# return x
#"""
# exec s % (len(x),repr(x))
# return a
#
#def rejectword(x):
# s = """
#def a(s,x,len_text):
# while x < len(s) and s[x:x+%i] != %s:
# x = x + 1
# return x
#"""
# exec s % (len(x),repr(x))
# return a
#
#def HTML_Comment(s,x,len_text):
# while x < len_text and s[x:x+3] != '-->':
# x = x + 1
# return x
#
#
#########################################################################
# 4. Callable tagobjects
#
# a sample callable tagobj:
#
#
#def test(taglist,text,l,r,newtaglist):
#
# print 'found',repr(text[l:r])[:40],(l,r)
#
#
#########################################################################
# 5. Calling the engine & Taglists
#
# The function
# tag(text,table,start=0,len_text=len(text),taglistinit=[])
# found in mxTextTools:
#
# This function does all the matching according to the above rules.
# You give it a text string and a tag table and it will
# start processing the string starting from 'start' (which defaults to 0)
# and continue working until it reaches the 'EOF', i.e. len_text (which
# defaults to the text length). It thus tags the slice text[start:len_text].
#
# The function will create a list of found tags in the following
# format (which I call taglist):
#
# (tagobj,l,r,subtaglist)
#
# where: tagobj = specified tag object taken from the table
# [l:r] = slice that matched the tag in text
# subtaglist = if matching was done using a subtable
# this is the taglist it produced; in all other
# cases this will be None
#
# * if you pass None as taglistinit, then no taglist will be created,
# i.e. only CallTag commands will have any effect. (This saves
# temporary memory for big files)
# * the function returns a tuple:
# (success, taglist, nextindex)
# where: success = 0/1
# taglist = the produced list or None
# nextindex = the index+1 of the last char that matched
# (in case of failure, this points to the beginning
# of the substring that caused the problem)
#
### Module init.
def _module_init():
    """ Build the module global id2cmd, a dictionary mapping the integer
        value of every integer valued module global to its name.

        The value 0 is always mapped to 'Fail/Jump'. When several names
        share one value, the name that is visited last wins.
    """
    global id2cmd
    id2cmd = {}
    IntType = type(0)
    # Work on a snapshot of the module namespace
    for name,code in list(globals().items()):
        if type(code) != IntType:
            continue
        if code == 0:
            id2cmd[0] = 'Fail/Jump'
        else:
            id2cmd[code] = name
_module_init()

View File

@@ -0,0 +1 @@

View File

@@ -0,0 +1,766 @@
""" mxTextTools - A tools package for fast text processing.
(c) Copyright Marc-Andre Lemburg; All Rights Reserved.
See the documentation for further information on copyrights,
or contact the author (mal@lemburg.com).
"""
import string,types
#
# import the C module and the version number
#
from mxTextTools import *
from mxTextTools import __version__
#
# import the symbols needed to write tag tables
#
from Constants.TagTables import *
#
# import some handy character sets
#
from Constants.Sets import *
#
# format and print tables, taglists and joinlists:
#
def format_entry(table,i,
                 TupleType=types.TupleType):
    """ Returns a pp-formatted tag table entry as string

        table is the tag table and i the index of the entry to format.
        The result has the form 'tagobj : command : match : jne : je';
        the reprs of the tag object and of the match object are
        truncated to keep the line short.
    """
    e = table[i]
    t,c,m = e[:3]
    # Optional jump offsets; 0 and +1 are shown when they are not given
    jne = 0
    je = 1
    if len(e)>3: jne = e[3]
    if len(e)>4: je = e[4]
    # The command code lives in the low byte; the flag bits above it
    # are not displayed
    cmd = c % 256
    c = id2cmd[cmd]
    if type(m) == TupleType and c in ('Table','SubTable'):
        m = '<table>'
    elif m is None:
        # identity test: m may be an arbitrary object with its own
        # comparison semantics
        m = 'Here/To'
    else:
        m = repr(m)
        if len(m) > 17:
            m = m[:17]+'...'
    return '%-15.15s : %-30s : jne=%+i : je=%+i' % \
           (repr(t),'%-.15s : %s'%(c,m),jne,je)
def format_table(table,i=-1):
    """ Returns the whole tag table pp-formatted as string, one line per
        entry; the entry with index i (if any) is marked with an arrow.
    """
    rows = []
    for j in range(len(table)):
        if j == i:
            marker = '--> '
        else:
            marker = ' '
        rows.append(marker+format_entry(table,j))
    return string.join(rows,'\n')+'\n'
def print_tagtable(table):
""" Print the tag table
"""
print format_table(table)
def print_tags(text,tags,indent=0):
""" Print the taglist tags for text using the given indent level
"""
for tag,l,r,subtags in tags:
tagname = repr(tag)
if len(tagname) > 20:
tagname = tagname[:20] + '...'
target = repr(text[l:r])
if len(target) > 60:
target = target[:60] + '...'
if subtags == None:
print ' '+indent*' |',tagname,': ',target,(l,r)
else:
print ' '+indent*' |',tagname,': ',target,(l,r)
print_tags(text,subtags,indent+1)
def print_joinlist(joins,indent=0,
StringType=types.StringType):
""" Print the joinlist joins using the given indent level
"""
for j in joins:
if type(j) == StringType:
text = repr(j)
if len(text) > 40:
text = text[:40] + '...'
print ' '+indent*' |',text,' (len = %i)' % len(j)
else:
text = j[0]
l,r = j[1:3]
text = repr(text[l:r])
if len(text) > 40:
text = text[:40] + '...'
print ' '+indent*' |',text,' (len = %i)' % (r-l),(l,r)
def normlist(jlist,
             StringType=types.StringType):
    """ Return a normalized copy of the joinlist.

        Every entry that is not a string is taken to be a tuple
        (text,l,r,...) and is replaced by the slice text[l:r], so the
        resulting list consists of strings only.
    """
    l = []
    for entry in jlist:
        if type(entry) != StringType:
            entry = entry[0][entry[1]:entry[2]]
        l.append(entry)
    return l
#
# aid for matching from a list of words
#
def _lookup_dict(l,index=0):
    """ Group the words in l by their character at position index.

        Returns a dictionary mapping each such character to the list of
        words having it, in their original order. Every word must be
        longer than index characters, otherwise an IndexError is raised.
    """
    d = {}
    for w in l:
        # setdefault replaces the deprecated has_key() test
        d.setdefault(w[index],[]).append(w)
    return d
def word_in_list(l):
    """ Creates a lookup table (a tag table) that matches the words in l

        The words are grouped by their first character. Each group
        starts with an Is entry on that character whose jne offset
        skips the whole group, followed by one Word entry per word and
        a closing Fail entry.
    """
    d = _lookup_dict(l)
    first_chars = d.keys()
    t = []
    if len(first_chars) < 18: # somewhat arbitrary bound
        # fast hint for small sets: check the first character, then
        # step back so that the groups see it again
        t.append((None,IsIn,string.join(first_chars,'')))
        t.append((None,Skip,-1))
    # test groups
    for c, group in d.items():
        # a group occupies len(group)+2 entries (hint, words, Fail), so
        # this is the distance from the hint to the next group
        t.append((None,Is,c,len(group)+2))
        for w in group:
            t.append((None,Word,w[1:],+1,MatchOk))
        t.append((None,Fail,Here))
    t.append((None,Fail,Here))
    return tuple(t)
#
# Extra stuff useful in combination with the C functions
#
def replace(text,what,with,start=0,stop=None,
SearchObject=BMS,join=join,joinlist=joinlist,tag=tag,
string_replace=string.replace,type=type,
StringType=types.StringType):
"""A fast replacement for string.replace.
what can be given as string or search object.
This function is a good example for the AppendTagobj-flag usage
(the taglist can be used directly as joinlist).
"""
if type(what) == StringType:
so = SearchObject(what)
else:
so = what
what = so.match
if stop is None:
if start == 0 and len(what) < 2:
return string_replace(text,what,with)
stop = len(text)
t = ((text,sWordStart,so,+2),
# Found something, replace and continue searching
(with,Skip+AppendTagobj,len(what),-1,-1),
# Rest of text
(text,Move,ToEOF)
)
found,taglist,last = tag(text,t,start,stop)
if not found:
return text
return join(taglist)
# Alternative (usually slower) versions using different techniques:
def _replace2(text,what,with,start=0,stop=None,
join=join,joinlist=joinlist,tag=tag,
StringType=types.StringType,BMS=BMS):
"""Analogon to string.replace; returns a string with all occurences
of what in text[start:stop] replaced by with
- uses a one entry tag-table and a Boyer-Moore-Search-object
- what can be a string or a BMS/FS search object
- it's faster than string.replace in those cases, where
the what-string gets long and/or many replacements are found;
faster meaning from a few percent up to many times as fast
- start and stop define the slice of text to work in
- stop defaults to len(text)
"""
if stop is None:
stop = len(text)
if type(what) == StringType:
what=BMS(what)
t = ((with,sFindWord,what,+1,+0),)
found,taglist,last = tag(text,t,start,stop)
if not found:
return text
return join(joinlist(text,taglist))
def _replace3(text,what,with,
join=string.join,FS=FS,
StringType=types.StringType):
if type(what) == StringType:
what=FS(what)
slices = what.findall(text)
if not slices:
return text
l = []
x = 0
for left,right in slices:
l.append(text[x:left] + with)
x = right
l.append(text[x:])
return join(l,'')
def _replace4(text,what,with,
join=join,joinlist=joinlist,tag=tag,FS=FS,
StringType=types.StringType):
if type(what) == StringType:
what=FS(what)
slices = what.findall(text)
if not slices:
return text
repl = [None]*len(slices)
for i in range(len(slices)):
repl[i] = (with,)+slices[i]
return join(joinlist(text,repl))
def find(text,what,start=0,stop=None,
         SearchObject=FS):
    """ A faster replacement for string.find().

        Uses a search object for the task. Returns the position of the
        first occurrence of what in text[start:stop]. stop defaults to
        len(text). Returns -1 in case no occurrence was found.
    """
    so = SearchObject(what)
    # Only a missing stop means "search to the end"; stop=0 is a valid
    # (empty) slice end and must be passed on
    if stop is None:
        return so.find(text,start)
    return so.find(text,start,stop)
def findall(text,what,start=0,stop=None,
            SearchObject=FS):
    """ Find all occurrences of what in text.

        Uses a search object for the task. Returns a list of slice
        tuples (l,r) marking all the occurrences in
        text[start:stop]. stop defaults to len(text). Returns an
        empty list in case no occurrence was found.
    """
    so = SearchObject(what)
    # Only a missing stop means "search to the end"; stop=0 is a valid
    # (empty) slice end and must be passed on
    if stop is None:
        return so.findall(text,start)
    return so.findall(text,start,stop)
def split(text,sep,start=0,stop=None,translate=None,
          SearchObject=FS):
    """ A faster replacement for string.split().

        Uses a search object for the task. Returns the result of
        cutting the text[start:stop] string into snippets at every sep
        occurrence in form of a list of substrings. translate is passed
        to the search object as translation string.

        NOTE(review): only the search is restricted to text[start:stop];
        the first snippet still starts at index 0 and the last one runs
        to the end of text -- confirm whether that is intended.

        XXX convert to a C function... or even better, add as method
        to search objects.
    """
    if translate:
        so = SearchObject(sep,translate)
    else:
        so = SearchObject(sep)
    # Only a missing stop means "search to the end"; stop=0 is a valid
    # (empty) slice end and must be passed on
    if stop is None:
        cuts = so.findall(text,start)
    else:
        cuts = so.findall(text,start,stop)
    l = 0
    snippets = []
    append = snippets.append
    for left,right in cuts:
        append(text[l:left])
        l = right
    append(text[l:])
    return snippets
# helper for tagdict
def _tagdict(text,dict,prefix,taglist):
    """ Store the matched slice of every taglist entry in dict.

        The key of an entry is prefix + str(tagobj); entries of a
        sub-taglist are stored recursively under the key of their parent
        followed by a dot.
    """
    for entry in taglist:
        o,l,r,s = entry
        key = prefix + str(o)
        dict[key] = text[l:r]
        if not s:
            continue
        _tagdict(text,dict,key+'.',s)
def tagdict(text,*args):
    """ Tag a text just like the function tag() and then convert
        its output into a dictionary where the tagobjects reference
        their respective strings

        - this function emulates the interface of tag()
        - in contrast to tag() this function *does* make copies
          of the found strings
        - entries of sub-taglists are stored under the key of their
          parent entry followed by a dot and their own key
        - returns a tuple (rc,tagdict,next) with the same meaning
          of rc and next as tag(); tagdict is the new dictionary -
          None in case rc is 0
    """
    rc,taglist,next = apply(tag,(text,)+args)
    if not rc:
        return (rc,None,next)
    d = {}
    # The helper handles the top level and all sub-taglists alike; the
    # previous inline loop passed the builtin dict type instead of d
    # when descending into sub-taglists
    _tagdict(text,d,'',taglist)
    return (rc,d,next)
def invset(chars):
    """ Return a set with all characters *except* the ones in chars.

        Uses the set() function of this package with 0 as its second
        argument.
    """
    inverted = set(chars,0)
    return inverted
def is_whitespace(text,start=0,stop=None,
                  nonwhitespace=nonwhitespace_set,setfind=setfind):
    """ Return 1 iff text[start:stop] only contains whitespace
        characters (as defined in Constants/Sets.py), 0 otherwise.

        stop defaults to len(text).
    """
    if stop is None:
        stop = len(text)
    # a negative index means that no non-whitespace character was found
    return setfind(text,nonwhitespace,start,stop) < 0
def collapse(text,seperator=' ',
             join=join,setsplit=setsplit,collapse_set=set(newline+whitespace)):
    """ Eliminates newline characters and compresses whitespace
        characters into one space (or whatever seperator is given).

        The result is a one line text string. Tim Peters will like
        this function called with '-' seperator ;-)
    """
    pieces = setsplit(text,collapse_set)
    return join(pieces,seperator)
# Tag table used by splitlines(); every pass through the first three
# entries handles one line
_linesplit_table = (
    # optional '\r' of a line end; continue with the next entry either way
    (None,Is,'\r',+1),
    # optional '\n' of a line end; continue with the next entry either way
    (None,Is,'\n',+1),
    # a run of characters from set('\r\n',0): append the matched text
    # and jump back to the first entry; without a match go on below
    ('line',AllInSet+AppendMatch,set('\r\n',0),+1,-2),
    # at EOF jump to MatchOk, otherwise go on below
    (None,EOF,Here,+1,MatchOk),
    # nothing was read although this is not EOF: append an empty match
    # and jump back to the first entry
    ('empty line',Skip+AppendMatch,0,0,-4),
    )
def splitlines(text,
               tag=tag,linesplit_table=_linesplit_table):
    """ Split text into a list of single lines.

        The following combinations are considered to be line-ends:
        '\r', '\r\n', '\n'; they may be used in any combination. The
        line-end indicators are removed from the strings prior to
        adding them to the list.

        This function allows dealing with text files from Macs, PCs
        and Unix origins in a portable way.
    """
    result = tag(text,linesplit_table)
    # the taglist is the second item of the result tuple
    return result[1]
# Tag table used by countlines(); same layout as _linesplit_table, but
# with the AppendTagobj flag in place of AppendMatch
_linecount_table = (
    # optional '\r' of a line end; continue with the next entry either way
    (None,Is,'\r',+1),
    # optional '\n' of a line end; continue with the next entry either way
    (None,Is,'\n',+1),
    # a run of characters from set('\r\n',0): append the tag object
    # and jump back to the first entry; without a match go on below
    ('line',AllInSet+AppendTagobj,set('\r\n',0),+1,-2),
    # at EOF jump to MatchOk, otherwise go on below
    (None,EOF,Here,+1,MatchOk),
    # nothing was read although this is not EOF: append the tag object
    # for an empty line and jump back to the first entry
    ('empty line',Skip+AppendTagobj,0,0,-4),
    )
def countlines(text,
               linecount_table=_linecount_table):
    """ Returns the number of lines in text.

        Line ends are treated just like for splitlines() in a
        portable way.
    """
    result = tag(text,linecount_table)
    # the taglist (second item) holds one entry per line
    return len(result[1])
# Tag table that collects runs of non-whitespace characters; note that
# splitwords() below uses setsplit() and not this table
_wordsplit_table = (
    # optional run of whitespace; continue with the next entry either way
    (None,AllInSet,whitespace_set,+1),
    # a run of non-whitespace: append the matched text and jump back to
    # the first entry; without a match go on below
    ('word',AllInSet+AppendMatch,nonwhitespace_set,+1,-1),
    # at EOF jump to MatchOk
    (None,EOF,Here,+1,MatchOk),
    )
def splitwords(text,
               setsplit=setsplit,whitespace_set=whitespace_set):
    """ Split text into a list of single words.

        Words are separated by whitespace. The whitespace is stripped
        before adding the words to the list.
    """
    words = setsplit(text,whitespace_set)
    return words
#
# Testing and benchmarking
#
# Taken from my hack.py module:
import time
class _timer:
    """ timer class with a quite obvious interface

        - .start() starts a fairly accurate CPU-time timer plus an
          absolute timer
        - .stop() stops the timer and returns a tuple: the CPU-time in
          seconds and the absolute time elapsed since .start() was called
        - .usertime() and .abstime() stop the timer just like .stop(),
          but only return the CPU-time resp. the absolute time
        - str(timer) formats the two values that were measured last

        Stopping overwrites the stored start values, so call .start()
        again before taking another measurement.
    """
    utime = 0
    atime = 0
    # time.clock() is the CPU timer this class was written for; it is
    # missing from Python 3.8 and later, where time.process_time() is
    # used instead
    _clock = getattr(time,'clock',None) or time.process_time

    def start(self,
              clock=_clock,time=time.time):
        self.atime = time()
        self.utime = clock()

    def stop(self,
             clock=_clock,time=time.time):
        self.utime = clock() - self.utime
        self.atime = time() - self.atime
        return self.utime,self.atime

    def usertime(self,
                 clock=_clock,time=time.time):
        return self.stop(clock,time)[0]

    def abstime(self,
                clock=_clock,time=time.time):
        # used to return the CPU time by mistake
        return self.stop(clock,time)[1]

    def __str__(self):
        return '%0.2fu %0.2fa sec.' % (self.utime,self.atime)
def _bench(file='mxTextTools/mxTextTools.c'):
def mismatch(orig,new):
print
for i in range(len(orig)):
if orig[i] != new[i]:
break
else:
print 'Length mismatch: orig=%i new=%i' % (len(orig),len(new))
if len(orig) > len(new):
print 'Missing chars:'+repr(orig[len(new):])
else:
print 'Excess chars:'+repr(new[len(orig):])
print
return
print 'Mismatch at offset %i:' % i
print (orig[i-100:i]
+ '<- %s != %s ->' % (repr(orig[i]),repr(new[i]))
+ orig[i+1:i+100])
print
text = open(file).read()
import string
t = _timer()
print 'Working on a %i byte string' % len(text)
if 0:
print
print 'Replacing strings'
print '-'*72
print
for what,with in (('m','M'),('mx','MX'),('mxText','MXTEXT'),
('hmm','HMM'),('hmmm','HMM'),('hmhmm','HMM')):
print 'Replace "%s" with "%s"' % (what,with)
t.start()
for i in range(100):
rtext = string.replace(text,what,with)
print 'with string.replace:',t.stop(),'sec.'
t.start()
for i in range(100):
ttext = replace(text,what,with)
print 'with tag.replace:',t.stop(),'sec.'
if ttext != rtext:
print 'results are NOT ok !'
print '-'*72
mismatch(rtext,ttext)
t.start()
for i in range(100):
ttext = _replace2(text,what,with)
print 'with tag._replace2:',t.stop(),'sec.'
if ttext != rtext:
print 'results are NOT ok !'
print '-'*72
print rtext
t.start()
for i in range(100):
ttext = _replace3(text,what,with)
print 'with tag._replace3:',t.stop(),'sec.'
if ttext != rtext:
print 'results are NOT ok !'
print '-'*72
print rtext
t.start()
for i in range(100):
ttext = _replace4(text,what,with)
print 'with tag._replace4:',t.stop(),'sec.'
if ttext != rtext:
print 'results are NOT ok !'
print '-'*72
print rtext
print
if 0:
print
print 'String lower/upper'
print '-'*72
print
op = string.lower
t.start()
for i in range(1000):
op(text)
t.stop()
print ' string.lower:',t
op = string.upper
t.start()
for i in range(1000):
op(text)
t.stop()
print ' string.upper:',t
op = upper
t.start()
for i in range(1000):
op(text)
t.stop()
print ' TextTools.upper:',t
op = lower
t.start()
for i in range(1000):
op(text)
t.stop()
print ' TextTools.lower:',t
print 'Testing...',
ltext = string.lower(text)
assert ltext == lower(text)
utext = string.upper(text)
assert utext == upper(text)
print 'ok.'
if 0:
print
print 'Joining lists'
print '-'*72
print
l = setsplit(text,whitespace_set)
op = string.join
t.start()
for i in range(1000):
op(l)
t.stop()
print ' string.join:',t
op = join
t.start()
for i in range(1000):
op(l)
t.stop()
print ' TextTools.join:',t
op = string.join
t.start()
for i in range(1000):
op(l,' ')
t.stop()
print ' string.join with seperator:',t
op = join
t.start()
for i in range(1000):
op(l,' ')
t.stop()
print ' TextTools.join with seperator:',t
if 0:
print
print 'Creating join lists'
print '-'*72
print
repl = []
for i in range(0,len(text),10):
repl.append(str(i),i,i+1)
op = joinlist
t.start()
for i in range(1000):
op(text,repl)
t.stop()
print ' TextTools.joinlist:',t
if 0:
print
print 'Splitting text'
print '-'*72
print
op = string.split
t.start()
for i in range(100):
op(text)
t.stop()
print ' string.split whitespace:',t,'(',len(op(text)),'snippets )'
op = setsplit
ws = whitespace_set
t.start()
for i in range(100):
op(text,ws)
t.stop()
print ' TextTools.setsplit whitespace:',t,'(',len(op(text,ws)),'snippets )'
assert string.split(text) == setsplit(text,ws)
op = string.split
sep = 'a'
t.start()
for i in range(100):
op(text,sep)
t.stop()
print ' string.split at "a":',t,'(',len(op(text,sep)),'snippets )'
op = split
sep = 'a'
t.start()
for i in range(100):
op(text,sep)
t.stop()
print ' TextTools.split at "a":',t,'(',len(op(text,sep)),'snippets )'
op = charsplit
sep = 'a'
t.start()
for i in range(100):
op(text,sep)
t.stop()
print ' TextTools.charsplit at "a":',t,'(',len(op(text,sep)),'snippets )'
op = setsplit
sep = set('a')
t.start()
for i in range(100):
op(text,sep)
t.stop()
print ' TextTools.setsplit at "a":',t,'(',len(op(text,sep)),'snippets )'
# Note: string.split and setsplit don't work identically !
op = string.split
sep = 'int'
t.start()
for i in range(100):
op(text,sep)
t.stop()
print ' string.split at "int":',t,'(',len(op(text,sep)),'snippets )'
op = split
sep = 'int'
t.start()
for i in range(100):
op(text,sep)
t.stop()
print ' TextTools.split at "int":',t,'(',len(op(text,sep)),'snippets )'
op = setsplit
sep = set('int')
t.start()
for i in range(100):
op(text,sep)
t.stop()
print ' TextTools.setsplit at "i", "n", "t":',t,'(',len(op(text,sep)),'snippets )'
op = string.split
sep = 'register'
t.start()
for i in range(100):
op(text,sep)
t.stop()
print ' string.split at "register":',t,'(',len(op(text,sep)),'snippets )'
op = split
sep = 'register'
t.start()
for i in range(100):
op(text,sep)
t.stop()
print ' TextTools.split at "register":',t,'(',len(op(text,sep)),'snippets )'
if __name__=='__main__':
_bench()

View File

@@ -0,0 +1,48 @@
""" mxTextTools - A tools package for fast text processing.
(c) Copyright Marc-Andre Lemburg; All Rights Reserved.
See the documentation for further information on copyrights,
or contact the author (mal@lemburg.com).
"""
__package_info__ = """
BEGIN PYTHON-PACKAGE-INFO 1.0
Title: mxTextTools - Tools for fast text processing
Current-Version: 1.1.1
Home-Page: http://starship.skyport.net/~lemburg/mxTextTools.html
Primary-Site: http://starship.skyport.net/~lemburg/mxTextTools-1.1.1.zip
This package provides several different functions and mechanisms
to do fast text text processing. Amongst these are character set
operations, parsing & tagging tools (using a finite state machine
executing byte code) and common things such as Boyer-Moore search
objects. For full documentation see the home page.
END PYTHON-PACKAGE-INFO
"""
from TextTools import *
from TextTools import __version__
### Make the types pickleable:
# Shortcuts for pickle (reduces the pickle's length)
def _BMS(match,translate):
    """ Constructor used in pickles: returns BMS(match,translate)
    """
    searcher = BMS(match,translate)
    return searcher
def _FS(match,translate):
    """ Constructor used in pickles: returns FS(match,translate)
    """
    searcher = FS(match,translate)
    return searcher
# Module init
#
# The class statement only serves as a scratch namespace: its body is
# executed once at import time and the class object is deleted right
# afterwards, so that none of the helper names stay in the module.
class modinit:
    ### Register the two types
    import copy_reg
    # Reduction functions: return the constructor shortcut together with
    # the arguments needed to rebuild the search object
    def pickle_BMS(so):
        return _BMS,(so.match,so.translate)
    def pickle_FS(so):
        return _FS,(so.match,so.translate)
    # copy_reg.pickle(type, reduction function, constructor)
    copy_reg.pickle(BMSType,
                    pickle_BMS,
                    _BMS)
    copy_reg.pickle(FSType,
                    pickle_FS,
                    _FS)
del modinit

View File

@@ -0,0 +1,17 @@
""" mxTextTools - A tools package for fast text processing.
(c) Copyright Marc-Andre Lemburg; All Rights Reserved.
See the documentation for further information on copyrights,
or contact the author (mal@lemburg.com).
"""
from mxTextTools import *
from mxTextTools import __version__
#
# Make BMS take the role of FS in case the Fast Search object was not built
#
try:
    # Evaluating the bare name raises NameError if the star import
    # above did not provide FS
    FS
except NameError:
    FS = BMS
    FSType = BMSType