Initial revision

2002-10-12 11:37:38 +00:00
commit 12315f4d0e
1699 changed files with 444708 additions and 0 deletions
--- a/intern/python/modules/TextTools/Constants/Sets.py
+++ b/intern/python/modules/TextTools/Constants/Sets.py
@@ -0,0 +1,39 @@
+""" Constants for sets (of characters)
+
+    (c) Copyright Marc-Andre Lemburg; All Rights Reserved.
+    See the documentation for further information on copyrights,
+    or contact the author (mal@lemburg.com).
+"""
+import string
+
+# Simple character strings
+
+a2z = 'abcdefghijklmnopqrstuvwxyz'
+A2Z = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
+# German special characters, written as Latin-1 octal escapes so that
+# the values do not depend on the encoding this file is stored in;
+# these are exactly the non-ASCII code points whose bits are set in
+# german_alpha_set
+umlaute = '\344\366\374\337'	# a-umlaut, o-umlaut, u-umlaut, sharp s
+Umlaute = '\304\326\334'	# A-umlaut, O-umlaut, U-umlaut
+alpha = A2Z + a2z
+german_alpha = A2Z + a2z + umlaute + Umlaute
+number = '0123456789'
+alphanumeric = alpha + number
+white = ' \t\v'
+newline = '\r\n'
+formfeed = '\f'
+whitespace = white + newline + formfeed
+any = '\000\001\002\003\004\005\006\007\010\011\012\013\014\015\016\017\020\021\022\023\024\025\026\027\030\031\032\033\034\035\036\037 !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\177\200\201\202\203\204\205\206\207\210\211\212\213\214\215\216\217\220\221\222\223\224\225\226\227\230\231\232\233\234\235\236\237\240\241\242\243\244\245\246\247\250\251\252\253\254\255\256\257\260\261\262\263\264\265\266\267\270\271\272\273\274\275\276\277\300\301\302\303\304\305\306\307\310\311\312\313\314\315\316\317\320\321\322\323\324\325\326\327\330\331\332\333\334\335\336\337\340\341\342\343\344\345\346\347\350\351\352\353\354\355\356\357\360\361\362\363\364\365\366\367\370\371\372\373\374\375\376\377'
+
+# Precompiled as sets, e.g. a2z_set = set(a2z)
+a2z_set = '\000\000\000\000\000\000\000\000\000\000\000\000\376\377\377\007\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000'
+A2Z_set = '\000\000\000\000\000\000\000\000\376\377\377\007\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000'
+alpha_set = '\000\000\000\000\000\000\000\000\376\377\377\007\376\377\377\007\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000'
+german_alpha_set = '\000\000\000\000\000\000\000\000\376\377\377\007\376\377\377\007\000\000\000\000\000\000\000\000\020\000@\220\020\000@\020'
+number_set = '\000\000\000\000\000\000\377\003\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000'
+alphanumeric_set = '\000\000\000\000\000\000\377\003\376\377\377\007\376\377\377\007\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000'
+white_set = '\000\002\000\000\001\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000'
+newline_set = '\000$\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000'
+whitespace_set = '\000&\000\000\001\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000'
+nonwhitespace_set = '\377\301\377\377\376\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377'
+any_set = '\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377'
+
+# Clean up
+del string
--- a/intern/python/modules/TextTools/Constants/TagTables.py
+++ b/intern/python/modules/TextTools/Constants/TagTables.py
@@ -0,0 +1,348 @@
+""" Constants for writing tag tables
+
+    The documentation in this file is obsoleted by the HTML docs in
+    the Doc/ subdirectory of the package. Constants defined here must
+    match those in mxTextTools/mxte.h.
+
+    (c) Copyright Marc-Andre Lemburg; All Rights Reserved.
+    See the documentation for further information on copyrights,
+    or contact the author (mal@lemburg.com).
+"""
+#########################################################################
+# This file contains the definitions and constants used by the tagging
+# engine:
+#
+# 1. Matching Tables
+# 2. Commands & Constants
+# 3. Matching Functions
+# 4. Callable tagobjects
+# 5. Calling the engine & Taglists
+#
+
+#########################################################################
+# 1. Matching Tables:
+#
+# these are tuples of tuples, each entry having the following meaning:
+#
+# syntax: (tag, cmd, chars|table|fct [,jne] [,je=1])
+#          tag = object used to mark this section, if it matches
+#          cmd = command (see below)
+#          chars = match one or more of these characters
+#          table = table to use for matching characters
+#          fct = function to call (see below)
+#          jne = if the current character doesn't match, jump this
+#                many table entries relative to the current entry
+#          je = if we have a match make a relative jump of this length
+#
+# * a table matches a string iff the end of the table is reached
+#   (that is: an index is requested that is beyond the end-of-table)
+# * a table is not matched if a tag is not matched and no jne is given;
+#   if it is matched then processing simply moves on to the next entry
+# * marking is done by adding the matching slice in the string
+#   together with the marking object to the tag list; if the object is
+#   None, then it will not be appended to the taglist list
+# * if the flag CallTag is set in cmd, then instead of appending
+#   matches to the taglist, the tagobj will be called (see below) 
+#
+# TIP: if you are getting an error 'call of a non-function' while
+#      writing a table definition, you probably have a missing ','
+#      somewhere in the tuple !
+#
+# For examples see the tag*.py - files that came with this engine.
+#
+
+#########################################################################
+# 2. Commands & Constants
+# 
+#
+
+#
+# some useful constants for writing matching tables
+#
+
+To = None		# good for cmd=Jump
+Here = None		# good for cmd=Fail and EOF
+MatchOk = 20000		# somewhere beyond the end of the tag table...
+MatchFail = -20000	# somewhere beyond the start of the tag table...
+ToEOF = -1		# good for cmd=Move
+
+ThisTable = 999		# to recursively match using the current table;
+			# can be passed as argument to Table and SubTable
+			# instead of a tuple
+
+#
+# commands and flags passed in cmd (see below)
+#
+# note: I might add some further commands to this list, if needed
+#       (the numbers will then probably change, but not the
+#        names)
+#
+# convention: a command "matches", if and only if it moves the
+#       current position at least one character; a command "reads"
+#       characters if they match ok
+#
+# notations:
+#
+#  x    refers to the current position in the string
+#  len  refers to the string length or what the function tag() is told to
+#       believe it to be (i.e. the engine only looks at the slice text[x:len])
+#  text refers to the text string
+#  jne  is the optional relative jump distance in case the command
+#       did not match, i.e. x before and after applying the command
+#       are the same (if not given the current table is considered
+#       not to match)
+#  je   is the optional relative jump distance in case the command
+#       did match (it defaults to +1)
+#
+
+# commands
+Fail = 0           # this will always fail (position remains unchanged)
+Jump = 0           # jump to jne (position remains unchanged)
+
+# match & read chars
+AllIn = 11         # all chars in match (at least one)
+AllNotIn = 12      # all chars not in match (at least one)
+Is = 13            # current char must be == match (matches one char)
+IsIn = 14          # current char must be in match (matches one char)
+IsNot = 15         # current char must be != match (matches one char)
+                   # NOTE(review): shares code 15 with IsNotIn -- looks
+                   # like an intentional alias; confirm against mxte.h
+IsNotIn = 15       # current char must not be in match (matches one char)
+
+# NOTE(review): presumably the counterparts of AllIn and IsIn taking a
+# precompiled character set (see Constants/Sets.py) as match -- confirm
+# against mxte.h
+AllInSet = 31
+IsInSet = 32
+
+# match & read for whole words
+Word = 21          # the next chars must be those in match
+WordStart = 22	   # all chars up to the first occ. of match (at least one)
+WordEnd = 23	   # same as WordStart, except that the text pointer
+                   # is moved behind the match
+NoWord = WordStart # all chars up to the first occ. of match (at least one)
+
+
+# match using search objects BMS or FS
+sWordStart = 111   # all chars up to the first occ. of match (may be 0 chars)
+sWordEnd = 112	   # same as sWordStart, except that the text pointer
+                   # is moved behind the match
+sFindWord = 113    # find match and process the found slice only (ignoring
+		   # the chars that lead up to the match); positions
+		   # the text pointer right after the match like WordEnd
+
+# functions & tables
+Call = 201         # call match(text,x,len) as function (see above)
+CallArg = 202      # match has to be a 2-tuple (fct,arg), then
+                   # fct(text,x,len,arg) is called; the return value is taken
+		   # as new x; it is considered matching if the new x is
+		   # different than the x before the call -- like always
+		   # (note: arg has to be *one* object, e.g. a tuple)
+Table = 203        # match using table (given in match)
+SubTable = 207     # match using sub table (given in match); the sub table
+		   # uses the same taglist as the calling table
+TableInList = 204  # same as Table, but match is a tuple (list,index)
+                   # and the table list[index] is used as matching
+		   # table
+SubTableInList = 208
+                   # same as TableInList, but the sub table
+		   # uses the same taglist as the calling table
+
+# specials
+EOF = 1            # current position must be EOF, e.g. >= len(string)
+Skip = 2           # skip match (must be an integer) chars; note: this cmd
+                   # always matches ok, so jne doesn't have any meaning in
+		   # this context
+Move = 3	   # move the current text position to match (if negative,
+		   # the text length + 1 (!) is added, thus -1 moves to the
+		   # EOF, -2 to the last char and so on); note: this cmd
+                   # always matches ok, so jne doesn't have any meaning in
+		   # this context
+
+# loops
+Loop = 205         # loop-construct
+                   #               
+                   # (tagobj,Loop,Count,jne,je) - sets/decrements the
+		   # loop variable for current table according to the
+		   # following rules:
+		   # 1. the first time the engine passes this entry
+		   #    sets the loop variable to Count and continues
+		   #    without reading any character, but saving the
+		   #    current position in text
+		   # 2. the next time, it decrements the loop variable
+		   #    and checks if it is < 0:
+		   #    (a) if it is, then the tagobj is added to the
+		   #        taglist with the slice (saved position, current
+		   #        position) and processing continues at entry
+		   #        current + jne
+		   #    (b) else, processing continues at entry current + je
+		   # Note: if you jump out of the loop while the loop
+		   #       variable is still > 0, then you *must*
+		   #       reset the loop mechanism with 
+		   #       (None,LoopControl,Reset)
+		   # Note: you can skip the remaining loops by calling
+		   #       (None,LoopControl,Break) and jumping back
+		   #       to the Loop-entry; this sets the loop
+		   #       variable to 0
+		   # Note: tables cannot have nested loops within their
+		   #       context; you can have nested loops in nested
+		   #       tables though (there is one loop var per
+		   #       tag()-call which takes place every time
+		   #       a table match is done)
+		   #
+LoopControl = 206  # controls the loop variable (always succeeds, i.e.
+                   #                             jne has no meaning);
+                   # match may be one of:
+Break = 0          # * sets the loop variable to 0, thereby allowing
+                   #   to skip the remaining loops
+Reset = -1         # * resets the loop mechanism (see note above)
+                   #
+		   # See tagLoop.py for some examples.
+
+##########################################################################
+#
+# Flags (to be '+'ed with the above command code)
+#
+CallTag = 256      # call tagobj(taglist,text,l,r,subtags)
+		   # upon successfully matching the slice [l:r] in text
+		   # * taglist is the current list tags found (may be None)
+                   # * subtags is a sub-list, passed when a subtable was used
+                   #   to do the matching -- it is None otherwise !)
+#
+# example entry with CallTag-flag set:
+#
+# (found_a_tag,CallTag+Table,tagtable)
+#  -- if tagtable matches the current text position, 
+#     found_a_tag(taglist,text,l,r,newtaglist) is called and
+#     the match is *not* appended to the taglist by the tagging
+#     engine (the function would have to do this, in case it is needed)
+
+AppendToTagobj = 512  	# this appends the slice found to the tagobj, assuming
+                      	# that it is a Python list:
+		      	# does a tagobj.append((None,l,r,subtags)) call
+# Alias for b/w comp.
+AppendToTag = AppendToTagobj
+
+AppendTagobj = 1024   	# don't append (tagobj,l,r,subtags) to the taglist,
+			# but only tagobj itself; the information in l,r,subtags
+		   	# is lost, yet this can be used to write tag tables
+		   	# whose output can be used directly by tag.join()
+
+AppendMatch = 2048	# append the match to the taglist instead of
+			# the tag object; this produces non-standard
+			# taglists !
+
+#########################################################################
+# 3. Matching Functions
+#
+# syntax:
+#
+# fct(s,x,len_s)
+#          where s = string we are working on
+#                x = current index in s where we want to match something
+#                len_s = 'length' of s, this is how far the search may be
+#                    conducted in s, not necessarily the true length of s
+# 
+# * the function has to return the index of the char right after
+#   matched string, e.g.
+#
+#   'xyzabc' ---> 'xyz' matches ---> return x+3
+#
+# * if the string doesn't match simply return x; in other words:
+#   the function has to return the matching slice's right index
+# * you can use this to match e.g. 10 characters of a certain kind,
+#   or any word out of a given list, etc.
+# * note: you cannot give the function additional parameters from within
+#   the matching table, so it has to know everything it needs to
+#   know a priori; use dynamic programming !
+#
+# some examples (not needed, since all are implemented by commands)
+#
+#
+#def matchword(x):
+#    s = """
+#def a(s,x,len_text):
+#    y = x+%i
+#    if s[x:y] == %s: return y
+#    return x
+#"""
+#    exec s % (len(x),repr(x))
+#    return a
+#
+#def rejectword(x):
+#    s = """
+#def a(s,x,len_text):
+#    while x < len(s) and s[x:x+%i] != %s:
+#	x = x + 1
+#    return x
+#"""
+#    exec s % (len(x),repr(x))
+#    return a
+#
+#def HTML_Comment(s,x,len_text):
+#    while x < len_text and s[x:x+3] != '-->':
+#	x = x + 1
+#    return x
+#
+#
+
+#########################################################################
+# 4. Callable tagobjects
+#
+# a sample callable tagobj:
+#
+#
+#def test(taglist,text,l,r,newtaglist):
+#
+#    print 'found',repr(text[l:r])[:40],(l,r)
+#
+#
+
+#########################################################################
+# 5. Calling the engine & Taglists
+#
+# The function
+#      tag(text,table,start=0,len_text=len(text),taglistinit=[])
+# found in mxTextTools:
+#
+# This function does all the matching according to the above rules.
+# You give it a text string and a tag table and it will
+# start processing the string starting from 'start' (which defaults to 0)
+# and continue working until it reaches the 'EOF', i.e. len_text (which
+# defaults to the text length). It thus tags the slice text[start:len_text].
+#
+# The function will create a list of found tags in the following
+# format (which I call taglist):
+#
+#      (tagobj,l,r,subtaglist)
+#
+# where: tagobj = specified tag object taken from the table
+#        [l:r] = slice that matched the tag in text
+#        subtaglist = if matching was done using a subtable
+#                     this is the taglist it produced; in all other
+#                     cases this will be None
+#
+# * if you pass None as taglistinit, then no taglist will be created,
+#   i.e. only CallTag commands will have any effect. (This saves
+#   temporary memory for big files)
+# * the function returns a tuple:
+#      (success, taglist, nextindex)
+#   where: success = 0/1
+#          taglist = the produced list or None
+#          nextindex = the index+1 of the last char that matched
+#                    (in case of failure, this points to the beginning
+#                     of the substring that caused the problem)
+# 
+
+### Module init.
+
+def _module_init():
+
+    """ Build the module global id2cmd, a dictionary mapping the
+        integer constants defined in this module back to their names.
+
+        The value 0 is always mapped to 'Fail/Jump'.
+    """
+    global id2cmd
+
+    from types import IntType
+    id2cmd = {}
+    for name,code in globals().items():
+        if type(code) != IntType:
+            continue
+        if code == 0:
+            # several names share the value 0; use one fixed label
+            id2cmd[0] = 'Fail/Jump'
+        else:
+            id2cmd[code] = name
+
+_module_init()
--- a/intern/python/modules/TextTools/Constants/init.py
+++ b/intern/python/modules/TextTools/Constants/init.py
@@ -0,0 +1 @@
+ 
--- a/intern/python/modules/TextTools/TextTools.py
+++ b/intern/python/modules/TextTools/TextTools.py
@@ -0,0 +1,766 @@
+""" mxTextTools - A tools package for fast text processing.
+
+    (c) Copyright Marc-Andre Lemburg; All Rights Reserved.
+    See the documentation for further information on copyrights,
+    or contact the author (mal@lemburg.com).
+"""
+import string,types
+
+#
+# import the C module and the version number
+#
+from mxTextTools import *
+from mxTextTools import __version__
+
+#
+# import the symbols needed to write tag tables
+#
+from Constants.TagTables import *
+
+#
+# import the some handy character sets
+#
+from Constants.Sets import *
+
+#
+# format and print tables, taglists and joinlists:
+#
+def format_entry(table,i,
+
+                 TupleType=types.TupleType):
+
+    """ Return entry i of the tag table as one pp-formatted line.
+
+        The line shows the tag object, the command name, the match
+        argument and the jne/je jump offsets; offsets missing from
+        the entry are shown as jne=+0 and je=+1.
+    """
+    entry = table[i]
+    tagobj,code,match = entry[:3]
+    jne = 0
+    je = 1
+    if len(entry) > 3:
+        jne = entry[3]
+    if len(entry) > 4:
+        je = entry[4]
+    # the remainder modulo 256 selects the command name; the flags
+    # added on top of it are not shown
+    cmdname = id2cmd[divmod(code,256)[1]]
+    if type(match) == TupleType and cmdname in ('Table','SubTable'):
+        shown = '<table>'
+    elif match == None:
+        shown = 'Here/To'
+    else:
+        shown = repr(match)
+        if len(shown) > 17:
+            shown = shown[:17]+'...'
+    command = '%-.15s : %s' % (cmdname,shown)
+    return '%-15.15s : %-30s : jne=%+i : je=%+i' % \
+           (repr(tagobj),command,jne,je)
+
+def format_table(table,i=-1):
+
+    """ Return the tag table pp-formatted as string, one line per
+        entry.
+
+        The entry with index i (if any) is marked with an arrow.
+    """
+    lines = []
+    for j in range(len(table)):
+        if j == i:
+            marker = '--> '
+        else:
+            marker = '    '
+        lines.append(marker+format_entry(table,j))
+    return string.join(lines,'\n')+'\n'
+
+def print_tagtable(table):
+
+    """ Print the tag table in the pp-format produced by
+        format_table().
+    """
+    print format_table(table)
+
+def print_tags(text,tags,indent=0):
+
+    """ Print the taglist tags for text using the given indent level.
+
+        Every entry is shown with its tag object, the slice of text
+        it matched and the slice indices (l,r); long reprs are
+        truncated.  Subtag lists are printed recursively, one indent
+        level deeper.
+    """
+    prefix = ' '+indent*' |'
+    for tagobj,l,r,subtags in tags:
+        tagname = repr(tagobj)
+        if len(tagname) > 20:
+            tagname = tagname[:20] + '...'
+        target = repr(text[l:r])
+        if len(target) > 60:
+            target = target[:60] + '...'
+        print prefix,tagname,': ',target,(l,r)
+        if not (subtags == None):
+            print_tags(text,subtags,indent+1)
+
+def print_joinlist(joins,indent=0,
+
+                   StringType=types.StringType):
+
+    """ Print the joinlist joins using the given indent level.
+
+        Entries are either strings or tuples starting with
+        (text,l,r); for tuples the slice text[l:r] is shown together
+        with (l,r).  Long reprs are truncated.
+    """
+    prefix = ' '+indent*' |'
+    for j in joins:
+        if type(j) == StringType:
+            shown = repr(j)
+            if len(shown) > 40:
+                shown = shown[:40] + '...'
+            print prefix,shown,' (len = %i)' % len(j)
+            continue
+        source = j[0]
+        l,r = j[1:3]
+        shown = repr(source[l:r])
+        if len(shown) > 40:
+            shown = shown[:40] + '...'
+        print prefix,shown,' (len = %i)' % (r-l),(l,r)
+
+def normlist(jlist,
+
+             StringType=types.StringType):
+
+    """ Return a normalized copy of the joinlist jlist.
+
+        String entries are taken over as they are; every other entry
+        is treated as tuple (text,l,r,...) and replaced by the string
+        text[l:r].  The result is a new list consisting of strings
+        only.
+
+    """
+    result = []
+    for entry in jlist:
+        if type(entry) == StringType:
+            result.append(entry)
+        else:
+            text,l,r = entry[0],entry[1],entry[2]
+            result.append(text[l:r])
+    return result
+
+#
+# aid for matching from a list of words
+#
+def _lookup_dict(l,index=0):
+
+    """ Group the words in l by their character at position index.
+
+        Returns a dictionary mapping each such character to the list
+        of words having it, in the order they appear in l.
+    """
+    groups = {}
+    for word in l:
+        key = word[index]
+        if not groups.has_key(key):
+            groups[key] = []
+        groups[key].append(word)
+    return groups
+
+def word_in_list(l):
+
+    """ Creates a lookup table (tag table) that matches the words in l.
+
+        The words are grouped by their first character, so l must
+        not contain empty strings.
+    """
+    table = []
+    groups = _lookup_dict(l)
+    first_chars = groups.keys()
+    if len(first_chars) < 18: # somewhat arbitrary bound
+        # fast hint for small sets: test the first character against
+        # all group characters, then step back onto it again
+        table.append((None,IsIn,string.join(first_chars,'')))
+        table.append((None,Skip,-1))
+    # one group of entries per first character
+    for c,words in groups.items():
+        hint = len(table)
+        table.append(None) # placeholder, filled in below
+        for w in words:
+            table.append((None,Word,w[1:],+1,MatchOk))
+        table.append((None,Fail,Here))
+        # if the first character is not c, jump behind this group
+        table[hint] = (None,Is,c,len(table)-hint)
+    table.append((None,Fail,Here))
+    return tuple(table)
+
+#
+# Extra stuff useful in combination with the C functions
+#
+
+def replace(text,what,with,start=0,stop=None,
+
+            SearchObject=BMS,join=join,joinlist=joinlist,tag=tag,
+            string_replace=string.replace,type=type,
+            StringType=types.StringType):
+
+    """A fast replacement for string.replace.
+
+       what can be given as string or search object; start and stop
+       are handed to tag() as the slice of text to work on.
+
+       This function is a good example for the AppendTagobj-flag usage
+       (the taglist can be used directly as joinlist).
+
+    """
+    if type(what) == StringType:
+        so = SearchObject(what)
+    else:
+        so = what
+        what = so.match
+    if stop is None:
+        # short patterns on the complete text are left to string.replace
+        if start == 0 and len(what) < 2:
+            return string_replace(text,what,with)
+        stop = len(text)
+    table = (
+        # Text up to the next occurrence; nothing found: go to the rest
+        (text,sWordStart,so,+2),
+        # Found something, replace and continue searching
+        (with,Skip+AppendTagobj,len(what),-1,-1),
+        # Rest of text
+        (text,Move,ToEOF)
+        )
+    matched,pieces,last = tag(text,table,start,stop)
+    if not matched:
+        return text
+    return join(pieces)
+
+# Alternative (usually slower) versions using different techniques:
+
+def _replace2(text,what,with,start=0,stop=None,
+
+              join=join,joinlist=joinlist,tag=tag,
+              StringType=types.StringType,BMS=BMS):
+
+    """Analogon to string.replace; returns a string with all occurrences
+       of what in text[start:stop] replaced by with
+       - uses a one entry tag-table and a Boyer-Moore-Search-object
+       - what can be a string or a BMS/FS search object
+       - it's faster than string.replace in those cases, where
+         the what-string gets long and/or many replacements are found;
+         faster meaning from a few percent up to many times as fast
+       - start and stop define the slice of text to work in
+       - stop defaults to len(text)
+    """
+    if stop is None:
+        stop = len(text)
+    if type(what) == StringType:
+        what = BMS(what)
+    # Single entry: on a match stay on this entry (je=+0), otherwise
+    # step behind the end of the table (jne=+1)
+    table = ((with,sFindWord,what,+1,+0),)
+    matched,tags,last = tag(text,table,start,stop)
+    if not matched:
+        return text
+    return join(joinlist(text,tags))
+
+def _replace3(text,what,with,
+
+              join=string.join,FS=FS,
+              StringType=types.StringType):
+
+    """ Replace all occurrences of what in text by with, using the
+        slices returned by the search object's findall() method and
+        string.join.
+
+        what can be a string or a search object.
+    """
+    if type(what) == StringType:
+        what = FS(what)
+    slices = what.findall(text)
+    if not slices:
+        return text
+    pieces = []
+    pos = 0
+    for left,right in slices:
+        # text between the previous and this occurrence, then the
+        # replacement
+        pieces.append(text[pos:left])
+        pieces.append(with)
+        pos = right
+    pieces.append(text[pos:])
+    return join(pieces,'')
+
+def _replace4(text,what,with,
+
+              join=join,joinlist=joinlist,tag=tag,FS=FS,
+              StringType=types.StringType):
+
+    """ Replace all occurrences of what in text by with, turning the
+        slices returned by the search object's findall() method into
+        a replacement list for joinlist().
+
+        what can be a string or a search object.
+    """
+    if type(what) == StringType:
+        what = FS(what)
+    slices = what.findall(text)
+    if not slices:
+        return text
+    repl = []
+    for s in slices:
+        repl.append((with,)+s)
+    return join(joinlist(text,repl))
+
+
+def find(text,what,start=0,stop=None,
+
+         SearchObject=FS):
+
+    """ A faster replacement for string.find().
+
+        Uses a search object for the task. Returns the position of the
+        first occurrence of what in text[start:stop]. stop defaults to
+        len(text); an explicit stop of 0 is passed on to the search
+        object.  Returns -1 in case no occurrence was found.
+
+    """
+    so = SearchObject(what)
+    # test against None: stop=0 is a valid slice end and must not be
+    # mistaken for "no stop given"
+    if stop is None:
+        return so.find(text,start)
+    return so.find(text,start,stop)
+
+def findall(text,what,start=0,stop=None,
+
+            SearchObject=FS):
+
+    """ Find all occurrences of what in text.
+
+        Uses a search object for the task. Returns a list of slice
+        tuples (l,r) marking all occurrences in text[start:stop].
+        stop defaults to len(text); an explicit stop of 0 is passed
+        on to the search object.  Returns an empty list in case no
+        occurrence was found.
+
+    """
+    so = SearchObject(what)
+    # test against None: stop=0 is a valid slice end and must not be
+    # mistaken for "no stop given"
+    if stop is None:
+        return so.findall(text,start)
+    return so.findall(text,start,stop)
+
+def split(text,sep,start=0,stop=None,translate=None,
+
+          SearchObject=FS):
+
+    """ A faster replacement for string.split().
+
+        Uses a search object for the task. Returns the result of
+        cutting the text[start:stop] string into snippets at every sep
+        occurrence in form of a list of substrings; stop defaults to
+        len(text). translate is passed to the search object as
+        translation string.
+
+        XXX convert to a C function... or even better, add as method
+        to search objects.
+
+    """
+    if translate:
+        so = SearchObject(sep,translate)
+    else:
+        so = SearchObject(sep)
+    # test against None: stop=0 is a valid slice end
+    if stop is None:
+        cuts = so.findall(text,start)
+    else:
+        cuts = so.findall(text,start,stop)
+    pieces = []
+    append = pieces.append
+    # only text[start:stop] is cut: the first snippet begins at start
+    # and the last one ends at stop
+    l = start
+    for left,right in cuts:
+        append(text[l:left])
+        l = right
+    if stop is None:
+        append(text[l:])
+    else:
+        append(text[l:stop])
+    return pieces
+
+# helper for tagdict
+def _tagdict(text,dict,prefix,taglist):
+
+    """ Helper for tagdict(): store the slice text[l:r] of every
+        taglist entry in dict under the key prefix + str(tagobj).
+
+        Non-empty subtag lists are processed recursively using the
+        entry's key followed by '.' as the new prefix.
+    """
+    for tagobj,left,right,subtags in taglist:
+        key = prefix + str(tagobj)
+        dict[key] = text[left:right]
+        if subtags:
+            _tagdict(text,dict,key+'.',subtags)
+
+def tagdict(text,*args):
+
+    """ Tag a text just like the function tag() and then convert
+        its output into a dictionary where the tagobjects reference
+        their respective strings
+        - this function emulates the interface of tag()
+        - in contrast to tag() this function *does* make copies
+          of the found strings
+        - the keys are str(tagobj); entries found in subtag lists
+          get the keys of their parents plus '.' as prefix
+        - returns a tuple (rc,tagdict,next) with the same meaning
+          of rc and next as tag(); tagdict is the new dictionary -
+          None in case rc is 0
+    """
+    rc,taglist,next = apply(tag,(text,)+args)
+    if not rc:
+        return (rc,None,next)
+    d = {}
+    # the helper fills d for the top level (empty prefix) and for all
+    # nested subtag lists
+    _tagdict(text,d,'',taglist)
+    return (rc,d,next)
+
+def invset(chars):
+    
+    """ Return a set with all characters *except* the ones in chars.
+
+        This is set(chars,0), i.e. set() called with 0 as second
+        argument.
+    """
+    return set(chars,0)
+
+def is_whitespace(text,start=0,stop=None,
+
+                  nonwhitespace=nonwhitespace_set,setfind=setfind):
+
+    """ Return 1 iff text[start:stop] only contains whitespace
+        characters (as defined in Constants/Sets.py), 0 otherwise.
+
+        stop defaults to len(text).
+    """
+    if stop is None:
+        stop = len(text)
+    # a negative index from setfind() is taken to mean that no
+    # non-whitespace character was found
+    return (setfind(text,nonwhitespace,start,stop) < 0)
+
+def collapse(text,seperator=' ',
+
+	     join=join,setsplit=setsplit,collapse_set=set(newline+whitespace)):
+
+    """ Eliminates newline characters and compresses whitespace
+        characters into one space.
+
+        The text is split at the characters in collapse_set (newline
+        and whitespace characters) and the parts are joined again
+        using seperator, which defaults to one space.
+
+        The result is a one line text string. Tim Peters will like
+        this function called with '-' seperator ;-)
+        
+    """
+    return join(setsplit(text,collapse_set),seperator)
+
+# Tag table used by splitlines(); the AppendMatch flag makes the
+# engine append the matched strings themselves to the taglist.
+_linesplit_table = (
+    # an optional '\r' ...
+    (None,Is,'\r',+1),
+    # ... optionally followed by '\n'
+    (None,Is,'\n',+1),
+    # a run of characters from the inverted '\r\n' set is a line;
+    # after it continue with the first entry
+    ('line',AllInSet+AppendMatch,set('\r\n',0),+1,-2),
+    # at EOF the table has matched
+    (None,EOF,Here,+1,MatchOk),
+    # otherwise record an empty match and restart at the first entry
+    ('empty line',Skip+AppendMatch,0,0,-4),
+    )
+
+def splitlines(text,
+
+               tag=tag,linesplit_table=_linesplit_table):
+
+    """ Split text into a list of single lines.
+
+        The following combinations are considered to be line-ends:
+        '\r', '\r\n', '\n'; they may be used in any combination.  The
+        line-end indicators are removed from the strings prior to
+        adding them to the list.
+
+        This function allows dealing with text files from Macs, PCs
+        and Unix origins in a portable way.
+
+        The result is the taglist produced by tagging text with
+        linesplit_table.
+        
+    """
+    # tag() returns (success, taglist, nextindex); only the taglist
+    # is of interest here
+    return tag(text,linesplit_table)[1]
+
+# Tag table used by countlines(); it has the same entries as
+# _linesplit_table, but uses the AppendTagobj flag, so that only the
+# tag objects ('line', 'empty line') are appended to the taglist.
+_linecount_table = (
+    # an optional '\r' ...
+    (None,Is,'\r',+1),
+    # ... optionally followed by '\n'
+    (None,Is,'\n',+1),
+    # a run of characters from the inverted '\r\n' set is a line;
+    # after it continue with the first entry
+    ('line',AllInSet+AppendTagobj,set('\r\n',0),+1,-2),
+    # at EOF the table has matched
+    (None,EOF,Here,+1,MatchOk),
+    # otherwise record an empty line and restart at the first entry
+    ('empty line',Skip+AppendTagobj,0,0,-4),
+    )
+
+def countlines(text,
+
+               linecount_table=_linecount_table):
+
+    """ Returns the number of lines in text.
+
+        Line ends are treated just like for splitlines() in a
+        portable way.
+
+        The count is the length of the taglist produced by tagging
+        text with linecount_table.
+    """
+    # tag() returns (success, taglist, nextindex)
+    return len(tag(text,linecount_table)[1])
+
+# Tag table splitting a text into words.
+# NOTE(review): splitwords() below uses setsplit() and does not
+# reference this table -- confirm whether it is still needed.
+_wordsplit_table = (
+    # skip a run of whitespace, if there is one
+    (None,AllInSet,whitespace_set,+1),
+    # a run of non-whitespace characters is a word; after it go back
+    # to the whitespace entry
+    ('word',AllInSet+AppendMatch,nonwhitespace_set,+1,-1),
+    # at EOF the table has matched
+    (None,EOF,Here,+1,MatchOk),
+    )
+
+def splitwords(text,
+
+               setsplit=setsplit,whitespace_set=whitespace_set):
+
+    """ Split text into a list of single words.
+
+        Words are separated by whitespace. The whitespace is stripped
+        before adding the words to the list.
+
+        The work is done by setsplit() using whitespace_set as set
+        of separating characters.
+        
+    """
+    return setsplit(text,whitespace_set)
+
+#
+# Testing and benchmarking
+#
+
+# Taken from my hack.py module:
+import time
+class _timer:
+
+    """ timer class with a quite obvious interface
+        - .start() starts a fairly accurate CPU-time timer plus an
+          absolute timer
+        - .stop() stops the timer and returns a tuple: the CPU-time in
+          seconds and the absolute time elapsed since .start() was called
+        - .usertime() and .abstime() stop the timer just like .stop(),
+          but only return the CPU-time resp. the absolute time
+        - str(timer) shows both values of a stopped timer
+
+        Stopping replaces the recorded start values by the elapsed
+        times, so the timer has to be started again before the next
+        measurement.
+    """
+
+    # CPU-time: clock() value at start, elapsed CPU-time after stopping
+    utime = 0
+    # absolute time: time() value at start, elapsed time after stopping
+    atime = 0
+
+    def start(self,
+              clock=time.clock,time=time.time):
+
+        """ Record the current absolute and CPU time.
+        """
+        self.atime = time()
+        self.utime = clock()
+
+    def stop(self,
+             clock=time.clock,time=time.time):
+
+        """ Stop the timer and return the tuple (CPU-time, absolute
+            time) elapsed since .start() was called.
+        """
+        self.utime = clock() - self.utime
+        self.atime = time() - self.atime
+        return self.utime,self.atime
+
+    def usertime(self,
+                 clock=time.clock,time=time.time):
+
+        """ Stop the timer and return the elapsed CPU-time.
+        """
+        return self.stop(clock,time)[0]
+
+    def abstime(self,
+                clock=time.clock,time=time.time):
+
+        """ Stop the timer and return the elapsed absolute time.
+        """
+        # second item of the .stop() tuple is the absolute time
+        return self.stop(clock,time)[1]
+
+    def __str__(self):
+
+        return '%0.2fu %0.2fa sec.' % (self.utime,self.atime)
+
+def _bench(file='mxTextTools/mxTextTools.c'):
+
+    def mismatch(orig,new):
+	print
+	for i in range(len(orig)):
+	    if orig[i] != new[i]:
+		break
+	else:
+	    print 'Length mismatch: orig=%i new=%i' % (len(orig),len(new))
+	    if len(orig) > len(new):
+		print 'Missing chars:'+repr(orig[len(new):])
+	    else:
+		print 'Excess chars:'+repr(new[len(orig):])
+	    print
+	    return
+	print 'Mismatch at offset %i:' % i
+	print (orig[i-100:i] 
+	       + '<- %s != %s ->' % (repr(orig[i]),repr(new[i]))
+	       + orig[i+1:i+100])
+	print
+	
+    text = open(file).read()
+    import string
+
+    t = _timer()
+    print 'Working on a %i byte string' % len(text)
+
+    if 0:
+	print
+	print 'Replacing strings'
+	print '-'*72
+	print
+	for what,with in (('m','M'),('mx','MX'),('mxText','MXTEXT'),
+			  ('hmm','HMM'),('hmmm','HMM'),('hmhmm','HMM')):
+	    print 'Replace "%s" with "%s"' % (what,with)
+	    t.start()
+	    for i in range(100):
+		rtext = string.replace(text,what,with)
+	    print 'with string.replace:',t.stop(),'sec.'
+	    t.start()
+	    for i in range(100):
+		ttext = replace(text,what,with)
+	    print 'with tag.replace:',t.stop(),'sec.'
+	    if ttext != rtext:
+		print 'results are NOT ok !'
+		print '-'*72
+		mismatch(rtext,ttext)
+	    t.start()
+	    for i in range(100):
+		ttext = _replace2(text,what,with)
+	    print 'with tag._replace2:',t.stop(),'sec.'
+	    if ttext != rtext:
+		print 'results are NOT ok !'
+		print '-'*72
+		print rtext
+	    t.start()
+	    for i in range(100):
+		ttext = _replace3(text,what,with)
+	    print 'with tag._replace3:',t.stop(),'sec.'
+	    if ttext != rtext:
+		print 'results are NOT ok !'
+		print '-'*72
+		print rtext
+	    t.start()
+	    for i in range(100):
+		ttext = _replace4(text,what,with)
+	    print 'with tag._replace4:',t.stop(),'sec.'
+	    if ttext != rtext:
+		print 'results are NOT ok !'
+		print '-'*72
+		print rtext
+	    print
+
+    if 0:
+	print
+	print 'String lower/upper'
+	print '-'*72
+	print
+
+	op = string.lower
+	t.start()
+	for i in range(1000):
+	    op(text)
+	t.stop()
+	print ' string.lower:',t
+
+	op = string.upper
+	t.start()
+	for i in range(1000):
+	    op(text)
+	t.stop()
+	print ' string.upper:',t
+
+	op = upper
+	t.start()
+	for i in range(1000):
+	    op(text)
+	t.stop()
+	print ' TextTools.upper:',t
+
+	op = lower
+	t.start()
+	for i in range(1000):
+	    op(text)
+	t.stop()
+	print ' TextTools.lower:',t
+
+	print 'Testing...',
+	ltext = string.lower(text)
+	assert ltext == lower(text)
+	utext = string.upper(text)
+	assert utext == upper(text)
+	print 'ok.'
+
+    if 0:
+	print
+	print 'Joining lists'
+	print '-'*72
+	print
+
+	l = setsplit(text,whitespace_set)
+
+	op = string.join
+	t.start()
+	for i in range(1000):
+	    op(l)
+	t.stop()
+	print ' string.join:',t
+
+	op = join
+	t.start()
+	for i in range(1000):
+	    op(l)
+	t.stop()
+	print ' TextTools.join:',t
+
+	op = string.join
+	t.start()
+	for i in range(1000):
+	    op(l,' ')
+	t.stop()
+	print ' string.join with seperator:',t
+
+	op = join
+	t.start()
+	for i in range(1000):
+	    op(l,' ')
+	t.stop()
+	print ' TextTools.join with seperator:',t
+
+    if 0:
+	print
+	print 'Creating join lists'
+	print '-'*72
+	print
+
+	repl = []
+	for i in range(0,len(text),10):
+	    repl.append(str(i),i,i+1)
+
+	op = joinlist
+	t.start()
+	for i in range(1000):
+	    op(text,repl)
+	t.stop()
+	print ' TextTools.joinlist:',t
+
+    if 0:
+	print
+	print 'Splitting text'
+	print '-'*72
+	print
+
+	op = string.split
+	t.start()
+	for i in range(100):
+	    op(text)
+	t.stop()
+	print ' string.split whitespace:',t,'(',len(op(text)),'snippets )'
+
+	op = setsplit
+	ws = whitespace_set
+	t.start()
+	for i in range(100):
+	    op(text,ws)
+	t.stop()
+	print ' TextTools.setsplit whitespace:',t,'(',len(op(text,ws)),'snippets )'
+
+	assert string.split(text) == setsplit(text,ws)
+
+	op = string.split
+	sep = 'a'
+	t.start()
+	for i in range(100):
+	    op(text,sep)
+	t.stop()
+	print ' string.split at "a":',t,'(',len(op(text,sep)),'snippets )'
+
+	op = split
+	sep = 'a'
+	t.start()
+	for i in range(100):
+	    op(text,sep)
+	t.stop()
+	print ' TextTools.split at "a":',t,'(',len(op(text,sep)),'snippets )'
+
+	op = charsplit
+	sep = 'a'
+	t.start()
+	for i in range(100):
+	    op(text,sep)
+	t.stop()
+	print ' TextTools.charsplit at "a":',t,'(',len(op(text,sep)),'snippets )'
+
+	op = setsplit
+        sep = set('a')
+	t.start()
+	for i in range(100):
+	    op(text,sep)
+	t.stop()
+	print ' TextTools.setsplit at "a":',t,'(',len(op(text,sep)),'snippets )'
+
+	# Note: string.split and setsplit don't work identically !
+
+	op = string.split
+	sep = 'int'
+	t.start()
+	for i in range(100):
+	    op(text,sep)
+	t.stop()
+	print ' string.split at "int":',t,'(',len(op(text,sep)),'snippets )'
+
+	op = split
+	sep = 'int'
+	t.start()
+	for i in range(100):
+	    op(text,sep)
+	t.stop()
+	print ' TextTools.split at "int":',t,'(',len(op(text,sep)),'snippets )'
+
+	op = setsplit
+        sep = set('int')
+	t.start()
+	for i in range(100):
+	    op(text,sep)
+	t.stop()
+	print ' TextTools.setsplit at "i", "n", "t":',t,'(',len(op(text,sep)),'snippets )'
+
+	op = string.split
+	sep = 'register'
+	t.start()
+	for i in range(100):
+	    op(text,sep)
+	t.stop()
+	print ' string.split at "register":',t,'(',len(op(text,sep)),'snippets )'
+
+	op = split
+	sep = 'register'
+	t.start()
+	for i in range(100):
+	    op(text,sep)
+	t.stop()
+	print ' TextTools.split at "register":',t,'(',len(op(text,sep)),'snippets )'
+
+if __name__=='__main__':
+    _bench()
+
--- a/intern/python/modules/TextTools/init.py
+++ b/intern/python/modules/TextTools/init.py
@@ -0,0 +1,48 @@
+""" mxTextTools - A tools package for fast text processing.
+
+    (c) Copyright Marc-Andre Lemburg; All Rights Reserved.
+    See the documentation for further information on copyrights,
+    or contact the author (mal@lemburg.com).
+"""
+__package_info__ = """
+BEGIN PYTHON-PACKAGE-INFO 1.0
+Title:			mxTextTools - Tools for fast text processing
+Current-Version:	1.1.1
+Home-Page:		http://starship.skyport.net/~lemburg/mxTextTools.html
+Primary-Site:		http://starship.skyport.net/~lemburg/mxTextTools-1.1.1.zip
+
+This package provides several different functions and mechanisms
+to do fast text processing. Amongst these are character set
+operations, parsing & tagging tools (using a finite state machine
+executing byte code) and common things such as Boyer-Moore search
+objects. For full documentation see the home page.
+END PYTHON-PACKAGE-INFO
+"""
+from TextTools import *
+from TextTools import __version__
+
+### Make the types pickleable:
+
+# Shortcuts for pickle (reduces the pickle's length)
+def _BMS(match,translate):
+    return BMS(match,translate)
+def _FS(match,translate):
+    return FS(match,translate)
+
+# Module init
+class modinit:
+
+    ### Register the two types
+    import copy_reg
+    def pickle_BMS(so):
+	return _BMS,(so.match,so.translate)
+    def pickle_FS(so):
+	return _FS,(so.match,so.translate)
+    copy_reg.pickle(BMSType,
+		    pickle_BMS,
+		    _BMS)
+    copy_reg.pickle(FSType,
+		    pickle_FS,
+		    _FS)
+
+del modinit
--- a/intern/python/modules/TextTools/mxTextTools/init.py
+++ b/intern/python/modules/TextTools/mxTextTools/init.py
@@ -0,0 +1,17 @@
+""" mxTextTools - A tools package for fast text processing.
+
+    (c) Copyright Marc-Andre Lemburg; All Rights Reserved.
+    See the documentation for further information on copyrights,
+    or contact the author (mal@lemburg.com).
+"""
+from mxTextTools import *
+from mxTextTools import __version__
+
+#
+# Make BMS take the role of FS in case the Fast Search object was not built
+#
+try:
+    FS
+except NameError:
+    FS = BMS
+    FSType = BMSType