source: trunk/python/experimental/lexer.py @ 223

Last change on this file since 223 was 196, checked in by tim, 15 years ago

experimental python bindings generator as provided by Michael Cohen

File size: 7.1 KB
Line 
1#!/usr/bin/env python
2# ******************************************************
3# Michael Cohen <scudette@users.sourceforge.net>
4#
5# ******************************************************
6#  Version: FLAG $Version: 0.87-pre1 Date: Thu Jun 12 00:48:38 EST 2008$
7# ******************************************************
8#
9# * This program is free software; you can redistribute it and/or
10# * modify it under the terms of the GNU General Public License
11# * as published by the Free Software Foundation; either version 2
12# * of the License, or (at your option) any later version.
13# *
14# * This program is distributed in the hope that it will be useful,
15# * but WITHOUT ANY WARRANTY; without even the implied warranty of
16# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17# * GNU General Public License for more details.
18# *
19# * You should have received a copy of the GNU General Public License
20# * along with this program; if not, write to the Free Software
21# * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
22# ******************************************************
23""" A simple feed lexer.
24"""
25
26import re,sys
27
class Lexer:
    """ A generic feed lexer.

    Data is pushed into the lexer with feed() and consumed one token
    at a time with next_token(). Derived classes describe their
    grammar by overriding the `tokens` table and by defining methods
    named after the actions listed in it.
    """
    ## The following is a description of the states we have and the
    ## way we move through them: format is an array of
    ## [ state_re, re, token/action, next state ]
    ##
    ## Each row must be a mutable list: __init__ appends the compiled
    ## state regex and the compiled token regex to it, so a compiled
    ## row has 6 entries.
    tokens = []
    state = "INITIAL"
    buffer = ''
    error = 0
    verbose = 0
    ## Class level default only - __init__ gives every instance its
    ## own copy so that instances do not share one stack.
    state_stack = []
    processed = 0
    processed_buffer = ''
    saved_state = None
    flags = 0

    def __init__(self, verbose=0, fd=None):
        """ Prepare the lexer for use.

        verbose: verbosity level, only used when the class does not
                 already define a non zero `verbose`.
        fd: optional file like object. save_state() and
            restore_state() call tell() and seek() on it.
        """
        if not self.verbose:
            self.verbose = verbose

        ## PUSH_STATE/POP_STATE mutate the stack in place, so it must
        ## not be the list object stored on the class.
        self.state_stack = list(self.state_stack)

        ## Compile each rule once. The compiled regexes are appended
        ## to the (class level) rows, so rows which already have them
        ## are skipped. Checking row by row also copes with an empty
        ## table and with tables mixing compiled and fresh rows.
        for row in self.tokens:
            if len(row) == 4:
                row.append(re.compile(row[0], re.DOTALL))
                row.append(re.compile(row[1], re.DOTALL | re.M | re.S | self.flags))

        self.fd = fd

    def save_state(self, t=None, m=None):
        """ Records the current state of the lexer in self.saved_state.

        When restore_state is called later, the lexer is put back in
        the same state as when save_state was called. If m is a match
        object, the saved offsets are rewound by m.end() so the
        matched text will be read again after a restore.

        Nothing is saved while self.error is non zero. self.fd must be
        set because the read pointer is computed from self.fd.tell().

        Note that derived classes may need to extend this.
        """
        ## Cant save our state if we have errors. We need to guarantee
        ## that we rewind to a good part of the file.
        if self.error:
            return

        ## m is None when we are not called as a token action.
        try:
            end = m.end()
        except AttributeError:
            end = 0

        ## self.objects is not defined by this class - fall back to an
        ## empty list when a derived class has not provided one.
        objects = getattr(self, 'objects', [])

        self.saved_state = dict(state_stack = self.state_stack[:],
                                processed = self.processed - end,
                                processed_buffer = self.processed_buffer,
                                readptr = self.fd.tell() - len(self.buffer) - end,
                                state = self.state,
                                objects = objects[:],
                                error = self.error,
                                )

        if self.verbose > 1:
            print("Saving state %s" % self.processed)

    def restore_state(self):
        """ Rewinds the lexer to the state recorded by save_state.

        Does nothing if no state has been saved. The pending buffer is
        dropped and self.fd is seeked back to the saved read pointer.
        """
        state = self.saved_state
        if not state:
            return

        ## Restore copies of the lists, otherwise later pushes would
        ## modify the snapshot and a second restore would differ.
        self.state_stack = state['state_stack'][:]
        self.processed = state['processed']
        self.processed_buffer = state['processed_buffer']
        self.buffer = ''
        self.fd.seek(state['readptr'])
        self.state = state['state']
        self.objects = state['objects'][:]
        self.error = state['error']
        if self.verbose > 1:
            print("Restoring state to offset %s" % self.processed)

    def next_token(self, end = True):
        """ Tries to match one rule at the start of the buffer.

        Rules are tried in table order. The first rule whose state
        regex matches the current state and whose regex matches the
        buffer consumes the matched text, runs its comma separated
        actions and its token/action string is returned.

        An action returning a true value other than "CONTINUE"
        selects the next state, otherwise the next state of the rule
        is used (if it has one).

        If nothing matches, one character is discarded and "ERROR" is
        returned when the buffer is non empty and `end` is true, or
        when the buffer holds more than 1024 characters. Otherwise
        None is returned.
        """
        ## Now try to match any of the regexes in order:
        current_state = self.state
        for state_re, re_str, token, next_state, state, regex in self.tokens:
            ## Does the rule apply for us now?
            if not state.match(current_state):
                continue

            if self.verbose > 2:
                print("%s: Trying to match %r with %r" % (self.state, self.buffer[:10], re_str))

            m = regex.match(self.buffer)
            if not m:
                continue

            if self.verbose > 3:
                print("%s matched %s" % (re_str, m.group(0).encode("utf8")))

            ## The match consumes the data off the buffer (the
            ## handler can put it back if it likes)
            self.processed_buffer += self.buffer[:m.end()]
            self.buffer = self.buffer[m.end():]
            self.processed += m.end()

            ## Iterate over all the callbacks specified. Actions
            ## without a method of that name go to default_handler.
            for t in token.split(','):
                if self.verbose > 0:
                    print("0x%X: Calling %s %r" % (self.processed, t, m.group(0)))

                cb = getattr(self, t, self.default_handler)
                result = cb(t, m)
                if result and result != "CONTINUE":
                    next_state = result
                    self.state = next_state

            if next_state:
                self.state = next_state

            return token

        ## Check that we are making progress - if we are too full, we
        ## assume we are stuck:
        if (end and self.buffer) or len(self.buffer) > 1024:
            self.processed_buffer += self.buffer[:1]
            self.buffer = self.buffer[1:]
            self.ERROR("Lexer Stuck, discarding 1 byte (%r) - state %s" % (self.buffer[:10], self.state))
            return "ERROR"

        ## No token were found
        return None

    def feed(self, data):
        """ Appends data to the buffer waiting to be lexed """
        self.buffer += data

    def empty(self):
        """ Returns True when there is no data left in the buffer """
        return not self.buffer

    def default_handler(self, token, match):
        """ Called for actions which have no method of their own """
        if self.verbose > 2:
            print("Default handler: %s with %r" % (token, match.group(0)))

    def ERROR(self, message = None, weight =1):
        """ Adds weight to the error count, reporting message if verbose """
        if self.verbose > 0 and message:
            print("Error(%s): %s" % (weight, message))

        self.error += weight

    def PUSH_STATE(self, token = None, match = None):
        """ Action: remembers the current state on the state stack """
        if self.verbose > 1:
            print("Storing state %s" % self.state)
        self.state_stack.append(self.state)

    def POP_STATE(self, token = None, match = None):
        """ Action: returns the most recently pushed state.

        next_token uses the returned value as the next state. Returns
        None (after printing a warning) if the stack is empty.
        """
        try:
            state = self.state_stack.pop()
        except IndexError:
            print("Tried to pop the state but failed - possible recursion error")
            return None

        if self.verbose > 1:
            print("Returned state to %s" % state)

        return state

    def close(self):
        """ Just a convenience function to force us to parse all the data """
        while self.next_token(): pass
class SelfFeederMixIn(Lexer):
    """ Mixin giving a lexer the ability to load its own input from
    a file like object.

    parse_fd does not assign self.fd, so self.fd must already be the
    fd we read from.
    """
    def parse_fd(self, fd):
        """ Feeds everything fd.read() returns to the lexer, then
        pulls tokens until next_token() returns a false value.
        """
        contents = fd.read()
        self.feed(contents)

        token = self.next_token()
        while token:
            token = self.next_token()
195
Note: See TracBrowser for help on using the repository browser.