source: trunk/python/experimental/lexer.py @ 218

Last change on this file since 218 was 196, checked in by tim, 15 years ago

experimental python bindings generator as provided by Michael Cohen

File size: 7.1 KB
#!/usr/bin/env python
# ******************************************************
# Michael Cohen <scudette@users.sourceforge.net>
#
# ******************************************************
# Version: FLAG $Version: 0.87-pre1 Date: Thu Jun 12 00:48:38 EST 2008$
# ******************************************************
#
# * This program is free software; you can redistribute it and/or
# * modify it under the terms of the GNU General Public License
# * as published by the Free Software Foundation; either version 2
# * of the License, or (at your option) any later version.
# *
# * This program is distributed in the hope that it will be useful,
# * but WITHOUT ANY WARRANTY; without even the implied warranty of
# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# * GNU General Public License for more details.
# *
# * You should have received a copy of the GNU General Public License
# * along with this program; if not, write to the Free Software
# * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
# ******************************************************
""" A simple feed lexer.
"""

import re,sys

class Lexer:
    """ A generic feed lexer """
    ## The following is a description of the states we have and the
    ## way we move through them: format is an array of
    ## [ state_re, re, token/action, next state ]
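    ## For example (an illustrative sketch, not part of the original
    ## file), a rule that emits a NUMBER token while in the INITIAL
    ## state and leaves the state unchanged could be written as:
    ##
    ##   tokens = [ [ "INITIAL", r"\d+", "NUMBER", None ] ]
    ##
    ## __init__ compiles the first two columns into regexes, and the
    ## token/action column may name one or more (comma separated)
    ## callback methods on the class.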
    tokens = []
    state = "INITIAL"
    buffer = ''
    error = 0
    verbose = 0
    state_stack = []
    processed = 0
    processed_buffer = ''
    saved_state = None
    flags = 0

    def __init__(self, verbose=0, fd=None):
        if not self.verbose:
            self.verbose = verbose

        if len(self.tokens[0])==4:
            for row in self.tokens:
                row.append(re.compile(row[0], re.DOTALL))
                row.append(re.compile(row[1], re.DOTALL | re.M | re.S | self.flags ))

        self.fd = fd

    def save_state(self, t=None, m=None):
56 """ Returns a dict which represents the current state of the lexer.
57
58 When provided to restore_state, the lexer is guaranteed to be
59 in the same state as when the save_state was called.
60
61 Note that derived classes may need to extend this.
62 """
63 ## Cant save our state if we have errors. We need to guarantee
64 ## that we rewind to a good part of the file.
65 if self.error: return
66 try:
67 end = m.end()
68 except: end = 0
69
70 self.saved_state = dict(state_stack = self.state_stack[:],
71 processed = self.processed - end,
72 processed_buffer = self.processed_buffer,
73 readptr = self.fd.tell() - len(self.buffer) - end,
74 state = self.state,
75 objects = self.objects[:],
76 error = self.error,
77 )
78
79 if self.verbose>1:
80 print "Saving state %s" % self.processed
81
    def restore_state(self):
        state = self.saved_state
        if not state: return

        self.state_stack = state['state_stack']
        self.processed = state['processed']
        self.processed_buffer = state['processed_buffer']
        self.buffer = ''
        self.fd.seek(state['readptr'])
        self.state = state['state']
        self.objects = state['objects']
        self.error = state['error']
        if self.verbose>1:
            print "Restoring state to offset %s" % self.processed

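    ## Typical save/restore usage (an illustrative sketch - the names
    ## below are assumptions, not part of this file). save_state reads
    ## a seekable self.fd and a self.objects list, which a derived
    ## class is expected to provide:
    ##
    ##   lexer = MyLexer(fd=open("input.bin"))
    ##   ...
    ##   lexer.save_state(None, m)   ## remember a known good position
    ##   ...
    ##   lexer.restore_state()       ## rewind fd and lexer state to it
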
    def next_token(self, end = True):
        ## Now try to match any of the regexes in order:
        current_state = self.state
        for state_re, re_str, token, next, state, regex in self.tokens:
            ## Does the rule apply for us now?
            if state.match(current_state):
                if self.verbose > 2:
                    print "%s: Trying to match %r with %r" % (self.state, self.buffer[:10], re_str)
                m = regex.match(self.buffer)
                if m:
                    if self.verbose > 3:
                        print "%s matched %s" % (re_str, m.group(0).encode("utf8"))
                    ## The match consumes the data off the buffer (the
                    ## handler can put it back if it likes)
                    self.processed_buffer += self.buffer[:m.end()]
                    self.buffer = self.buffer[m.end():]
                    self.processed += m.end()

                    ## Try to iterate over all the callbacks specified:
                    for t in token.split(','):
                        try:
                            if self.verbose > 0:
                                print "0x%X: Calling %s %r" % (self.processed, t, m.group(0))
                            cb = getattr(self, t, self.default_handler)
                        except AttributeError:
                            continue

                        ## Is there a callback to handle this action?
                        next_state = cb(t, m)
                        if next_state == "CONTINUE":
                            continue

                        elif next_state:
                            next = next_state
                            self.state = next


                    if next:
                        self.state = next

                    return token

        ## Check that we are making progress - if we are too full, we
        ## assume we are stuck:
        if end and len(self.buffer)>0 or len(self.buffer)>1024:
            self.processed_buffer += self.buffer[:1]
            self.buffer = self.buffer[1:]
            self.ERROR("Lexer Stuck, discarding 1 byte (%r) - state %s" % (self.buffer[:10], self.state))
            return "ERROR"

        ## No tokens were found
        return None

    def feed(self, data):
        self.buffer += data

    def empty(self):
        return not len(self.buffer)

    def default_handler(self, token, match):
        if self.verbose > 2:
            print "Default handler: %s with %r" % (token,match.group(0))

    def ERROR(self, message = None, weight =1):
        if self.verbose > 0 and message:
            print "Error(%s): %s" % (weight,message)

        self.error += weight

    def PUSH_STATE(self, token = None, match = None):
        if self.verbose > 1:
            print "Storing state %s" % self.state
        self.state_stack.append(self.state)

    def POP_STATE(self, token = None, match = None):
        try:
            state = self.state_stack.pop()
            if self.verbose > 1:
                print "Returned state to %s" % state

            return state
        except IndexError:
            print "Tried to pop the state but failed - possible recursion error"
            return None

    def close(self):
        """ Just a convenience function to force us to parse all the data """
        while self.next_token(): pass

class SelfFeederMixIn(Lexer):
    """ This mixin is used to make a lexer which feeds itself one
    sector at a time.

    Note that self.fd must be the fd we read from.
    """
    def parse_fd(self, fd):
        self.feed(fd.read())
        while self.next_token(): pass

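## ---------------------------------------------------------------------
## Usage sketch (not part of the original module): a minimal subclass
## showing how the tokens table, callbacks and the feed()/next_token()
## loop fit together. DemoLexer and its token names are illustrative
## assumptions only. Like the module above, this is Python 2 code.
## ---------------------------------------------------------------------
class DemoLexer(Lexer):
    ## Each row is [ state_re, pattern, token/callback, next state ]:
    tokens = [
        [ "INITIAL", r"\d+",       "NUMBER",     None ],
        [ "INITIAL", r"[a-zA-Z]+", "WORD",       None ],
        [ ".",       r"\s+",       "WHITESPACE", None ],
        ]

    def __init__(self, verbose=0, fd=None):
        Lexer.__init__(self, verbose, fd)
        self.words = []
        self.numbers = []

    ## Callback names match the token/action column above:
    def NUMBER(self, token, match):
        self.numbers.append(int(match.group(0)))

    def WORD(self, token, match):
        self.words.append(match.group(0))

if __name__ == "__main__":
    demo = DemoLexer()
    demo.feed("take 3 apples and 12 oranges")
    while demo.next_token():
        pass
    print "words: %s numbers: %s" % (demo.words, demo.numbers)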