"""token_utils.py
------------------
A collection of useful functions and methods to deal with tokenizing
source code.
"""
import ast
import keyword
import tokenize as py_tokenize
from io import StringIO
__version__ = "0.1.5"
_token_format = "type={type} string={string} start={start} end={end} line={line}"

class Token:
    """Token as generated by Python's tokenize.generate_tokens, written here in
    a more convenient form, and with some custom methods.

    The various parameters are::

        type: token type
        string: the token written as a string
        start = (start_row, start_col)
        end = (end_row, end_col)
        line: entire line of code where the token is found.

    Token instances are mutable objects. Therefore, given a list of tokens,
    we can change the value of any token's attribute, untokenize the list and
    automatically obtain a transformed source; see the short sketch following
    this class definition.
    """

    def __init__(self, token):
        self.type = token[0]
        self.string = token[1]
        self.start = self.start_row, self.start_col = token[2]
        self.end = self.end_row, self.end_col = token[3]
        self.line = token[4]
    def __eq__(self, other):
        """Compares a Token with another object; returns True if
        self.string == other.string or if self.string == other.
        """
        if hasattr(other, "string"):
            return self.string == other.string
        elif isinstance(other, str):
            return self.string == other
        else:
            raise TypeError(
                "A token can only be compared to another token or to a string."
            )

    def __repr__(self):
        """Nicely formatted token, to help with debugging sessions.

        Note that it does **not** print a string representation that could be
        used to create a new ``Token`` instance, which is something you should
        never need to do other than indirectly by using the functions
        provided in this module.
        """
        return _token_format.format(
            type="%s (%s)" % (self.type, py_tokenize.tok_name[self.type]),
            string=repr(self.string),
            start=str(self.start),
            end=str(self.end),
            line=repr(self.line),
        )

    def __str__(self):
        """Returns the string attribute."""
        return self.string
    def is_comment(self):
        """Returns True if the token is a comment."""
        return self.type == py_tokenize.COMMENT

    def is_identifier(self):
        """Returns ``True`` if the token represents a valid Python identifier,
        excluding Python keywords.

        Note: this is different from Python's string method ``isidentifier``,
        which also returns ``True`` if the string is a keyword.
        """
        return self.string.isidentifier() and not self.is_keyword()

    def is_name(self):
        """Returns ``True`` if the token is of type NAME."""
        return self.type == py_tokenize.NAME

    def is_keyword(self):
        """Returns True if the token represents a Python keyword."""
        return keyword.iskeyword(self.string)

    def is_number(self):
        """Returns True if the token represents a number."""
        return self.type == py_tokenize.NUMBER

    def is_float(self):
        """Returns True if the token represents a float."""
        return self.is_number() and isinstance(ast.literal_eval(self.string), float)

    def is_integer(self):
        """Returns True if the token represents an integer."""
        return self.is_number() and isinstance(ast.literal_eval(self.string), int)

    def is_complex(self):
        """Returns True if the token represents a complex number."""
        return self.is_number() and isinstance(ast.literal_eval(self.string), complex)
    def is_space(self):
        """Returns True if the token indicates a change in indentation,
        the end of a line, or the end of the source
        (``INDENT``, ``DEDENT``, ``NEWLINE``, ``NL``, and ``ENDMARKER``).

        Note that spaces, including tab characters ``\\t``, between tokens
        on a given line are not considered to be tokens themselves.
        """
        return self.type in (
            py_tokenize.INDENT,
            py_tokenize.DEDENT,
            py_tokenize.NEWLINE,
            py_tokenize.NL,
            py_tokenize.ENDMARKER,
        )

    def is_string(self):
        """Returns True if the token is a string."""
        return self.type == py_tokenize.STRING

    def is_in(self, iterable):
        """Returns True if the string attribute is found as an item of iterable."""
        return self.string in iterable

    def is_not_in(self, iterable):
        """Returns True if the string attribute is not found as an item of iterable."""
        return self.string not in iterable
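
# Illustrative sketch (not part of the module's API): because Token instances
# are mutable, renaming an identifier can be done by editing a token's
# ``string`` attribute and calling ``untokenize`` on the list. Kept as a
# comment so that importing this module has no side effects; the names below
# are made up for the example.
#
#     tokens = tokenize("n = 1\n")
#     tokens[0].string = "count"   # the first token is the NAME 'n'
#     untokenize(tokens)           # -> "count = 1\n"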

def find_token_by_position(tokens, row, column):
    """Given a list of tokens, a specific row (line number) and column,
    a two-tuple is returned that includes the token
    found at that position as well as its list index.

    If no such token can be found, ``None, None`` is returned.
    """
    for index, tok in enumerate(tokens):
        if (
            tok.start_row <= row <= tok.end_row
            and tok.start_col <= column < tok.end_col
        ):
            return tok, index
    return None, None
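
# Illustrative sketch (not part of the module): locating the token that spans
# a given position. Rows are 1-based and columns 0-based, as in Python's
# tokenize module; the sample source is made up for the example.
#
#     tokens = tokenize("total = 10 + 20")
#     tok, index = find_token_by_position(tokens, 1, 8)
#     # tok.string == "10", since column 8 falls inside that NUMBER token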

def fix_empty_line(source, tokens):
    """Python's tokenizer drops a final line entirely if it consists only of
    space and/or tab characters. To ensure that we can always have::

        untokenize(tokenize(source)) == source

    we correct the content of the last token if needed.
    """
    nb = 0
    for char in reversed(source):
        if char in (" ", "\t"):
            nb += 1
        else:
            break
    tokens[-1].string = source[-nb:]

def tokenize(source, warning=True):
    """Transforms a source (string) into a list of Tokens.

    If an exception is raised by Python's tokenize module, the list of tokens
    accumulated up to that point is returned.
    """
    tokens = []

    for tok in py_tokenize.generate_tokens(StringIO(source).readline):
        try:
            token = Token(tok)
            tokens.append(token)
        except (py_tokenize.TokenError, Exception) as exc:
            if warning:
                print(
                    "WARNING: the following error was raised in ",
                    f"{__name__}.tokenize",
                )
                print(exc)
            return tokens

    if source.endswith((" ", "\t")):
        fix_empty_line(source, tokens)

    return tokens
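
# Illustrative sketch (not part of the module): the round trip documented for
# ``fix_empty_line`` above -- trailing spaces on an otherwise empty last line
# survive tokenizing and untokenizing. Kept as a comment so importing the
# module has no side effects.
#
#     source = "a = 1\n   "
#     assert untokenize(tokenize(source)) == source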

def get_significant_tokens(source):
    """Gets a list of tokens from a source (str), ignoring comments
    as well as any token whose string value is either empty or
    consists only of spaces, newline, or tab characters.

    If an exception is raised by Python's tokenize module, the list of tokens
    accumulated up to that point is returned.
    """
    tokens = []
    try:
        for tok in py_tokenize.generate_tokens(StringIO(source).readline):
            token = Token(tok)
            if not token.string.strip():
                continue
            if token.is_comment():
                continue
            tokens.append(token)
    except py_tokenize.TokenError:
        return tokens

    return tokens

def get_lines(source):
    """Transforms a source (string) into a list of lists of Tokens, with each
    (inner) list containing all the tokens found on a given line of code.
    """
    lines = []
    current_row = -1
    new_line = []
    for tok in py_tokenize.generate_tokens(StringIO(source).readline):
        try:
            token = Token(tok)
            if token.start_row != current_row:
                current_row = token.start_row
                if new_line:
                    lines.append(new_line)
                new_line = []
            new_line.append(token)
        except (py_tokenize.TokenError, Exception) as exc:
            print(
                "WARNING: the following tokenize error was raised in "
                f"{__name__}.get_lines"
            )
            print(exc)
    if new_line:
        lines.append(new_line)
    if source.endswith((" ", "\t")):
        fix_empty_line(source, lines[-1])
    return lines
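
# Illustrative sketch (not part of the module): grouping tokens line by line.
# Kept as a comment; the sample source is made up for the example.
#
#     lines = get_lines("a = 1\nb = 2\n")
#     # lines[0] holds the tokens of "a = 1" and lines[1] those of "b = 2";
#     # a trailing group may hold only the ENDMARKER token.
#     [tok.string for tok in lines[0]]   # -> ["a", "=", "1", "\n"]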

def get_number(tokens, exclude_comment=True):
    """Given a list of tokens, returns the number of tokens which are
    not space tokens (such as ``NEWLINE``, ``INDENT``, ``DEDENT``, etc.).

    By default, ``COMMENT`` tokens are not included in the count.
    If you wish to include them, set ``exclude_comment`` to ``False``.
    """
    nb = len(tokens)
    for token in tokens:
        if token.is_space():
            nb -= 1
        elif exclude_comment and token.is_comment():
            nb -= 1
    return nb
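
# Illustrative sketch (not part of the module): counting significant tokens.
# The sample source is made up for the example.
#
#     tokens = tokenize("x = 1  # set x")
#     get_number(tokens)                         # -> 3  ("x", "=", "1")
#     get_number(tokens, exclude_comment=False)  # -> 4  (the comment counts too)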

def strip_comment(line):
    """Removes comments from a line."""
    tokens = []
    try:
        for tok in py_tokenize.generate_tokens(StringIO(line).readline):
            token = Token(tok)
            if token.is_comment():
                continue
            tokens.append(token)
    except py_tokenize.TokenError:
        pass
    return untokenize(tokens)

# TODO: add unit test for this
def find_substring_index(main, substring):
    """Somewhat similar to the find() method for strings,
    this function determines if the tokens for substring appear
    as a consecutive subsequence of the tokens for main. If so, the index
    of the first matching token is returned; otherwise, -1 is returned.
    """
    main_tokens = [tok.string for tok in get_significant_tokens(main)]
    sub_tokens = [tok.string for tok in get_significant_tokens(substring)]
    for index, token in enumerate(main_tokens):
        if (
            token == sub_tokens[0]
            and main_tokens[index : index + len(sub_tokens)] == sub_tokens
        ):
            return index
    return -1
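
# Illustrative sketch (not part of the module): the match is done on
# significant tokens, so spacing and comments are ignored. The sample
# sources are made up for the example.
#
#     find_substring_index("x = [1, 2, 3]", "1,2,  3")   # -> 3
#     find_substring_index("x = [1, 2, 3]", "4, 5")      # -> -1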

def get_first(tokens, exclude_comment=True):
    """Given a list of tokens, find the first token which is not a space token
    (such as a ``NEWLINE``, ``INDENT``, ``DEDENT``, etc.) and,
    by default, also not a ``COMMENT``.

    ``COMMENT`` tokens can be included by setting ``exclude_comment`` to ``False``.

    Returns ``None`` if none is found.
    """
    for token in tokens:
        if token.is_space() or (exclude_comment and token.is_comment()):
            continue
        return token
    return None


def get_first_index(tokens, exclude_comment=True):
    """Given a list of tokens, find the index of the first token which is
    not a space token (such as a ``NEWLINE``, ``INDENT``, ``DEDENT``, etc.) nor,
    by default, a ``COMMENT``. To include ``COMMENT`` tokens, set
    ``exclude_comment`` to ``False``.

    Returns ``None`` if none is found.
    """
    for index, token in enumerate(tokens):
        if token.is_space() or (exclude_comment and token.is_comment()):
            continue
        return index
    return None

def get_last(tokens, exclude_comment=True):
    """Given a list of tokens, find the last token which is not a space token
    (such as a ``NEWLINE``, ``INDENT``, ``DEDENT``, etc.) and, by default,
    also not a ``COMMENT``.

    ``COMMENT`` tokens can be included by setting ``exclude_comment``
    to ``False``.

    Returns ``None`` if none is found.
    """
    return get_first(reversed(tokens), exclude_comment=exclude_comment)


def get_last_index(tokens, exclude_comment=True):
    """Given a list of tokens, find the index of the last token which is
    not a space token (such as a ``NEWLINE``, ``INDENT``, ``DEDENT``, etc.) nor,
    by default, a ``COMMENT``. To include ``COMMENT`` tokens, set
    ``exclude_comment`` to ``False``.

    Returns ``None`` if none is found.
    """
    index = get_first_index(reversed(tokens), exclude_comment=exclude_comment)
    if index is None:
        # No such token found; mirror the behaviour documented above.
        return None
    return len(tokens) - 1 - index

def dedent(tokens, nb):
    """Given a list of tokens, produces an equivalent list corresponding
    to a line of code with the first nb characters removed.
    """
    line = untokenize(tokens)
    line = line[nb:]
    return tokenize(line)


def indent(tokens, nb, tab=False):
    """Given a list of tokens, produces an equivalent list corresponding
    to a line of code with nb space characters inserted at the beginning.

    If ``tab`` is specified to be ``True``, ``nb`` tab characters are inserted
    instead of spaces.
    """
    line = untokenize(tokens)
    if tab:
        line = "\t" * nb + line
    else:
        line = " " * nb + line
    return tokenize(line)
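
# Illustrative sketch (not part of the module): indent and dedent operate on a
# single line of code by re-tokenizing the adjusted text. Kept as a comment;
# the sample source is made up for the example.
#
#     tokens = tokenize("    x = 1")
#     untokenize(dedent(tokens, 4))             # -> "x = 1"
#     untokenize(indent(dedent(tokens, 4), 2))  # -> "  x = 1"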

def untokenize(tokens):
    """Return source code based on tokens.

    Adapted from https://github.com/myint/untokenize,
    Copyright (C) 2013-2018 Steven Myint, MIT License (same as this project).

    This is similar to Python's own tokenize.untokenize(), except that it
    preserves spacing between tokens, by using the line
    information recorded by Python's tokenize.generate_tokens.
    As a result, if the original source code had multiple spaces between
    some tokens, or if escaped newlines or tab characters
    were present in the original source, those will also be present
    in the source code produced by untokenize.

    Thus ``source == untokenize(tokenize(source))``.

    Note: if you are modifying tokens from an original source,
    instead of full token objects, ``untokenize`` will accept simple
    strings; however, it will only insert them *as is*, without taking them
    into account when it comes to figuring out spacing between tokens.
    """
    words = []
    previous_line = ""
    last_row = 0
    last_column = -1
    last_non_whitespace_token_type = None

    for token in tokens:
        if isinstance(token, str):
            words.append(token)
            continue
        if token.type == py_tokenize.ENCODING:
            continue

        # Preserve escaped newlines.
        if (
            last_non_whitespace_token_type != py_tokenize.COMMENT
            and token.start_row > last_row
        ):
            if previous_line.endswith(("\\\n", "\\\r\n", "\\\r")):
                words.append(previous_line[len(previous_line.rstrip(" \t\n\r\\")) :])

        # Preserve spacing.
        if token.start_row > last_row:
            last_column = 0
        if token.start_col > last_column:
            words.append(token.line[last_column : token.start_col])

        words.append(token.string)
        previous_line = token.line
        last_row = token.end_row
        last_column = token.end_col
        if not token.is_space():
            last_non_whitespace_token_type = token.type
    return "".join(words)

def print_tokens(source):
    """Prints all tokens found in source, grouped by line of code.

    ``source`` is either a string to be tokenized, or a list of Token objects.

    This is occasionally useful as a debugging tool.
    """
    if isinstance(source[0], Token):
        source = untokenize(source)

    for lines in get_lines(source):
        for token in lines:
            print(repr(token))
        print()
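
# Optional demo: a minimal sketch, not required by the library. Running this
# file directly prints the tokens of a short made-up sample, which can be
# handy when exploring what the Token attributes look like.
if __name__ == "__main__":
    sample = "total = 10 + 20  # simple example\n"
    print_tokens(sample)
    print("significant tokens:", get_number(tokenize(sample)))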