* Add detokenize method and test

Matthew Honnibal 2014-10-18 18:02:05 +11:00
parent df110476d5
commit 12742f4f83
2 changed files with 43 additions and 0 deletions


spacy/util.py
@@ -81,3 +81,25 @@ def align_tokens(ref, indices):
        yield token, emit
        start = end
    assert not queue

def detokenize(token_rules, words):
    """To align with treebanks, return a list of "chunks", where a chunk is a
    sequence of tokens that are separated by whitespace in actual strings. Each
    chunk should be a tuple of token indices, e.g.

    >>> detokenize(["ca<SEP>n't", '<SEP>!'], ["I", "ca", "n't", "!"])
    [(0,), (1, 2, 3)]
    """
    string = ' '.join(words)
    for subtoks in token_rules:
        # Algorithmically this is dumb, but writing a little list-based match
        # machine? Ain't nobody got time for that.
        string = string.replace(subtoks.replace('<SEP>', ' '), subtoks)
    positions = []
    i = 0
    for chunk in string.split():
        subtoks = chunk.split('<SEP>')
        positions.append(tuple(range(i, i + len(subtoks))))
        i += len(subtoks)
    return positions
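The string-rewriting trick is easiest to see on the contraction example from the docstring. Below is a minimal standalone trace (all names are local to this sketch, not part of the commit):

words = ["I", "ca", "n't", "!"]
token_rules = ["ca<SEP>n't", "<SEP>!"]

string = ' '.join(words)  # "I ca n't !"
for subtoks in token_rules:
    # "ca<SEP>n't" rewrites "ca n't" -> "ca<SEP>n't":  "I ca<SEP>n't !"
    # "<SEP>!"     rewrites " !"     -> "<SEP>!":      "I ca<SEP>n't<SEP>!"
    string = string.replace(subtoks.replace('<SEP>', ' '), subtoks)
assert string == "I ca<SEP>n't<SEP>!"

# Whitespace-splitting now yields one chunk per treebank "word", and
# splitting each chunk on <SEP> recovers the token indices it spans:
#   "I"                 -> (0,)
#   "ca<SEP>n't<SEP>!"  -> (1, 2, 3)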

tests/test_detokenize.py (new file)

@@ -0,0 +1,21 @@
from spacy.util import detokenize


def test_punct():
    tokens = 'Pierre Vinken , 61 years old .'.split()
    detoks = [(0,), (1, 2), (3,), (4,), (5, 6)]
    token_rules = ('<SEP>,', '<SEP>.')
    assert detokenize(token_rules, tokens) == detoks


def test_contractions():
    tokens = "I ca n't even".split()
    detoks = [(0,), (1, 2), (3,)]
    token_rules = ("ca<SEP>n't",)
    assert detokenize(token_rules, tokens) == detoks


def test_contractions_punct():
    tokens = "I ca n't !".split()
    detoks = [(0,), (1, 2, 3)]
    token_rules = ("ca<SEP>n't", '<SEP>!')
    assert detokenize(token_rules, tokens) == detoks
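Assuming pytest is installed (an assumption; the commit itself does not pin a test runner), the new tests can be run directly:

py.test tests/test_detokenize.py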