mirror of https://github.com/explosion/spaCy.git (synced 2024-12-26 01:46:28 +03:00)
* Add detokenize method and test
This commit is contained in:
parent df110476d5
commit 12742f4f83
spacy/util.py
@@ -81,3 +81,25 @@ def align_tokens(ref, indices):
        yield token, emit
        start = end
    assert not queue


def detokenize(token_rules, words):
    """To align with treebanks, return a list of "chunks", where a chunk is a
    sequence of tokens that are separated by whitespace in actual strings. Each
    chunk should be a tuple of token indices, e.g.

    >>> detokenize(["ca<SEP>n't", '<SEP>!'], ["I", "ca", "n't", "!"])
    [(0,), (1, 2, 3)]
    """
    string = ' '.join(words)
    for subtoks in token_rules:
        # Algorithmically this is dumb, but writing a little list-based match
        # machine? Ain't nobody got time for that.
        string = string.replace(subtoks.replace('<SEP>', ' '), subtoks)
    positions = []
    i = 0
    for chunk in string.split():
        subtoks = chunk.split('<SEP>')
        positions.append(tuple(range(i, i+len(subtoks))))
        i += len(subtoks)
    return positions
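For reference, a minimal sketch (not part of the commit) of what the replace pass does: joining the words with single spaces and substituting each rule's spaced-out form with its <SEP> form leaves exactly one whitespace-delimited chunk per treebank chunk, which the final loop then converts into tuples of token indices.

    words = ["I", "ca", "n't", "!"]
    token_rules = ["ca<SEP>n't", "<SEP>!"]

    string = ' '.join(words)                # "I ca n't !"
    for rule in token_rules:
        # "ca n't" -> "ca<SEP>n't", then " !" -> "<SEP>!"
        string = string.replace(rule.replace('<SEP>', ' '), rule)

    print(string)          # "I ca<SEP>n't<SEP>!"
    print(string.split())  # ['I', "ca<SEP>n't<SEP>!"] -> positions [(0,), (1, 2, 3)]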
tests/test_detokenize.py (Normal file, 21 lines)
@@ -0,0 +1,21 @@
from spacy.util import detokenize

def test_punct():
    tokens = 'Pierre Vinken , 61 years old .'.split()
    detoks = [(0,), (1, 2), (3,), (4,), (5, 6)]
    token_rules = ('<SEP>,', '<SEP>.')
    assert detokenize(token_rules, tokens) == detoks


def test_contractions():
    tokens = "I ca n't even".split()
    detoks = [(0,), (1, 2), (3,)]
    token_rules = ("ca<SEP>n't",)
    assert detokenize(token_rules, tokens) == detoks


def test_contractions_punct():
    tokens = "I ca n't !".split()
    detoks = [(0,), (1, 2, 3)]
    token_rules = ("ca<SEP>n't", '<SEP>!')
    assert detokenize(token_rules, tokens) == detoks
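The positions returned by detokenize can be used to group the flat token list back into surface chunks; a small illustrative snippet (assumed usage, not part of the commit):

    from spacy.util import detokenize

    words = "I ca n't !".split()
    positions = detokenize(("ca<SEP>n't", '<SEP>!'), words)
    chunks = [''.join(words[i] for i in pos) for pos in positions]
    assert chunks == ['I', "can't!"]   # one entry per whitespace-delimited chunk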