* Add detokenize method and test

2025-07-17 11:42:30 +03:00 · 2014-10-18 18:02:05 +11:00 · 2014-10-18 18:02:05 +11:00 · 12742f4f83
commit 12742f4f83
parent df110476d5
2 changed files with 43 additions and 0 deletions
--- a/spacy/util.py
+++ b/spacy/util.py
@ -81,3 +81,25 @@ def align_tokens(ref, indices):
        yield token, emit
        start = end
    assert not queue
 def detokenize(token_rules, words):
    """Group token indices into whitespace-delimited "chunks".

    To align with treebanks, return a list of chunks, where a chunk is a
    sequence of tokens that are not separated by whitespace in the actual
    string. Each chunk is a tuple of consecutive token indices, e.g.

    >>> detokenize(["ca<SEP>n't", '<SEP>!'], ["I", "ca", "n't", "!"])
    [(0,), (1, 2, 3)]

    Args:
        token_rules: iterable of rule strings. '<SEP>' marks a token boundary
            that should be glued together. A rule that starts (or ends) with
            '<SEP>' attaches to whatever token precedes (or follows) it.
        words: iterable of token strings. Tokens are joined with single
            spaces internally, so a token that itself contains whitespace or
            the literal '<SEP>' will throw the returned indices off.

    Returns:
        A list of tuples of token indices, one tuple per chunk, in order.

    Rules are applied in the order given, and a later rule can extend a
    chunk produced by an earlier one. Rule pieces only match whole tokens:
    '<SEP>.' merges a '.' token but leaves a '...' token alone.
    """
    # Imported locally so the function does not depend on the module header.
    import re

    sep = '<SEP>'
    # A token starts at the beginning of the string, after whitespace, or
    # right after a separator inserted by an earlier rule.
    token_start = r'(?:(?<!\S)|(?<=%s))' % re.escape(sep)
    # A token ends at the end of the string, before whitespace, or right
    # before a separator inserted by an earlier rule.
    token_end = r'(?:(?!\S)|(?=%s))' % re.escape(sep)

    string = ' '.join(words)
    for rule in token_rules:
        target = rule.replace(sep, ' ')
        if not target or target == rule:
            # Empty rule, or a rule without '<SEP>': nothing to merge.
            continue
        pattern = re.escape(target)
        # Anchor only the sides that name a concrete token; a side that
        # begins/ends with the separator is already delimited by the space.
        if not target.startswith(' '):
            pattern = token_start + pattern
        if not target.endswith(' '):
            pattern = pattern + token_end
        # A callable replacement inserts the rule text verbatim, without
        # backslash/group-reference processing.
        string = re.sub(pattern, lambda match, rule=rule: rule, string)

    positions = []
    i = 0
    for chunk in string.split():
        size = len(chunk.split(sep))
        positions.append(tuple(range(i, i + size)))
        i += size
    return positions
--- a/tests/test_detokenize.py
+++ b/tests/test_detokenize.py
@ -0,0 +1,21 @@
 from spacy.util import detokenize
 def test_punct():
    """Trailing punctuation tokens attach to the token before them."""
    rules = ('<SEP>,', '<SEP>.')
    words = 'Pierre Vinken , 61 years old .'.split()
    expected = [(0,), (1, 2), (3,), (4,), (5, 6)]
    result = detokenize(rules, words)
    assert result == expected
 def test_contractions():
    """A contraction split into two tokens is grouped into one chunk."""
    rules = ("ca<SEP>n't",)
    words = "I ca n't even".split()
    expected = [(0,), (1, 2), (3,)]
    result = detokenize(rules, words)
    assert result == expected
 def test_contractions_punct():
    """Punctuation after a contraction extends the contraction's chunk."""
    rules = ("ca<SEP>n't", '<SEP>!')
    words = "I ca n't !".split()
    expected = [(0,), (1, 2, 3)]
    result = detokenize(rules, words)
    assert result == expected