spaCy/spacy/tests/regression/test_issue1061.py

from __future__ import unicode_literals

from ...symbols import ORTH

from ...vocab import Vocab
from ...en import English


def test_issue1061():
    """Regression test for issue #1061 (previously a caching problem).

    A special-case rule that is added after the tokenizer has already
    processed a text must be honoured the next time that same text is
    tokenized.
    """
    sentence = 'I like _MATH_ even _MATH_ when _MATH_, except when _MATH_ is _MATH_! but not _MATH_.'

    # Without any special case: the bare word shows up, the underscored
    # form does not.
    used_tokenizer = English.Defaults.create_tokenizer()
    words = [token.text for token in used_tokenizer(sentence)]
    assert 'MATH' in words
    assert '_MATH_' not in words

    # Register the special case on the tokenizer that has already seen the
    # text, then tokenize the same text again.
    used_tokenizer.add_special_case('_MATH_', [{ORTH: '_MATH_'}])
    words = [token.text for token in used_tokenizer(sentence)]
    assert '_MATH_' in words
    assert 'MATH' not in words

    # Sanity check: a brand-new tokenizer given the rule up front must
    # produce the same outcome.
    fresh_tokenizer = English.Defaults.create_tokenizer()
    fresh_tokenizer.add_special_case('_MATH_', [{ORTH: '_MATH_'}])
    words = [token.text for token in fresh_tokenizer(sentence)]
    assert '_MATH_' in words
    assert 'MATH' not in words
Add flush_cache method to tokenizer, to fix #1061. The tokenizer caches output for common chunks, for efficiency. This cache must be invalidated when the tokenizer rules change, e.g. when a new special-case rule is introduced. That's what was causing #1061. When the cache is flushed, we free the intermediate token chunks. I think this is safe --- but if we start getting segfaults, this patch is to blame. The resolution would be to simply not free those bits of memory. They'll be freed when the tokenizer exits anyway. 2017-07-22 16:06:50 +03:00			`from __future__ import unicode_literals`

			`from ...symbols import ORTH`

			`from ...vocab import Vocab`
			`from ...en import English`


			`def test_issue1061():`
			`'''Test special-case works after tokenizing. Was caching problem.'''`
			`text = 'I like _MATH_ even _MATH_ when _MATH_, except when _MATH_ is _MATH_! but not _MATH_.'`
			`tokenizer = English.Defaults.create_tokenizer()`
			`doc = tokenizer(text)`
			`assert 'MATH' in [w.text for w in doc]`
			`assert '_MATH_' not in [w.text for w in doc]`

			`tokenizer.add_special_case('_MATH_', [{ORTH: '_MATH_'}])`
			`doc = tokenizer(text)`
			`assert '_MATH_' in [w.text for w in doc]`
			`assert 'MATH' not in [w.text for w in doc]`

			`# For sanity, check it works when pipeline is clean.`
			`tokenizer = English.Defaults.create_tokenizer()`
			`tokenizer.add_special_case('_MATH_', [{ORTH: '_MATH_'}])`
			`doc = tokenizer(text)`
			`assert '_MATH_' in [w.text for w in doc]`
			`assert 'MATH' not in [w.text for w in doc]`