import re

import numpy
import pytest

from spacy.lang.en import English
from spacy.lang.de import German
from spacy.tokenizer import Tokenizer
from spacy.tokens import Doc
from spacy.training import Example
from spacy.util import compile_prefix_regex, compile_suffix_regex, ensure_path
from spacy.vocab import Vocab
from spacy.symbols import ORTH


@pytest.mark.issue(743)
def test_issue743():
    doc = Doc(Vocab(), ["hello", "world"])
    token = doc[0]
    s = set([token])
    items = list(s)
    assert items[0] is token


@pytest.mark.issue(801)
@pytest.mark.skip(
    reason="Cannot be fixed without variable-width lookbehinds, cf. PR #3218"
)
@pytest.mark.parametrize(
    "text,tokens",
    [
        ('"deserve,"--and', ['"', "deserve", ',"--', "and"]),
        ("exception;--exclusive", ["exception", ";--", "exclusive"]),
        ("day.--Is", ["day", ".--", "Is"]),
        ("refinement:--just", ["refinement", ":--", "just"]),
        ("memories?--To", ["memories", "?--", "To"]),
        ("Useful.=--Therefore", ["Useful", ".=--", "Therefore"]),
        ("=Hope.=--Pandora", ["=", "Hope", ".=--", "Pandora"]),
    ],
)
def test_issue801(en_tokenizer, text, tokens):
    """Test that special characters + hyphens are split correctly."""
    doc = en_tokenizer(text)
    assert len(doc) == len(tokens)
    assert [t.text for t in doc] == tokens


@pytest.mark.issue(1061)
def test_issue1061():
    """Test that special cases added after tokenizing work (was a caching problem)."""
    text = "I like _MATH_ even _MATH_ when _MATH_, except when _MATH_ is _MATH_! but not _MATH_."
    tokenizer = English().tokenizer
    doc = tokenizer(text)
    assert "MATH" in [w.text for w in doc]
    assert "_MATH_" not in [w.text for w in doc]

    tokenizer.add_special_case("_MATH_", [{ORTH: "_MATH_"}])
    doc = tokenizer(text)
    assert "_MATH_" in [w.text for w in doc]
    assert "MATH" not in [w.text for w in doc]

    # For sanity, check it works when the pipeline is clean.
    tokenizer = English().tokenizer
    tokenizer.add_special_case("_MATH_", [{ORTH: "_MATH_"}])
    doc = tokenizer(text)
    assert "_MATH_" in [w.text for w in doc]
    assert "MATH" not in [w.text for w in doc]
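

# Illustrative sketch, not part of the original suite: the documented way to
# register a multi-token special case on an existing tokenizer. The example
# string and split below are assumptions chosen purely for illustration.
def _sketch_add_special_case_split():
    nlp = English()
    # The ORTH values of a special case must concatenate back to the original
    # string, here "gim" + "me" == "gimme".
    nlp.tokenizer.add_special_case("gimme", [{ORTH: "gim"}, {ORTH: "me"}])
    return [t.text for t in nlp("gimme that")]  # expected: ["gim", "me", "that"]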


@pytest.mark.issue(1963)
def test_issue1963(en_tokenizer):
    """Test that merging tokens with the retokenizer resizes doc.tensor."""
    doc = en_tokenizer("a b c d")
    doc.tensor = numpy.ones((len(doc), 128), dtype="f")
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[0:2])
    assert len(doc) == 3
    assert doc.tensor.shape == (3, 128)
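

# Illustrative sketch, not part of the original suite: retokenizer.merge() can
# also set attributes on the merged token. The text and LEMMA value below are
# assumptions used only for illustration.
def _sketch_retokenize_merge_with_attrs():
    nlp = English()
    doc = nlp("New York is big")
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[0:2], attrs={"LEMMA": "New York"})
    return [t.text for t in doc]  # expected: ["New York", "is", "big"]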


@pytest.mark.skip(
    reason="Cannot be fixed without variable-width look-behind (which we don't want)"
)
@pytest.mark.issue(1235)
def test_issue1235():
    """Test that 'g' is not split off if preceded by a number and a letter."""
    nlp = English()
    testwords = "e2g 2g 52g"
    doc = nlp(testwords)
    assert len(doc) == 5
    assert doc[0].text == "e2g"
    assert doc[1].text == "2"
    assert doc[2].text == "g"
    assert doc[3].text == "52"
    assert doc[4].text == "g"


@pytest.mark.issue(1242)
def test_issue1242():
    nlp = English()
    doc = nlp("")
    assert len(doc) == 0
    docs = list(nlp.pipe(["", "hello"]))
    assert len(docs[0]) == 0
    assert len(docs[1]) == 1


@pytest.mark.issue(1257)
def test_issue1257():
    """Test that tokens compare correctly."""
    doc1 = Doc(Vocab(), words=["a", "b", "c"])
    doc2 = Doc(Vocab(), words=["a", "c", "e"])
    assert doc1[0] != doc2[0]
    assert not doc1[0] == doc2[0]


@pytest.mark.issue(1375)
def test_issue1375():
    """Test that token.nbor() raises IndexError for out-of-bounds access."""
    doc = Doc(Vocab(), words=["0", "1", "2"])
    with pytest.raises(IndexError):
        assert doc[0].nbor(-1)
    assert doc[1].nbor(-1).text == "0"
    with pytest.raises(IndexError):
        assert doc[2].nbor(1)
    assert doc[1].nbor(1).text == "2"


@pytest.mark.issue(1488)
def test_issue1488():
    """Test that the tokenizer can parse a DOT inside non-whitespace separators."""
    prefix_re = re.compile(r"""[\[\("']""")
    suffix_re = re.compile(r"""[\]\)"']""")
    infix_re = re.compile(r"""[-~\.]""")
    simple_url_re = re.compile(r"""^https?://""")

    def my_tokenizer(nlp):
        return Tokenizer(
            nlp.vocab,
            {},
            prefix_search=prefix_re.search,
            suffix_search=suffix_re.search,
            infix_finditer=infix_re.finditer,
            token_match=simple_url_re.match,
        )

    nlp = English()
    nlp.tokenizer = my_tokenizer(nlp)
    doc = nlp("This is a test.")
    for token in doc:
        assert token.text


@pytest.mark.issue(1494)
def test_issue1494():
    """Test that infix_finditer works correctly."""
    infix_re = re.compile(r"""[^a-z]""")
    test_cases = [
        ("token 123test", ["token", "1", "2", "3", "test"]),
        ("token 1test", ["token", "1test"]),
        ("hello...test", ["hello", ".", ".", ".", "test"]),
    ]

    def new_tokenizer(nlp):
        return Tokenizer(nlp.vocab, {}, infix_finditer=infix_re.finditer)

    nlp = English()
    nlp.tokenizer = new_tokenizer(nlp)
    for text, expected in test_cases:
        assert [token.text for token in nlp(text)] == expected
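

# Illustrative sketch, not part of the original suite: instead of building a
# Tokenizer from scratch as above, the default affix rules can be extended and
# recompiled with the same helpers. The extra suffix pattern here is an
# assumption chosen only to show the mechanism.
def _sketch_extend_default_suffixes():
    nlp = English()
    suffixes = list(nlp.Defaults.suffixes) + [r"-+$"]
    suffix_regex = compile_suffix_regex(suffixes)
    nlp.tokenizer.suffix_search = suffix_regex.search
    # A trailing run of hyphens should now be split off as its own suffix token.
    return [t.text for t in nlp("banana--")]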


@pytest.mark.skip(
    reason="Cannot be fixed without iterative looping between prefix/suffix and infix"
)
@pytest.mark.issue(2070)
def test_issue2070():
    """Test that a dot followed by a quote is handled appropriately."""
    # Problem: The dot is now properly split off, but the prefix/suffix rules
    # are not applied again afterwards. This means that the quote will still be
    # attached to the remaining token.
    nlp = English()
    doc = nlp('First sentence."A quoted sentence" he said ...')
    assert len(doc) == 11


@pytest.mark.issue(2926)
def test_issue2926(fr_tokenizer):
    """Test that the tokenizer correctly splits tokens ending in a digit that
    are separated by a slash (/).
    """
    doc = fr_tokenizer("Learn html5/css3/javascript/jquery")
    assert len(doc) == 8
    assert doc[0].text == "Learn"
    assert doc[1].text == "html5"
    assert doc[2].text == "/"
    assert doc[3].text == "css3"
    assert doc[4].text == "/"
    assert doc[5].text == "javascript"
    assert doc[6].text == "/"
    assert doc[7].text == "jquery"


@pytest.mark.parametrize(
    "text",
    [
        "ABLEItemColumn IAcceptance Limits of ErrorIn-Service Limits of ErrorColumn IIColumn IIIColumn IVColumn VComputed VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeCubic FeetCubic FeetCubic FeetCubic FeetCubic Feet1Up to 10.0100.0050.0100.005220.0200.0100.0200.010350.0360.0180.0360.0184100.0500.0250.0500.0255Over 100.5% of computed volume0.25% of computed volume0.5% of computed volume0.25% of computed volume TABLE ItemColumn IAcceptance Limits of ErrorIn-Service Limits of ErrorColumn IIColumn IIIColumn IVColumn VComputed VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeCubic FeetCubic FeetCubic FeetCubic FeetCubic Feet1Up to 10.0100.0050.0100.005220.0200.0100.0200.010350.0360.0180.0360.0184100.0500.0250.0500.0255Over 100.5% of computed volume0.25% of computed volume0.5% of computed volume0.25% of computed volume ItemColumn IAcceptance Limits of ErrorIn-Service Limits of ErrorColumn IIColumn IIIColumn IVColumn VComputed VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeCubic FeetCubic FeetCubic FeetCubic FeetCubic Feet1Up to 10.0100.0050.0100.005220.0200.0100.0200.010350.0360.0180.0360.0184100.0500.0250.0500.0255Over 100.5% of computed volume0.25% of computed volume0.5% of computed volume0.25% of computed volume",
        "oow.jspsearch.eventoracleopenworldsearch.technologyoraclesolarissearch.technologystoragesearch.technologylinuxsearch.technologyserverssearch.technologyvirtualizationsearch.technologyengineeredsystemspcodewwmkmppscem:",
    ],
)
@pytest.mark.issue(2626)
def test_issue2626_2835(en_tokenizer, text):
    """Check that the text doesn't cause an infinite loop in the tokenizer."""
    doc = en_tokenizer(text)
    assert doc


@pytest.mark.issue(2656)
def test_issue2656(en_tokenizer):
    """Test that the tokenizer correctly splits off punctuation after numbers
    with decimal points.
    """
    doc = en_tokenizer("I went for 40.3, and got home by 10.0.")
    assert len(doc) == 11
    assert doc[0].text == "I"
    assert doc[1].text == "went"
    assert doc[2].text == "for"
    assert doc[3].text == "40.3"
    assert doc[4].text == ","
    assert doc[5].text == "and"
    assert doc[6].text == "got"
    assert doc[7].text == "home"
    assert doc[8].text == "by"
    assert doc[9].text == "10.0"
    assert doc[10].text == "."


@pytest.mark.issue(2754)
def test_issue2754(en_tokenizer):
    """Test that words like 'a' and 'a.m.' don't get exceptional norm values."""
    a = en_tokenizer("a")
    assert a[0].norm_ == "a"
    am = en_tokenizer("am")
    assert am[0].norm_ == "am"


@pytest.mark.issue(3002)
def test_issue3002():
    """Test that the tokenizer doesn't hang on a long list of dots."""
    nlp = German()
    doc = nlp(
        "880.794.982.218.444.893.023.439.794.626.120.190.780.624.990.275.671 ist eine lange Zahl"
    )
    assert len(doc) == 5


@pytest.mark.skip(reason="default suffix rules avoid one upper-case letter before dot")
@pytest.mark.issue(3449)
def test_issue3449():
    nlp = English()
    nlp.add_pipe("sentencizer")
    text1 = "He gave the ball to I. Do you want to go to the movies with I?"
    text2 = "He gave the ball to I.  Do you want to go to the movies with I?"
    text3 = "He gave the ball to I.\nDo you want to go to the movies with I?"
    t1 = nlp(text1)
    t2 = nlp(text2)
    t3 = nlp(text3)
    assert t1[5].text == "I"
    assert t2[5].text == "I"
    assert t3[5].text == "I"


@pytest.mark.parametrize(
    "text,words", [("A'B C", ["A", "'", "B", "C"]), ("A-B", ["A-B"])]
)
def test_gold_misaligned(en_tokenizer, text, words):
    doc = en_tokenizer(text)
    Example.from_dict(doc, {"words": words})
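

# Illustrative sketch, not part of the original suite: Example.from_dict keeps
# the predicted tokenization and the reference annotations side by side, which
# is how misaligned gold tokens like the cases above are represented. The words
# used here are assumptions for illustration.
def _sketch_example_keeps_both_tokenizations():
    doc = Doc(Vocab(), words=["A'B", "C"])
    example = Example.from_dict(doc, {"words": ["A", "'", "B", "C"]})
    # example.predicted is the original doc; example.reference holds the gold tokens.
    return len(example.predicted), len(example.reference)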


def test_tokenizer_handles_no_word(tokenizer):
    tokens = tokenizer("")
    assert len(tokens) == 0


@pytest.mark.parametrize("text", ["lorem"])
def test_tokenizer_handles_single_word(tokenizer, text):
    tokens = tokenizer(text)
    assert tokens[0].text == text


def test_tokenizer_handles_punct(tokenizer):
    text = "Lorem, ipsum."
    tokens = tokenizer(text)
    assert len(tokens) == 4
    assert tokens[0].text == "Lorem"
    assert tokens[1].text == ","
    assert tokens[2].text == "ipsum"
    assert tokens[1].text != "Lorem"


def test_tokenizer_handles_punct_braces(tokenizer):
    text = "Lorem, (ipsum)."
    tokens = tokenizer(text)
    assert len(tokens) == 6


def test_tokenizer_handles_digits(tokenizer):
    exceptions = ["hu", "bn"]
    text = "Lorem ipsum: 1984."
    tokens = tokenizer(text)

    if tokens[0].lang_ not in exceptions:
        assert len(tokens) == 5
        assert tokens[0].text == "Lorem"
        assert tokens[3].text == "1984"


@pytest.mark.parametrize(
    "text",
    ["google.com", "python.org", "spacy.io", "explosion.ai", "http://www.google.com"],
)
def test_tokenizer_keep_urls(tokenizer, text):
    tokens = tokenizer(text)
    assert len(tokens) == 1


@pytest.mark.parametrize("text", ["NASDAQ:GOOG"])
def test_tokenizer_colons(tokenizer, text):
    tokens = tokenizer(text)
    assert len(tokens) == 3


@pytest.mark.parametrize(
    "text", ["hello123@example.com", "hi+there@gmail.it", "matt@explosion.ai"]
)
def test_tokenizer_keeps_email(tokenizer, text):
    tokens = tokenizer(text)
    assert len(tokens) == 1


def test_tokenizer_handles_long_text(tokenizer):
    text = """Lorem ipsum dolor sit amet, consectetur adipiscing elit

Cras egestas orci non porttitor maximus.
Maecenas quis odio id dolor rhoncus dignissim. Curabitur sed velit at orci ultrices sagittis. Nulla commodo euismod arcu eget vulputate.

Phasellus tincidunt, augue quis porta finibus, massa sapien consectetur augue, non lacinia enim nibh eget ipsum. Vestibulum in bibendum mauris.

"Nullam porta fringilla enim, a dictum orci consequat in." Mauris nec malesuada justo."""

    tokens = tokenizer(text)
    assert len(tokens) > 5


@pytest.mark.parametrize("file_name", ["sun.txt"])
def test_tokenizer_handle_text_from_file(tokenizer, file_name):
    loc = ensure_path(__file__).parent / file_name
    with loc.open("r", encoding="utf8") as infile:
        text = infile.read()
    assert len(text) != 0
    tokens = tokenizer(text)
    assert len(tokens) > 100


def test_tokenizer_suspected_freeing_strings(tokenizer):
    text1 = "Lorem dolor sit amet, consectetur adipiscing elit."
    text2 = "Lorem ipsum dolor sit amet, consectetur adipiscing elit."
    tokens1 = tokenizer(text1)
    tokens2 = tokenizer(text2)
    assert tokens1[0].text == "Lorem"
    assert tokens2[0].text == "Lorem"


@pytest.mark.parametrize("text,tokens", [("lorem", [{"orth": "lo"}, {"orth": "rem"}])])
def test_tokenizer_add_special_case(tokenizer, text, tokens):
    tokenizer.add_special_case(text, tokens)
    doc = tokenizer(text)
    assert doc[0].text == tokens[0]["orth"]
    assert doc[1].text == tokens[1]["orth"]


@pytest.mark.parametrize(
    "text,tokens",
    [
        ("lorem", [{"orth": "lo"}, {"orth": "re"}]),
        ("lorem", [{"orth": "lo", "tag": "A"}, {"orth": "rem"}]),
    ],
)
def test_tokenizer_validate_special_case(tokenizer, text, tokens):
    with pytest.raises(ValueError):
        tokenizer.add_special_case(text, tokens)


@pytest.mark.parametrize(
    "text,tokens", [("lorem", [{"orth": "lo", "norm": "LO"}, {"orth": "rem"}])]
)
def test_tokenizer_add_special_case_tag(text, tokens):
    vocab = Vocab()
    tokenizer = Tokenizer(vocab, {}, None, None, None)
    tokenizer.add_special_case(text, tokens)
    doc = tokenizer(text)
    assert doc[0].text == tokens[0]["orth"]
    assert doc[0].norm_ == tokens[0]["norm"]
    assert doc[1].text == tokens[1]["orth"]


def test_tokenizer_special_cases_with_affixes(tokenizer):
    text = '(((_SPECIAL_ A/B, A/B-A/B")'
    tokenizer.add_special_case("_SPECIAL_", [{"orth": "_SPECIAL_"}])
    tokenizer.add_special_case("A/B", [{"orth": "A/B"}])
    doc = tokenizer(text)
    assert [token.text for token in doc] == [
        "(",
        "(",
        "(",
        "_SPECIAL_",
        "A/B",
        ",",
        "A/B",
        "-",
        "A/B",
        '"',
        ")",
    ]


def test_tokenizer_special_cases_with_affixes_preserve_spacy():
    tokenizer = English().tokenizer
    # reset all special cases
    tokenizer.rules = {}

    # in-place modification (only merges)
    text = "''a'' "
    tokenizer.add_special_case("''", [{"ORTH": "''"}])
    assert tokenizer(text).text == text

    # not in-place (splits and merges)
    tokenizer.add_special_case("ab", [{"ORTH": "a"}, {"ORTH": "b"}])
    text = "ab ab ab ''ab ab'' ab'' ''ab"
    assert tokenizer(text).text == text


def test_tokenizer_special_cases_with_period(tokenizer):
    text = "_SPECIAL_."
    tokenizer.add_special_case("_SPECIAL_", [{"orth": "_SPECIAL_"}])
    doc = tokenizer(text)
    assert [token.text for token in doc] == ["_SPECIAL_", "."]


def test_tokenizer_special_cases_idx(tokenizer):
    text = "the _ID'X_"
    tokenizer.add_special_case("_ID'X_", [{"orth": "_ID"}, {"orth": "'X_"}])
    doc = tokenizer(text)
    assert doc[1].idx == 4
    assert doc[2].idx == 7


def test_tokenizer_special_cases_spaces(tokenizer):
    assert [t.text for t in tokenizer("a b c")] == ["a", "b", "c"]
    tokenizer.add_special_case("a b c", [{"ORTH": "a b c"}])
    assert [t.text for t in tokenizer("a b c")] == ["a b c"]


def test_tokenizer_flush_cache(en_vocab):
    suffix_re = re.compile(r"[\.]$")
    tokenizer = Tokenizer(
        en_vocab,
        suffix_search=suffix_re.search,
    )
    assert [t.text for t in tokenizer("a.")] == ["a", "."]
    tokenizer.suffix_search = None
    assert [t.text for t in tokenizer("a.")] == ["a."]


def test_tokenizer_flush_specials(en_vocab):
    suffix_re = re.compile(r"[\.]$")
    rules = {"a a": [{"ORTH": "a a"}]}
    tokenizer1 = Tokenizer(
        en_vocab,
        suffix_search=suffix_re.search,
        rules=rules,
    )
    assert [t.text for t in tokenizer1("a a.")] == ["a a", "."]
    tokenizer1.rules = {}
    assert [t.text for t in tokenizer1("a a.")] == ["a", "a", "."]
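

# Illustrative sketch, not part of the original suite: tokenizer.rules exposes
# the currently registered special cases as a dict keyed by the exception
# string, which is what the flush tests above reset by assigning an empty dict.
# The "_ABC_" case is an assumption used only for illustration.
def _sketch_inspect_special_case_rules():
    tokenizer = English().tokenizer
    tokenizer.add_special_case("_ABC_", [{ORTH: "_ABC_"}])
    return "_ABC_" in tokenizer.rules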


def test_tokenizer_prefix_suffix_overlap_lookbehind(en_vocab):
    # the prefix and suffix matches overlap in the suffix lookbehind
    prefixes = ["a(?=.)"]
    suffixes = [r"(?<=\w)\.", r"(?<=a)\d+\."]
    prefix_re = compile_prefix_regex(prefixes)
    suffix_re = compile_suffix_regex(suffixes)
    tokenizer = Tokenizer(
        en_vocab,
        prefix_search=prefix_re.search,
        suffix_search=suffix_re.search,
    )
    tokens = [t.text for t in tokenizer("a10.")]
    assert tokens == ["a", "10", "."]
    explain_tokens = [t[1] for t in tokenizer.explain("a10.")]
    assert tokens == explain_tokens
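

# Illustrative sketch, not part of the original suite: tokenizer.explain() is
# also handy for debugging, since it reports which rule or pattern produced
# each token. The input text is an assumption used only for illustration.
def _sketch_explain_for_debugging():
    tokenizer = English().tokenizer
    # Each entry is a (rule_or_pattern_name, token_text) pair.
    return [(rule, text) for rule, text in tokenizer.explain('"Let\'s go!"')]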