Merge remote-tracking branch 'upstream/master' into bugfix/tokenizer-special-cases-matcher

This commit is contained in:
Adriane Boyd 2019-09-11 15:23:03 +02:00
commit b097b0b83d
14 changed files with 1710 additions and 2132 deletions

View File

@ -1,7 +1,6 @@
# coding: utf8 # coding: utf8
from __future__ import unicode_literals from __future__ import unicode_literals
import re
from wasabi import Printer from wasabi import Printer
from ...gold import iob_to_biluo from ...gold import iob_to_biluo

View File

@ -46,6 +46,11 @@ class GreekLemmatizer(object):
) )
return lemmas return lemmas
def lookup(self, string):
if string in self.lookup_table:
return self.lookup_table[string]
return string
def lemmatize(string, index, exceptions, rules): def lemmatize(string, index, exceptions, rules):
string = string.lower() string = string.lower()

View File

@ -1,8 +1,6 @@
# coding: utf8 # coding: utf8
from __future__ import unicode_literals from __future__ import unicode_literals
from pathlib import Path
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS

File diff suppressed because it is too large Load Diff

View File

@ -15,7 +15,6 @@ _abbrev_exc = [
{ORTH: "пет", LEMMA: "петак", NORM: "петак"}, {ORTH: "пет", LEMMA: "петак", NORM: "петак"},
{ORTH: "суб", LEMMA: "субота", NORM: "субота"}, {ORTH: "суб", LEMMA: "субота", NORM: "субота"},
{ORTH: "нед", LEMMA: "недеља", NORM: "недеља"}, {ORTH: "нед", LEMMA: "недеља", NORM: "недеља"},
# Months abbreviations # Months abbreviations
{ORTH: "јан", LEMMA: "јануар", NORM: "јануар"}, {ORTH: "јан", LEMMA: "јануар", NORM: "јануар"},
{ORTH: "феб", LEMMA: "фебруар", NORM: "фебруар"}, {ORTH: "феб", LEMMA: "фебруар", NORM: "фебруар"},
@ -28,7 +27,7 @@ _abbrev_exc = [
{ORTH: "септ", LEMMA: "септембар", NORM: "септембар"}, {ORTH: "септ", LEMMA: "септембар", NORM: "септембар"},
{ORTH: "окт", LEMMA: "октобар", NORM: "октобар"}, {ORTH: "окт", LEMMA: "октобар", NORM: "октобар"},
{ORTH: "нов", LEMMA: "новембар", NORM: "новембар"}, {ORTH: "нов", LEMMA: "новембар", NORM: "новембар"},
{ORTH: "дец", LEMMA: "децембар", NORM: "децембар"} {ORTH: "дец", LEMMA: "децембар", NORM: "децембар"},
] ]

View File

@ -103,7 +103,13 @@ def test_doc_retokenize_spans_merge_tokens_default_attrs(en_tokenizer):
text = "The players start." text = "The players start."
heads = [1, 1, 0, -1] heads = [1, 1, 0, -1]
tokens = en_tokenizer(text) tokens = en_tokenizer(text)
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], tags=["DT", "NN", "VBZ", "."], pos=["DET", "NOUN", "VERB", "PUNCT"], heads=heads) doc = get_doc(
tokens.vocab,
words=[t.text for t in tokens],
tags=["DT", "NN", "VBZ", "."],
pos=["DET", "NOUN", "VERB", "PUNCT"],
heads=heads,
)
assert len(doc) == 4 assert len(doc) == 4
assert doc[0].text == "The" assert doc[0].text == "The"
assert doc[0].tag_ == "DT" assert doc[0].tag_ == "DT"
@ -115,7 +121,13 @@ def test_doc_retokenize_spans_merge_tokens_default_attrs(en_tokenizer):
assert doc[0].tag_ == "NN" assert doc[0].tag_ == "NN"
assert doc[0].pos_ == "NOUN" assert doc[0].pos_ == "NOUN"
assert doc[0].lemma_ == "The players" assert doc[0].lemma_ == "The players"
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], tags=["DT", "NN", "VBZ", "."], pos=["DET", "NOUN", "VERB", "PUNCT"], heads=heads) doc = get_doc(
tokens.vocab,
words=[t.text for t in tokens],
tags=["DT", "NN", "VBZ", "."],
pos=["DET", "NOUN", "VERB", "PUNCT"],
heads=heads,
)
assert len(doc) == 4 assert len(doc) == 4
assert doc[0].text == "The" assert doc[0].text == "The"
assert doc[0].tag_ == "DT" assert doc[0].tag_ == "DT"
@ -269,18 +281,15 @@ def test_doc_retokenize_spans_entity_merge_iob(en_vocab):
# if there is a parse, span.root provides default values # if there is a parse, span.root provides default values
words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"] words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
heads = [ 0, -1, 1, -3, -4, -5, -1, -7, -8 ] heads = [0, -1, 1, -3, -4, -5, -1, -7, -8]
ents = [ ents = [(3, 5, "ent-de"), (5, 7, "ent-fg")]
(3, 5, "ent-de"), deps = ["dep"] * len(words)
(5, 7, "ent-fg"),
]
deps = ["dep"] * len(words)
en_vocab.strings.add("ent-de") en_vocab.strings.add("ent-de")
en_vocab.strings.add("ent-fg") en_vocab.strings.add("ent-fg")
en_vocab.strings.add("dep") en_vocab.strings.add("dep")
doc = get_doc(en_vocab, words=words, heads=heads, deps=deps, ents=ents) doc = get_doc(en_vocab, words=words, heads=heads, deps=deps, ents=ents)
assert doc[2:4].root == doc[3] # root of 'c d' is d assert doc[2:4].root == doc[3] # root of 'c d' is d
assert doc[4:6].root == doc[4] # root of 'e f' is e assert doc[4:6].root == doc[4] # root of 'e f' is e
with doc.retokenize() as retokenizer: with doc.retokenize() as retokenizer:
retokenizer.merge(doc[2:4]) retokenizer.merge(doc[2:4])
retokenizer.merge(doc[4:6]) retokenizer.merge(doc[4:6])
@ -295,12 +304,9 @@ def test_doc_retokenize_spans_entity_merge_iob(en_vocab):
# check that B is preserved if span[start] is B # check that B is preserved if span[start] is B
words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"] words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
heads = [ 0, -1, 1, 1, -4, -5, -1, -7, -8 ] heads = [0, -1, 1, 1, -4, -5, -1, -7, -8]
ents = [ ents = [(3, 5, "ent-de"), (5, 7, "ent-de")]
(3, 5, "ent-de"), deps = ["dep"] * len(words)
(5, 7, "ent-de"),
]
deps = ["dep"] * len(words)
doc = get_doc(en_vocab, words=words, heads=heads, deps=deps, ents=ents) doc = get_doc(en_vocab, words=words, heads=heads, deps=deps, ents=ents)
with doc.retokenize() as retokenizer: with doc.retokenize() as retokenizer:
retokenizer.merge(doc[3:5]) retokenizer.merge(doc[3:5])

View File

@ -14,24 +14,24 @@ from spacy.symbols import ORTH, LEMMA, POS, VERB, VerbForm_part
def test_issue1061(): def test_issue1061():
'''Test special-case works after tokenizing. Was caching problem.''' """Test special-case works after tokenizing. Was caching problem."""
text = 'I like _MATH_ even _MATH_ when _MATH_, except when _MATH_ is _MATH_! but not _MATH_.' text = "I like _MATH_ even _MATH_ when _MATH_, except when _MATH_ is _MATH_! but not _MATH_."
tokenizer = English.Defaults.create_tokenizer() tokenizer = English.Defaults.create_tokenizer()
doc = tokenizer(text) doc = tokenizer(text)
assert 'MATH' in [w.text for w in doc] assert "MATH" in [w.text for w in doc]
assert '_MATH_' not in [w.text for w in doc] assert "_MATH_" not in [w.text for w in doc]
tokenizer.add_special_case('_MATH_', [{ORTH: '_MATH_'}]) tokenizer.add_special_case("_MATH_", [{ORTH: "_MATH_"}])
doc = tokenizer(text) doc = tokenizer(text)
assert '_MATH_' in [w.text for w in doc] assert "_MATH_" in [w.text for w in doc]
assert 'MATH' not in [w.text for w in doc] assert "MATH" not in [w.text for w in doc]
# For sanity, check it works when pipeline is clean. # For sanity, check it works when pipeline is clean.
tokenizer = English.Defaults.create_tokenizer() tokenizer = English.Defaults.create_tokenizer()
tokenizer.add_special_case('_MATH_', [{ORTH: '_MATH_'}]) tokenizer.add_special_case("_MATH_", [{ORTH: "_MATH_"}])
doc = tokenizer(text) doc = tokenizer(text)
assert '_MATH_' in [w.text for w in doc] assert "_MATH_" in [w.text for w in doc]
assert 'MATH' not in [w.text for w in doc] assert "MATH" not in [w.text for w in doc]
@pytest.mark.xfail( @pytest.mark.xfail(

View File

@ -1,7 +1,6 @@
# coding: utf8 # coding: utf8
from __future__ import unicode_literals from __future__ import unicode_literals
import pytest
from spacy.matcher import Matcher from spacy.matcher import Matcher
from spacy.tokens import Doc from spacy.tokens import Doc

View File

@ -1,7 +1,6 @@
# coding: utf8 # coding: utf8
from __future__ import unicode_literals from __future__ import unicode_literals
import pytest
from spacy.matcher import Matcher from spacy.matcher import Matcher
from spacy.tokens import Doc from spacy.tokens import Doc

View File

@ -1,7 +1,6 @@
# coding: utf8 # coding: utf8
from __future__ import unicode_literals from __future__ import unicode_literals
import pytest
from spacy.matcher import PhraseMatcher from spacy.matcher import PhraseMatcher
from spacy.tokens import Doc from spacy.tokens import Doc

View File

@ -1,7 +1,6 @@
# coding: utf8 # coding: utf8
from __future__ import unicode_literals from __future__ import unicode_literals
import pytest
from spacy.matcher import Matcher from spacy.matcher import Matcher
from spacy.tokens import Doc from spacy.tokens import Doc

View File

@ -2,44 +2,37 @@
from __future__ import unicode_literals from __future__ import unicode_literals
from spacy.lang.en import English from spacy.lang.en import English
import spacy
from spacy.tokenizer import Tokenizer from spacy.tokenizer import Tokenizer
from spacy import util
from spacy.tests.util import make_tempdir from ..util import make_tempdir
def test_issue4190(): def test_issue4190():
test_string = "Test c." test_string = "Test c."
# Load default language # Load default language
nlp_1 = English() nlp_1 = English()
doc_1a = nlp_1(test_string) doc_1a = nlp_1(test_string)
result_1a = [token.text for token in doc_1a] result_1a = [token.text for token in doc_1a] # noqa: F841
# Modify tokenizer # Modify tokenizer
customize_tokenizer(nlp_1) customize_tokenizer(nlp_1)
doc_1b = nlp_1(test_string) doc_1b = nlp_1(test_string)
result_1b = [token.text for token in doc_1b] result_1b = [token.text for token in doc_1b]
# Save and Reload # Save and Reload
with make_tempdir() as model_dir: with make_tempdir() as model_dir:
nlp_1.to_disk(model_dir) nlp_1.to_disk(model_dir)
nlp_2 = spacy.load(model_dir) nlp_2 = util.load_model(model_dir)
# This should be the modified tokenizer # This should be the modified tokenizer
doc_2 = nlp_2(test_string) doc_2 = nlp_2(test_string)
result_2 = [token.text for token in doc_2] result_2 = [token.text for token in doc_2]
assert result_1b == result_2 assert result_1b == result_2
def customize_tokenizer(nlp): def customize_tokenizer(nlp):
prefix_re = spacy.util.compile_prefix_regex(nlp.Defaults.prefixes) prefix_re = util.compile_prefix_regex(nlp.Defaults.prefixes)
suffix_re = spacy.util.compile_suffix_regex(nlp.Defaults.suffixes) suffix_re = util.compile_suffix_regex(nlp.Defaults.suffixes)
infix_re = spacy.util.compile_infix_regex(nlp.Defaults.infixes) infix_re = util.compile_infix_regex(nlp.Defaults.infixes)
# Remove all exceptions where a single letter is followed by a period (e.g. 'h.')
# remove all exceptions where a single letter is followed by a period (e.g. 'h.')
exceptions = { exceptions = {
k: v k: v
for k, v in dict(nlp.Defaults.tokenizer_exceptions).items() for k, v in dict(nlp.Defaults.tokenizer_exceptions).items()
@ -53,5 +46,4 @@ def customize_tokenizer(nlp):
infix_finditer=infix_re.finditer, infix_finditer=infix_re.finditer,
token_match=nlp.tokenizer.token_match, token_match=nlp.tokenizer.token_match,
) )
nlp.tokenizer = new_tokenizer nlp.tokenizer = new_tokenizer

View File

@ -0,0 +1,12 @@
# coding: utf8
from __future__ import unicode_literals
from spacy.lang.el import Greek
def test_issue4272():
"""Test that lookup table can be accessed from Token.lemma if no POS tags
are available."""
nlp = Greek()
doc = nlp("Χθες")
assert doc[0].lemma_

View File

@ -56,6 +56,7 @@ def test_lookups_to_from_bytes():
assert table2.get("b") == 2 assert table2.get("b") == 2
assert new_lookups.to_bytes() == lookups_bytes assert new_lookups.to_bytes() == lookups_bytes
# This fails on Python 3.5 # This fails on Python 3.5
@pytest.mark.xfail @pytest.mark.xfail
def test_lookups_to_from_disk(): def test_lookups_to_from_disk():
@ -76,6 +77,7 @@ def test_lookups_to_from_disk():
assert len(table2) == 3 assert len(table2) == 3
assert table2.get("b") == 2 assert table2.get("b") == 2
# This fails on Python 3.5 # This fails on Python 3.5
@pytest.mark.xfail @pytest.mark.xfail
def test_lookups_to_from_bytes_via_vocab(): def test_lookups_to_from_bytes_via_vocab():