mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-15 12:06:25 +03:00
Merge remote-tracking branch 'upstream/master' into bugfix/tokenizer-special-cases-matcher
This commit is contained in:
commit
b097b0b83d
|
@ -1,7 +1,6 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import re
|
||||
from wasabi import Printer
|
||||
|
||||
from ...gold import iob_to_biluo
|
||||
|
|
|
@ -46,6 +46,11 @@ class GreekLemmatizer(object):
|
|||
)
|
||||
return lemmas
|
||||
|
||||
def lookup(self, string):
|
||||
if string in self.lookup_table:
|
||||
return self.lookup_table[string]
|
||||
return string
|
||||
|
||||
|
||||
def lemmatize(string, index, exceptions, rules):
|
||||
string = string.lower()
|
||||
|
|
|
@ -1,8 +1,6 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -15,7 +15,6 @@ _abbrev_exc = [
|
|||
{ORTH: "пет", LEMMA: "петак", NORM: "петак"},
|
||||
{ORTH: "суб", LEMMA: "субота", NORM: "субота"},
|
||||
{ORTH: "нед", LEMMA: "недеља", NORM: "недеља"},
|
||||
|
||||
# Months abbreviations
|
||||
{ORTH: "јан", LEMMA: "јануар", NORM: "јануар"},
|
||||
{ORTH: "феб", LEMMA: "фебруар", NORM: "фебруар"},
|
||||
|
@ -28,7 +27,7 @@ _abbrev_exc = [
|
|||
{ORTH: "септ", LEMMA: "септембар", NORM: "септембар"},
|
||||
{ORTH: "окт", LEMMA: "октобар", NORM: "октобар"},
|
||||
{ORTH: "нов", LEMMA: "новембар", NORM: "новембар"},
|
||||
{ORTH: "дец", LEMMA: "децембар", NORM: "децембар"}
|
||||
{ORTH: "дец", LEMMA: "децембар", NORM: "децембар"},
|
||||
]
|
||||
|
||||
|
||||
|
|
|
@ -103,7 +103,13 @@ def test_doc_retokenize_spans_merge_tokens_default_attrs(en_tokenizer):
|
|||
text = "The players start."
|
||||
heads = [1, 1, 0, -1]
|
||||
tokens = en_tokenizer(text)
|
||||
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], tags=["DT", "NN", "VBZ", "."], pos=["DET", "NOUN", "VERB", "PUNCT"], heads=heads)
|
||||
doc = get_doc(
|
||||
tokens.vocab,
|
||||
words=[t.text for t in tokens],
|
||||
tags=["DT", "NN", "VBZ", "."],
|
||||
pos=["DET", "NOUN", "VERB", "PUNCT"],
|
||||
heads=heads,
|
||||
)
|
||||
assert len(doc) == 4
|
||||
assert doc[0].text == "The"
|
||||
assert doc[0].tag_ == "DT"
|
||||
|
@ -115,7 +121,13 @@ def test_doc_retokenize_spans_merge_tokens_default_attrs(en_tokenizer):
|
|||
assert doc[0].tag_ == "NN"
|
||||
assert doc[0].pos_ == "NOUN"
|
||||
assert doc[0].lemma_ == "The players"
|
||||
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], tags=["DT", "NN", "VBZ", "."], pos=["DET", "NOUN", "VERB", "PUNCT"], heads=heads)
|
||||
doc = get_doc(
|
||||
tokens.vocab,
|
||||
words=[t.text for t in tokens],
|
||||
tags=["DT", "NN", "VBZ", "."],
|
||||
pos=["DET", "NOUN", "VERB", "PUNCT"],
|
||||
heads=heads,
|
||||
)
|
||||
assert len(doc) == 4
|
||||
assert doc[0].text == "The"
|
||||
assert doc[0].tag_ == "DT"
|
||||
|
@ -269,18 +281,15 @@ def test_doc_retokenize_spans_entity_merge_iob(en_vocab):
|
|||
|
||||
# if there is a parse, span.root provides default values
|
||||
words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
|
||||
heads = [ 0, -1, 1, -3, -4, -5, -1, -7, -8 ]
|
||||
ents = [
|
||||
(3, 5, "ent-de"),
|
||||
(5, 7, "ent-fg"),
|
||||
]
|
||||
deps = ["dep"] * len(words)
|
||||
heads = [0, -1, 1, -3, -4, -5, -1, -7, -8]
|
||||
ents = [(3, 5, "ent-de"), (5, 7, "ent-fg")]
|
||||
deps = ["dep"] * len(words)
|
||||
en_vocab.strings.add("ent-de")
|
||||
en_vocab.strings.add("ent-fg")
|
||||
en_vocab.strings.add("dep")
|
||||
doc = get_doc(en_vocab, words=words, heads=heads, deps=deps, ents=ents)
|
||||
assert doc[2:4].root == doc[3] # root of 'c d' is d
|
||||
assert doc[4:6].root == doc[4] # root is 'e f' is e
|
||||
assert doc[2:4].root == doc[3] # root of 'c d' is d
|
||||
assert doc[4:6].root == doc[4] # root is 'e f' is e
|
||||
with doc.retokenize() as retokenizer:
|
||||
retokenizer.merge(doc[2:4])
|
||||
retokenizer.merge(doc[4:6])
|
||||
|
@ -295,12 +304,9 @@ def test_doc_retokenize_spans_entity_merge_iob(en_vocab):
|
|||
|
||||
# check that B is preserved if span[start] is B
|
||||
words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
|
||||
heads = [ 0, -1, 1, 1, -4, -5, -1, -7, -8 ]
|
||||
ents = [
|
||||
(3, 5, "ent-de"),
|
||||
(5, 7, "ent-de"),
|
||||
]
|
||||
deps = ["dep"] * len(words)
|
||||
heads = [0, -1, 1, 1, -4, -5, -1, -7, -8]
|
||||
ents = [(3, 5, "ent-de"), (5, 7, "ent-de")]
|
||||
deps = ["dep"] * len(words)
|
||||
doc = get_doc(en_vocab, words=words, heads=heads, deps=deps, ents=ents)
|
||||
with doc.retokenize() as retokenizer:
|
||||
retokenizer.merge(doc[3:5])
|
||||
|
|
|
@ -14,24 +14,24 @@ from spacy.symbols import ORTH, LEMMA, POS, VERB, VerbForm_part
|
|||
|
||||
|
||||
def test_issue1061():
|
||||
'''Test special-case works after tokenizing. Was caching problem.'''
|
||||
text = 'I like _MATH_ even _MATH_ when _MATH_, except when _MATH_ is _MATH_! but not _MATH_.'
|
||||
"""Test special-case works after tokenizing. Was caching problem."""
|
||||
text = "I like _MATH_ even _MATH_ when _MATH_, except when _MATH_ is _MATH_! but not _MATH_."
|
||||
tokenizer = English.Defaults.create_tokenizer()
|
||||
doc = tokenizer(text)
|
||||
assert 'MATH' in [w.text for w in doc]
|
||||
assert '_MATH_' not in [w.text for w in doc]
|
||||
assert "MATH" in [w.text for w in doc]
|
||||
assert "_MATH_" not in [w.text for w in doc]
|
||||
|
||||
tokenizer.add_special_case('_MATH_', [{ORTH: '_MATH_'}])
|
||||
tokenizer.add_special_case("_MATH_", [{ORTH: "_MATH_"}])
|
||||
doc = tokenizer(text)
|
||||
assert '_MATH_' in [w.text for w in doc]
|
||||
assert 'MATH' not in [w.text for w in doc]
|
||||
assert "_MATH_" in [w.text for w in doc]
|
||||
assert "MATH" not in [w.text for w in doc]
|
||||
|
||||
# For sanity, check it works when pipeline is clean.
|
||||
tokenizer = English.Defaults.create_tokenizer()
|
||||
tokenizer.add_special_case('_MATH_', [{ORTH: '_MATH_'}])
|
||||
tokenizer.add_special_case("_MATH_", [{ORTH: "_MATH_"}])
|
||||
doc = tokenizer(text)
|
||||
assert '_MATH_' in [w.text for w in doc]
|
||||
assert 'MATH' not in [w.text for w in doc]
|
||||
assert "_MATH_" in [w.text for w in doc]
|
||||
assert "MATH" not in [w.text for w in doc]
|
||||
|
||||
|
||||
@pytest.mark.xfail(
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
from spacy.matcher import Matcher
|
||||
from spacy.tokens import Doc
|
||||
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
from spacy.matcher import Matcher
|
||||
from spacy.tokens import Doc
|
||||
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
from spacy.matcher import PhraseMatcher
|
||||
from spacy.tokens import Doc
|
||||
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
from spacy.matcher import Matcher
|
||||
from spacy.tokens import Doc
|
||||
|
||||
|
|
|
@ -2,44 +2,37 @@
|
|||
from __future__ import unicode_literals
|
||||
|
||||
from spacy.lang.en import English
|
||||
|
||||
import spacy
|
||||
from spacy.tokenizer import Tokenizer
|
||||
from spacy import util
|
||||
|
||||
from spacy.tests.util import make_tempdir
|
||||
from ..util import make_tempdir
|
||||
|
||||
|
||||
def test_issue4190():
|
||||
test_string = "Test c."
|
||||
|
||||
# Load default language
|
||||
nlp_1 = English()
|
||||
doc_1a = nlp_1(test_string)
|
||||
result_1a = [token.text for token in doc_1a]
|
||||
|
||||
result_1a = [token.text for token in doc_1a] # noqa: F841
|
||||
# Modify tokenizer
|
||||
customize_tokenizer(nlp_1)
|
||||
doc_1b = nlp_1(test_string)
|
||||
result_1b = [token.text for token in doc_1b]
|
||||
|
||||
# Save and Reload
|
||||
with make_tempdir() as model_dir:
|
||||
nlp_1.to_disk(model_dir)
|
||||
nlp_2 = spacy.load(model_dir)
|
||||
|
||||
nlp_2 = util.load_model(model_dir)
|
||||
# This should be the modified tokenizer
|
||||
doc_2 = nlp_2(test_string)
|
||||
result_2 = [token.text for token in doc_2]
|
||||
|
||||
assert result_1b == result_2
|
||||
|
||||
|
||||
def customize_tokenizer(nlp):
|
||||
prefix_re = spacy.util.compile_prefix_regex(nlp.Defaults.prefixes)
|
||||
suffix_re = spacy.util.compile_suffix_regex(nlp.Defaults.suffixes)
|
||||
infix_re = spacy.util.compile_infix_regex(nlp.Defaults.infixes)
|
||||
|
||||
# remove all exceptions where a single letter is followed by a period (e.g. 'h.')
|
||||
prefix_re = util.compile_prefix_regex(nlp.Defaults.prefixes)
|
||||
suffix_re = util.compile_suffix_regex(nlp.Defaults.suffixes)
|
||||
infix_re = util.compile_infix_regex(nlp.Defaults.infixes)
|
||||
# Remove all exceptions where a single letter is followed by a period (e.g. 'h.')
|
||||
exceptions = {
|
||||
k: v
|
||||
for k, v in dict(nlp.Defaults.tokenizer_exceptions).items()
|
||||
|
@ -53,5 +46,4 @@ def customize_tokenizer(nlp):
|
|||
infix_finditer=infix_re.finditer,
|
||||
token_match=nlp.tokenizer.token_match,
|
||||
)
|
||||
|
||||
nlp.tokenizer = new_tokenizer
|
||||
|
|
12
spacy/tests/regression/test_issue4272.py
Normal file
12
spacy/tests/regression/test_issue4272.py
Normal file
|
@ -0,0 +1,12 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from spacy.lang.el import Greek
|
||||
|
||||
|
||||
def test_issue4272():
|
||||
"""Test that lookup table can be accessed from Token.lemma if no POS tags
|
||||
are available."""
|
||||
nlp = Greek()
|
||||
doc = nlp("Χθες")
|
||||
assert doc[0].lemma_
|
|
@ -56,6 +56,7 @@ def test_lookups_to_from_bytes():
|
|||
assert table2.get("b") == 2
|
||||
assert new_lookups.to_bytes() == lookups_bytes
|
||||
|
||||
|
||||
# This fails on Python 3.5
|
||||
@pytest.mark.xfail
|
||||
def test_lookups_to_from_disk():
|
||||
|
@ -76,6 +77,7 @@ def test_lookups_to_from_disk():
|
|||
assert len(table2) == 3
|
||||
assert table2.get("b") == 2
|
||||
|
||||
|
||||
# This fails on Python 3.5
|
||||
@pytest.mark.xfail
|
||||
def test_lookups_to_from_bytes_via_vocab():
|
||||
|
|
Loading…
Reference in New Issue
Block a user