mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-15 20:16:23 +03:00
Merge remote-tracking branch 'upstream/master' into bugfix/tokenizer-special-cases-matcher
This commit is contained in:
commit
b097b0b83d
|
@ -1,7 +1,6 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
import re
|
|
||||||
from wasabi import Printer
|
from wasabi import Printer
|
||||||
|
|
||||||
from ...gold import iob_to_biluo
|
from ...gold import iob_to_biluo
|
||||||
|
|
|
@ -46,6 +46,11 @@ class GreekLemmatizer(object):
|
||||||
)
|
)
|
||||||
return lemmas
|
return lemmas
|
||||||
|
|
||||||
|
def lookup(self, string):
|
||||||
|
if string in self.lookup_table:
|
||||||
|
return self.lookup_table[string]
|
||||||
|
return string
|
||||||
|
|
||||||
|
|
||||||
def lemmatize(string, index, exceptions, rules):
|
def lemmatize(string, index, exceptions, rules):
|
||||||
string = string.lower()
|
string = string.lower()
|
||||||
|
|
|
@ -1,8 +1,6 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from .lex_attrs import LEX_ATTRS
|
from .lex_attrs import LEX_ATTRS
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -15,7 +15,6 @@ _abbrev_exc = [
|
||||||
{ORTH: "пет", LEMMA: "петак", NORM: "петак"},
|
{ORTH: "пет", LEMMA: "петак", NORM: "петак"},
|
||||||
{ORTH: "суб", LEMMA: "субота", NORM: "субота"},
|
{ORTH: "суб", LEMMA: "субота", NORM: "субота"},
|
||||||
{ORTH: "нед", LEMMA: "недеља", NORM: "недеља"},
|
{ORTH: "нед", LEMMA: "недеља", NORM: "недеља"},
|
||||||
|
|
||||||
# Months abbreviations
|
# Months abbreviations
|
||||||
{ORTH: "јан", LEMMA: "јануар", NORM: "јануар"},
|
{ORTH: "јан", LEMMA: "јануар", NORM: "јануар"},
|
||||||
{ORTH: "феб", LEMMA: "фебруар", NORM: "фебруар"},
|
{ORTH: "феб", LEMMA: "фебруар", NORM: "фебруар"},
|
||||||
|
@ -28,7 +27,7 @@ _abbrev_exc = [
|
||||||
{ORTH: "септ", LEMMA: "септембар", NORM: "септембар"},
|
{ORTH: "септ", LEMMA: "септембар", NORM: "септембар"},
|
||||||
{ORTH: "окт", LEMMA: "октобар", NORM: "октобар"},
|
{ORTH: "окт", LEMMA: "октобар", NORM: "октобар"},
|
||||||
{ORTH: "нов", LEMMA: "новембар", NORM: "новембар"},
|
{ORTH: "нов", LEMMA: "новембар", NORM: "новембар"},
|
||||||
{ORTH: "дец", LEMMA: "децембар", NORM: "децембар"}
|
{ORTH: "дец", LEMMA: "децембар", NORM: "децембар"},
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -103,7 +103,13 @@ def test_doc_retokenize_spans_merge_tokens_default_attrs(en_tokenizer):
|
||||||
text = "The players start."
|
text = "The players start."
|
||||||
heads = [1, 1, 0, -1]
|
heads = [1, 1, 0, -1]
|
||||||
tokens = en_tokenizer(text)
|
tokens = en_tokenizer(text)
|
||||||
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], tags=["DT", "NN", "VBZ", "."], pos=["DET", "NOUN", "VERB", "PUNCT"], heads=heads)
|
doc = get_doc(
|
||||||
|
tokens.vocab,
|
||||||
|
words=[t.text for t in tokens],
|
||||||
|
tags=["DT", "NN", "VBZ", "."],
|
||||||
|
pos=["DET", "NOUN", "VERB", "PUNCT"],
|
||||||
|
heads=heads,
|
||||||
|
)
|
||||||
assert len(doc) == 4
|
assert len(doc) == 4
|
||||||
assert doc[0].text == "The"
|
assert doc[0].text == "The"
|
||||||
assert doc[0].tag_ == "DT"
|
assert doc[0].tag_ == "DT"
|
||||||
|
@ -115,7 +121,13 @@ def test_doc_retokenize_spans_merge_tokens_default_attrs(en_tokenizer):
|
||||||
assert doc[0].tag_ == "NN"
|
assert doc[0].tag_ == "NN"
|
||||||
assert doc[0].pos_ == "NOUN"
|
assert doc[0].pos_ == "NOUN"
|
||||||
assert doc[0].lemma_ == "The players"
|
assert doc[0].lemma_ == "The players"
|
||||||
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], tags=["DT", "NN", "VBZ", "."], pos=["DET", "NOUN", "VERB", "PUNCT"], heads=heads)
|
doc = get_doc(
|
||||||
|
tokens.vocab,
|
||||||
|
words=[t.text for t in tokens],
|
||||||
|
tags=["DT", "NN", "VBZ", "."],
|
||||||
|
pos=["DET", "NOUN", "VERB", "PUNCT"],
|
||||||
|
heads=heads,
|
||||||
|
)
|
||||||
assert len(doc) == 4
|
assert len(doc) == 4
|
||||||
assert doc[0].text == "The"
|
assert doc[0].text == "The"
|
||||||
assert doc[0].tag_ == "DT"
|
assert doc[0].tag_ == "DT"
|
||||||
|
@ -269,18 +281,15 @@ def test_doc_retokenize_spans_entity_merge_iob(en_vocab):
|
||||||
|
|
||||||
# if there is a parse, span.root provides default values
|
# if there is a parse, span.root provides default values
|
||||||
words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
|
words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
|
||||||
heads = [ 0, -1, 1, -3, -4, -5, -1, -7, -8 ]
|
heads = [0, -1, 1, -3, -4, -5, -1, -7, -8]
|
||||||
ents = [
|
ents = [(3, 5, "ent-de"), (5, 7, "ent-fg")]
|
||||||
(3, 5, "ent-de"),
|
deps = ["dep"] * len(words)
|
||||||
(5, 7, "ent-fg"),
|
|
||||||
]
|
|
||||||
deps = ["dep"] * len(words)
|
|
||||||
en_vocab.strings.add("ent-de")
|
en_vocab.strings.add("ent-de")
|
||||||
en_vocab.strings.add("ent-fg")
|
en_vocab.strings.add("ent-fg")
|
||||||
en_vocab.strings.add("dep")
|
en_vocab.strings.add("dep")
|
||||||
doc = get_doc(en_vocab, words=words, heads=heads, deps=deps, ents=ents)
|
doc = get_doc(en_vocab, words=words, heads=heads, deps=deps, ents=ents)
|
||||||
assert doc[2:4].root == doc[3] # root of 'c d' is d
|
assert doc[2:4].root == doc[3] # root of 'c d' is d
|
||||||
assert doc[4:6].root == doc[4] # root is 'e f' is e
|
assert doc[4:6].root == doc[4] # root is 'e f' is e
|
||||||
with doc.retokenize() as retokenizer:
|
with doc.retokenize() as retokenizer:
|
||||||
retokenizer.merge(doc[2:4])
|
retokenizer.merge(doc[2:4])
|
||||||
retokenizer.merge(doc[4:6])
|
retokenizer.merge(doc[4:6])
|
||||||
|
@ -295,12 +304,9 @@ def test_doc_retokenize_spans_entity_merge_iob(en_vocab):
|
||||||
|
|
||||||
# check that B is preserved if span[start] is B
|
# check that B is preserved if span[start] is B
|
||||||
words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
|
words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
|
||||||
heads = [ 0, -1, 1, 1, -4, -5, -1, -7, -8 ]
|
heads = [0, -1, 1, 1, -4, -5, -1, -7, -8]
|
||||||
ents = [
|
ents = [(3, 5, "ent-de"), (5, 7, "ent-de")]
|
||||||
(3, 5, "ent-de"),
|
deps = ["dep"] * len(words)
|
||||||
(5, 7, "ent-de"),
|
|
||||||
]
|
|
||||||
deps = ["dep"] * len(words)
|
|
||||||
doc = get_doc(en_vocab, words=words, heads=heads, deps=deps, ents=ents)
|
doc = get_doc(en_vocab, words=words, heads=heads, deps=deps, ents=ents)
|
||||||
with doc.retokenize() as retokenizer:
|
with doc.retokenize() as retokenizer:
|
||||||
retokenizer.merge(doc[3:5])
|
retokenizer.merge(doc[3:5])
|
||||||
|
|
|
@ -14,24 +14,24 @@ from spacy.symbols import ORTH, LEMMA, POS, VERB, VerbForm_part
|
||||||
|
|
||||||
|
|
||||||
def test_issue1061():
|
def test_issue1061():
|
||||||
'''Test special-case works after tokenizing. Was caching problem.'''
|
"""Test special-case works after tokenizing. Was caching problem."""
|
||||||
text = 'I like _MATH_ even _MATH_ when _MATH_, except when _MATH_ is _MATH_! but not _MATH_.'
|
text = "I like _MATH_ even _MATH_ when _MATH_, except when _MATH_ is _MATH_! but not _MATH_."
|
||||||
tokenizer = English.Defaults.create_tokenizer()
|
tokenizer = English.Defaults.create_tokenizer()
|
||||||
doc = tokenizer(text)
|
doc = tokenizer(text)
|
||||||
assert 'MATH' in [w.text for w in doc]
|
assert "MATH" in [w.text for w in doc]
|
||||||
assert '_MATH_' not in [w.text for w in doc]
|
assert "_MATH_" not in [w.text for w in doc]
|
||||||
|
|
||||||
tokenizer.add_special_case('_MATH_', [{ORTH: '_MATH_'}])
|
tokenizer.add_special_case("_MATH_", [{ORTH: "_MATH_"}])
|
||||||
doc = tokenizer(text)
|
doc = tokenizer(text)
|
||||||
assert '_MATH_' in [w.text for w in doc]
|
assert "_MATH_" in [w.text for w in doc]
|
||||||
assert 'MATH' not in [w.text for w in doc]
|
assert "MATH" not in [w.text for w in doc]
|
||||||
|
|
||||||
# For sanity, check it works when pipeline is clean.
|
# For sanity, check it works when pipeline is clean.
|
||||||
tokenizer = English.Defaults.create_tokenizer()
|
tokenizer = English.Defaults.create_tokenizer()
|
||||||
tokenizer.add_special_case('_MATH_', [{ORTH: '_MATH_'}])
|
tokenizer.add_special_case("_MATH_", [{ORTH: "_MATH_"}])
|
||||||
doc = tokenizer(text)
|
doc = tokenizer(text)
|
||||||
assert '_MATH_' in [w.text for w in doc]
|
assert "_MATH_" in [w.text for w in doc]
|
||||||
assert 'MATH' not in [w.text for w in doc]
|
assert "MATH" not in [w.text for w in doc]
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.xfail(
|
@pytest.mark.xfail(
|
||||||
|
|
|
@ -1,7 +1,6 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
import pytest
|
|
||||||
from spacy.matcher import Matcher
|
from spacy.matcher import Matcher
|
||||||
from spacy.tokens import Doc
|
from spacy.tokens import Doc
|
||||||
|
|
||||||
|
|
|
@ -1,7 +1,6 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
import pytest
|
|
||||||
from spacy.matcher import Matcher
|
from spacy.matcher import Matcher
|
||||||
from spacy.tokens import Doc
|
from spacy.tokens import Doc
|
||||||
|
|
||||||
|
|
|
@ -1,7 +1,6 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
import pytest
|
|
||||||
from spacy.matcher import PhraseMatcher
|
from spacy.matcher import PhraseMatcher
|
||||||
from spacy.tokens import Doc
|
from spacy.tokens import Doc
|
||||||
|
|
||||||
|
|
|
@ -1,7 +1,6 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
import pytest
|
|
||||||
from spacy.matcher import Matcher
|
from spacy.matcher import Matcher
|
||||||
from spacy.tokens import Doc
|
from spacy.tokens import Doc
|
||||||
|
|
||||||
|
|
|
@ -2,44 +2,37 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from spacy.lang.en import English
|
from spacy.lang.en import English
|
||||||
|
|
||||||
import spacy
|
|
||||||
from spacy.tokenizer import Tokenizer
|
from spacy.tokenizer import Tokenizer
|
||||||
|
from spacy import util
|
||||||
|
|
||||||
from spacy.tests.util import make_tempdir
|
from ..util import make_tempdir
|
||||||
|
|
||||||
|
|
||||||
def test_issue4190():
|
def test_issue4190():
|
||||||
test_string = "Test c."
|
test_string = "Test c."
|
||||||
|
|
||||||
# Load default language
|
# Load default language
|
||||||
nlp_1 = English()
|
nlp_1 = English()
|
||||||
doc_1a = nlp_1(test_string)
|
doc_1a = nlp_1(test_string)
|
||||||
result_1a = [token.text for token in doc_1a]
|
result_1a = [token.text for token in doc_1a] # noqa: F841
|
||||||
|
|
||||||
# Modify tokenizer
|
# Modify tokenizer
|
||||||
customize_tokenizer(nlp_1)
|
customize_tokenizer(nlp_1)
|
||||||
doc_1b = nlp_1(test_string)
|
doc_1b = nlp_1(test_string)
|
||||||
result_1b = [token.text for token in doc_1b]
|
result_1b = [token.text for token in doc_1b]
|
||||||
|
|
||||||
# Save and Reload
|
# Save and Reload
|
||||||
with make_tempdir() as model_dir:
|
with make_tempdir() as model_dir:
|
||||||
nlp_1.to_disk(model_dir)
|
nlp_1.to_disk(model_dir)
|
||||||
nlp_2 = spacy.load(model_dir)
|
nlp_2 = util.load_model(model_dir)
|
||||||
|
|
||||||
# This should be the modified tokenizer
|
# This should be the modified tokenizer
|
||||||
doc_2 = nlp_2(test_string)
|
doc_2 = nlp_2(test_string)
|
||||||
result_2 = [token.text for token in doc_2]
|
result_2 = [token.text for token in doc_2]
|
||||||
|
|
||||||
assert result_1b == result_2
|
assert result_1b == result_2
|
||||||
|
|
||||||
|
|
||||||
def customize_tokenizer(nlp):
|
def customize_tokenizer(nlp):
|
||||||
prefix_re = spacy.util.compile_prefix_regex(nlp.Defaults.prefixes)
|
prefix_re = util.compile_prefix_regex(nlp.Defaults.prefixes)
|
||||||
suffix_re = spacy.util.compile_suffix_regex(nlp.Defaults.suffixes)
|
suffix_re = util.compile_suffix_regex(nlp.Defaults.suffixes)
|
||||||
infix_re = spacy.util.compile_infix_regex(nlp.Defaults.infixes)
|
infix_re = util.compile_infix_regex(nlp.Defaults.infixes)
|
||||||
|
# Remove all exceptions where a single letter is followed by a period (e.g. 'h.')
|
||||||
# remove all exceptions where a single letter is followed by a period (e.g. 'h.')
|
|
||||||
exceptions = {
|
exceptions = {
|
||||||
k: v
|
k: v
|
||||||
for k, v in dict(nlp.Defaults.tokenizer_exceptions).items()
|
for k, v in dict(nlp.Defaults.tokenizer_exceptions).items()
|
||||||
|
@ -53,5 +46,4 @@ def customize_tokenizer(nlp):
|
||||||
infix_finditer=infix_re.finditer,
|
infix_finditer=infix_re.finditer,
|
||||||
token_match=nlp.tokenizer.token_match,
|
token_match=nlp.tokenizer.token_match,
|
||||||
)
|
)
|
||||||
|
|
||||||
nlp.tokenizer = new_tokenizer
|
nlp.tokenizer = new_tokenizer
|
||||||
|
|
12
spacy/tests/regression/test_issue4272.py
Normal file
12
spacy/tests/regression/test_issue4272.py
Normal file
|
@ -0,0 +1,12 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from spacy.lang.el import Greek
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue4272():
|
||||||
|
"""Test that lookup table can be accessed from Token.lemma if no POS tags
|
||||||
|
are available."""
|
||||||
|
nlp = Greek()
|
||||||
|
doc = nlp("Χθες")
|
||||||
|
assert doc[0].lemma_
|
|
@ -56,6 +56,7 @@ def test_lookups_to_from_bytes():
|
||||||
assert table2.get("b") == 2
|
assert table2.get("b") == 2
|
||||||
assert new_lookups.to_bytes() == lookups_bytes
|
assert new_lookups.to_bytes() == lookups_bytes
|
||||||
|
|
||||||
|
|
||||||
# This fails on Python 3.5
|
# This fails on Python 3.5
|
||||||
@pytest.mark.xfail
|
@pytest.mark.xfail
|
||||||
def test_lookups_to_from_disk():
|
def test_lookups_to_from_disk():
|
||||||
|
@ -76,6 +77,7 @@ def test_lookups_to_from_disk():
|
||||||
assert len(table2) == 3
|
assert len(table2) == 3
|
||||||
assert table2.get("b") == 2
|
assert table2.get("b") == 2
|
||||||
|
|
||||||
|
|
||||||
# This fails on Python 3.5
|
# This fails on Python 3.5
|
||||||
@pytest.mark.xfail
|
@pytest.mark.xfail
|
||||||
def test_lookups_to_from_bytes_via_vocab():
|
def test_lookups_to_from_bytes_via_vocab():
|
||||||
|
|
Loading…
Reference in New Issue
Block a user