Merge remote-tracking branch 'upstream/master' into bugfix/tokenizer-special-cases-matcher

This commit is contained in:
Adriane Boyd 2019-09-08 21:30:01 +02:00
commit 64f86b7e97
16 changed files with 1567214 additions and 118 deletions

View File

@ -18,6 +18,7 @@ class CroatianDefaults(Language.Defaults):
)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
stop_words = STOP_WORDS
resources = {"lemma_lookup": "lemma_lookup.json"}
class Croatian(Language):

1313609
spacy/lang/hr/lemma_lookup.json Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,15 @@
The list of Croatian lemmas was extracted from the reldi-tagger repository (https://github.com/clarinsi/reldi-tagger).
Reldi-tagger is licensed under the Apache 2.0 licence.
@InProceedings{ljubesic16-new,
author = {Nikola Ljubešić and Filip Klubička and Željko Agić and Ivo-Pavao Jazbec},
title = {New Inflectional Lexicons and Training Corpora for Improved Morphosyntactic Annotation of Croatian and Serbian},
booktitle = {Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC 2016)},
year = {2016},
date = {23-28},
location = {Portorož, Slovenia},
editor = {Nicoletta Calzolari (Conference Chair) and Khalid Choukri and Thierry Declerck and Sara Goggi and Marko Grobelnik and Bente Maegaard and Joseph Mariani and Helene Mazo and Asuncion Moreno and Jan Odijk and Stelios Piperidis},
publisher = {European Language Resources Association (ELRA)},
address = {Paris, France},
isbn = {978-2-9517408-9-1}
}

View File

@ -21,6 +21,7 @@ class SerbianDefaults(Language.Defaults):
)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = STOP_WORDS
resources = {"lemma_lookup": "lemma_lookup.json"}
class Serbian(Language):

View File

@ -12,13 +12,14 @@ Example sentences to test spaCy and its language models.
sentences = [
# Translations from English
"Apple планира куповину америчког стартапа за $1 милијарду."
"Apple планира куповину америчког стартапа за $1 милијарду.",
"Беспилотни аутомобили пребацују одговорност осигурања на произвођаче.",
"Лондон је велики град у Уједињеном Краљевству.",
"Где си ти?",
"Ко је председник Француске?",
# Serbian common and slang
"Moj ћале је инжењер!",
"Новак Ђоковић је најбољи тенисер света." "У Пироту има добрих кафана!",
"Новак Ђоковић је најбољи тенисер света.",
"У Пироту има добрих кафана!",
"Музеј Николе Тесле се налази у Београду.",
]

253316
spacy/lang/sr/lemma_lookup.json Executable file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,32 @@
Copyright @InProceedings{ljubesic16-new,
author = {Nikola Ljubešić and Filip Klubička and Željko Agić and Ivo-Pavao Jazbec},
title = {New Inflectional Lexicons and Training Corpora for Improved Morphosyntactic Annotation of Croatian and Serbian},
booktitle = {Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC 2016)},
year = {2016},
date = {23-28},
location = {Portorož, Slovenia},
editor = {Nicoletta Calzolari (Conference Chair) and Khalid Choukri and Thierry Declerck and Sara Goggi and Marko Grobelnik and Bente Maegaard and Joseph Mariani and Helene Mazo and Asuncion Moreno and Jan Odijk and Stelios Piperidis},
publisher = {European Language Resources Association (ELRA)},
address = {Paris, France},
isbn = {978-2-9517408-9-1}
}
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
The licence for the Serbian lemmas was adopted from the Serbian lexicon:
- sr.lexicon (https://github.com/clarinsi/reldi-tagger/blob/master/sr.lexicon)
Changelog:
- Lexicon is transliterated into Cyrillic
- Word order is sorted

View File

@ -15,6 +15,7 @@ _abbrev_exc = [
{ORTH: "пет", LEMMA: "петак", NORM: "петак"},
{ORTH: "суб", LEMMA: "субота", NORM: "субота"},
{ORTH: "нед", LEMMA: "недеља", NORM: "недеља"},
# Months abbreviations
{ORTH: "јан", LEMMA: "јануар", NORM: "јануар"},
{ORTH: "феб", LEMMA: "фебруар", NORM: "фебруар"},
@ -27,7 +28,7 @@ _abbrev_exc = [
{ORTH: "септ", LEMMA: "септембар", NORM: "септембар"},
{ORTH: "окт", LEMMA: "октобар", NORM: "октобар"},
{ORTH: "нов", LEMMA: "новембар", NORM: "новембар"},
{ORTH: "дец", LEMMA: "децембар", NORM: "децембар"},
{ORTH: "дец", LEMMA: "децембар", NORM: "децембар"}
]

View File

@ -103,6 +103,11 @@ def he_tokenizer():
return get_lang_class("he").Defaults.create_tokenizer()
@pytest.fixture(scope="session")
def hr_tokenizer():
    """Session-scoped tokenizer created from the "hr" language defaults."""
    hr_defaults = get_lang_class("hr").Defaults
    return hr_defaults.create_tokenizer()
@pytest.fixture
def hu_tokenizer():
    """Tokenizer created from the "hu" language defaults."""
    hu_defaults = get_lang_class("hu").Defaults
    return hu_defaults.create_tokenizer()

View File

@ -99,6 +99,41 @@ def test_doc_retokenize_spans_merge_tokens(en_tokenizer):
assert doc[0].ent_type_ == "GPE"
def test_doc_retokenize_spans_merge_tokens_default_attrs(en_tokenizer):
    """Check the attributes a merged token gets when no attrs are passed.

    The assertions expect the merged token to carry the tag and pos of
    "players" / "start" and a lemma equal to the merged text, both for a
    single merge and for two merges inside one retokenize block.
    """
    tokenized = en_tokenizer("The players start.")
    annotations = {
        "tags": ["DT", "NN", "VBZ", "."],
        "pos": ["DET", "NOUN", "VERB", "PUNCT"],
        "heads": [1, 1, 0, -1],
    }

    def make_checked_doc():
        # Build a fresh, unmerged doc and verify its starting state.
        fresh = get_doc(
            tokenized.vocab,
            words=[tok.text for tok in tokenized],
            **annotations
        )
        assert len(fresh) == 4
        assert fresh[0].text == "The"
        assert fresh[0].tag_ == "DT"
        assert fresh[0].pos_ == "DET"
        return fresh

    # Case 1: a single merge of "The players".
    doc = make_checked_doc()
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[0:2])
    assert len(doc) == 3
    assert doc[0].text == "The players"
    assert doc[0].tag_ == "NN"
    assert doc[0].pos_ == "NOUN"
    assert doc[0].lemma_ == "The players"

    # Case 2: two merges queued in the same retokenize block.
    doc = make_checked_doc()
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[0:2])
        retokenizer.merge(doc[2:4])
    assert len(doc) == 2
    assert doc[0].text == "The players"
    assert doc[0].tag_ == "NN"
    assert doc[0].pos_ == "NOUN"
    assert doc[0].lemma_ == "The players"
    assert doc[1].text == "start ."
    assert doc[1].tag_ == "VBZ"
    assert doc[1].pos_ == "VERB"
    assert doc[1].lemma_ == "start ."
def test_doc_retokenize_spans_merge_heads(en_tokenizer):
text = "I found a pilates class near work."
heads = [1, 0, 2, 1, -3, -1, -1, -6]
@ -182,7 +217,7 @@ def test_doc_retokenize_spans_entity_merge(en_tokenizer):
assert len(doc) == 15
def test_doc_retokenize_spans_entity_merge_iob():
def test_doc_retokenize_spans_entity_merge_iob(en_vocab):
# Test entity IOB stays consistent after merging
words = ["a", "b", "c", "d", "e"]
doc = Doc(Vocab(), words=words)
@ -195,10 +230,23 @@ def test_doc_retokenize_spans_entity_merge_iob():
assert doc[2].ent_iob_ == "I"
assert doc[3].ent_iob_ == "B"
with doc.retokenize() as retokenizer:
retokenizer.merge(doc[0:1])
retokenizer.merge(doc[0:2])
assert len(doc) == len(words) - 1
assert doc[0].ent_iob_ == "B"
assert doc[1].ent_iob_ == "I"
# Test that IOB stays consistent with provided IOB
words = ["a", "b", "c", "d", "e"]
doc = Doc(Vocab(), words=words)
with doc.retokenize() as retokenizer:
attrs = {"ent_type": "ent-abc", "ent_iob": 1}
retokenizer.merge(doc[0:3], attrs=attrs)
retokenizer.merge(doc[3:5], attrs=attrs)
assert doc[0].ent_iob_ == "B"
assert doc[1].ent_iob_ == "I"
# if no parse/heads, the first word in the span is the root and provides
# default values
words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
doc = Doc(Vocab(), words=words)
doc.ents = [
@ -215,7 +263,53 @@ def test_doc_retokenize_spans_entity_merge_iob():
retokenizer.merge(doc[7:9])
assert len(doc) == 6
assert doc[3].ent_iob_ == "B"
assert doc[4].ent_iob_ == "I"
assert doc[3].ent_type_ == "ent-de"
assert doc[4].ent_iob_ == "B"
assert doc[4].ent_type_ == "ent-fg"
# if there is a parse, span.root provides default values
words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
heads = [ 0, -1, 1, -3, -4, -5, -1, -7, -8 ]
ents = [
(3, 5, "ent-de"),
(5, 7, "ent-fg"),
]
deps = ["dep"] * len(words)
en_vocab.strings.add("ent-de")
en_vocab.strings.add("ent-fg")
en_vocab.strings.add("dep")
doc = get_doc(en_vocab, words=words, heads=heads, deps=deps, ents=ents)
assert doc[2:4].root == doc[3] # root of 'c d' is d
assert doc[4:6].root == doc[4] # root is 'e f' is e
with doc.retokenize() as retokenizer:
retokenizer.merge(doc[2:4])
retokenizer.merge(doc[4:6])
retokenizer.merge(doc[7:9])
assert len(doc) == 6
assert doc[2].ent_iob_ == "B"
assert doc[2].ent_type_ == "ent-de"
assert doc[3].ent_iob_ == "I"
assert doc[3].ent_type_ == "ent-de"
assert doc[4].ent_iob_ == "B"
assert doc[4].ent_type_ == "ent-fg"
# check that B is preserved if span[start] is B
words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
heads = [ 0, -1, 1, 1, -4, -5, -1, -7, -8 ]
ents = [
(3, 5, "ent-de"),
(5, 7, "ent-de"),
]
deps = ["dep"] * len(words)
doc = get_doc(en_vocab, words=words, heads=heads, deps=deps, ents=ents)
with doc.retokenize() as retokenizer:
retokenizer.merge(doc[3:5])
retokenizer.merge(doc[5:7])
assert len(doc) == 7
assert doc[3].ent_iob_ == "B"
assert doc[3].ent_type_ == "ent-de"
assert doc[4].ent_iob_ == "B"
assert doc[4].ent_type_ == "ent-de"
def test_doc_retokenize_spans_sentence_update_after_merge(en_tokenizer):

View File

@ -0,0 +1,20 @@
# coding: utf-8
from __future__ import unicode_literals
import pytest
# (surface form, expected lemma) pairs checked against the Croatian tokenizer.
_HR_LOOKUP_CASES = [
    ("trčao", "trčati"),
    ("adekvatnim", "adekvatan"),
    ("dekontaminacijama", "dekontaminacija"),
    ("filologovih", "filologov"),
    ("je", "biti"),
    ("se", "sebe"),
]


@pytest.mark.parametrize("string,lemma", _HR_LOOKUP_CASES)
def test_hr_lemmatizer_lookup_assigns(hr_tokenizer, string, lemma):
    """The first token produced for `string` must have `lemma` as its lemma."""
    first_token = hr_tokenizer(string)[0]
    assert first_token.lemma_ == lemma

View File

@ -0,0 +1,20 @@
# coding: utf-8
from __future__ import unicode_literals
import pytest
# (surface form, expected lemma) pairs checked against the Serbian tokenizer.
_SR_LOOKUP_CASES = [
    ("најадекватнији", "адекватан"),
    ("матурирао", "матурирати"),
    ("планираћемо", "планирати"),
    ("певају", "певати"),
    ("нама", "ми"),
    ("се", "себе"),
]


@pytest.mark.parametrize("string,lemma", _SR_LOOKUP_CASES)
def test_sr_lemmatizer_lookup_assigns(sr_tokenizer, string, lemma):
    """The first token produced for `string` must have `lemma` as its lemma."""
    first_token = sr_tokenizer(string)[0]
    assert first_token.lemma_ == lemma

View File

@ -13,7 +13,6 @@ from spacy.lemmatizer import Lemmatizer
from spacy.symbols import ORTH, LEMMA, POS, VERB, VerbForm_part
@pytest.mark.xfail
def test_issue1061():
'''Test special-case works after tokenizing. Was caching problem.'''
text = 'I like _MATH_ even _MATH_ when _MATH_, except when _MATH_ is _MATH_! but not _MATH_.'

View File

@ -16,10 +16,10 @@ cdef class Tokenizer:
cdef PreshMap _specials
cpdef readonly Vocab vocab
cdef public object token_match
cdef public object prefix_search
cdef public object suffix_search
cdef public object infix_finditer
cdef object _token_match
cdef object _prefix_search
cdef object _suffix_search
cdef object _infix_finditer
cdef object _rules
cdef object _special_matcher

View File

@ -63,6 +63,38 @@ cdef class Tokenizer:
self._special_matcher = Matcher(self.vocab)
self._load_special_cases(rules)
# Property wrapper around self._token_match.
# Reading returns the stored value unchanged. Assigning stores the new value
# and then calls self._flush_cache(), presumably so that tokenizations cached
# under the previous setting are not reused (TODO confirm).
property token_match:
def __get__(self):
return self._token_match
def __set__(self, token_match):
self._token_match = token_match
# Flush the cache after the setting changes.
self._flush_cache()
# Property wrapper around self._prefix_search.
# Reading returns the stored value unchanged. Assigning stores the new value
# and then calls self._flush_cache(), presumably so that tokenizations cached
# under the previous setting are not reused (TODO confirm).
property prefix_search:
def __get__(self):
return self._prefix_search
def __set__(self, prefix_search):
self._prefix_search = prefix_search
# Flush the cache after the setting changes.
self._flush_cache()
# Property wrapper around self._suffix_search.
# Reading returns the stored value unchanged. Assigning stores the new value
# and then calls self._flush_cache(), presumably so that tokenizations cached
# under the previous setting are not reused (TODO confirm).
property suffix_search:
def __get__(self):
return self._suffix_search
def __set__(self, suffix_search):
self._suffix_search = suffix_search
# Flush the cache after the setting changes.
self._flush_cache()
# Property wrapper around self._infix_finditer.
# Reading returns the stored value unchanged. Assigning stores the new value
# and then calls self._flush_cache(), presumably so that tokenizations cached
# under the previous setting are not reused (TODO confirm).
property infix_finditer:
def __get__(self):
return self._infix_finditer
def __set__(self, infix_finditer):
self._infix_finditer = infix_finditer
# Flush the cache after the setting changes.
self._flush_cache()
def __reduce__(self):
args = (self.vocab,
self._rules,
@ -153,9 +185,23 @@ cdef class Tokenizer:
for text in texts:
yield self(text)
def _flush_cache(self):
    """Reset every cache entry whose key is not also a special-case key.

    Collects the keys of ``self._cache`` that are absent from
    ``self._specials`` and passes them to ``self._reset_cache``.
    """
    non_special_keys = [key for key in self._cache if key not in self._specials]
    self._reset_cache(non_special_keys)
# Remove each of the given keys from self._cache.
# keys: iterable of cache keys to drop.
# NOTE(review): the key is deleted from self._cache *before* the
# self._cache.get(k) lookup further down, so that lookup runs against a map
# that no longer holds k. Confirm whether the free() branch is ever reached,
# or whether the lookup was meant to happen before the delete. Any reordering
# must be checked against other code that frees stale cache entries itself.
def _reset_cache(self, keys):
for k in keys:
del self._cache[k]
# Only keys that are not special cases go on to the lookup/free below
# (nesting presumed; indentation is not visible in this view).
if not k in self._specials:
cached = <_Cached*>self._cache.get(k)
if cached is not NULL:
self.mem.free(cached)
# Empty self._specials: for every key, fetch the stored _Cached pointer, drop
# the key from the map, then release the pointer via self.mem.free() when it
# is not NULL.
# NOTE(review): keys are deleted while iterating over self._specials; confirm
# that the map's iterator tolerates removal during iteration.
def _reset_specials(self):
for k in self._specials:
# Look the pointer up before deleting the key so it can still be freed.
cached = <_Cached*>self._specials.get(k)
del self._specials[k]
if cached is not NULL:
self.mem.free(cached)
cdef int _apply_special_cases(self, Doc doc):
"""Retokenize doc according to special cases.
@ -409,7 +455,14 @@ cdef class Tokenizer:
cached.is_lex = False
cached.data.tokens = self.vocab.make_fused_token(substrings)
key = hash_string(string)
stale_special = <_Cached*>self._specials.get(key)
stale_cached = <_Cached*>self._cache.get(key)
self._flush_cache()
self._specials.set(key, cached)
if stale_special is not NULL:
self.mem.free(stale_special)
if stale_special != stale_cached and stale_cached is not NULL:
self.mem.free(stale_cached)
self._rules[string] = substrings
self._special_matcher.add(string, None, [{ORTH: token.text} for token in self._tokenize_affixes(string)])
@ -493,7 +546,10 @@ cdef class Tokenizer:
if data.get("rules"):
# make sure to hard reset the cache to remove data from the default exceptions
self._rules = {}
self._reset_cache([key for key in self._cache])
self._reset_specials()
self._cache = PreshMap()
self._specials = PreshMap()
for string, substrings in data.get("rules", {}).items():
self.add_special_case(string, substrings)

View File

@ -109,13 +109,8 @@ cdef class Retokenizer:
def __exit__(self, *args):
# Do the actual merging here
if len(self.merges) > 1:
_bulk_merge(self.doc, self.merges)
elif len(self.merges) == 1:
(span, attrs) = self.merges[0]
start = span.start
end = span.end
_merge(self.doc, start, end, attrs)
if len(self.merges) >= 1:
_merge(self.doc, self.merges)
# Iterate in order, to keep things simple.
for start_char, orths, heads, attrs in sorted(self.splits):
# Resolve token index
@ -140,95 +135,7 @@ cdef class Retokenizer:
_split(self.doc, token_index, orths, head_indices, attrs)
def _merge(Doc doc, int start, int end, attributes):
"""Retokenize the document, such that the span at
`doc.text[start_idx : end_idx]` is merged into a single token. If
`start_idx` and `end_idx `do not mark start and end token boundaries,
the document remains unchanged.
start_idx (int): Character index of the start of the slice to merge.
end_idx (int): Character index after the end of the slice to merge.
**attributes: Attributes to assign to the merged token. By default,
attributes are inherited from the syntactic root of the span.
RETURNS (Token): The newly merged token, or `None` if the start and end
indices did not fall at token boundaries.
"""
cdef Span span = doc[start:end]
cdef int start_char = span.start_char
cdef int end_char = span.end_char
# Resize the doc.tensor, if it's set. Let the last row for each token stand
# for the merged region. To do this, we create a boolean array indicating
# whether the row is to be deleted, then use numpy.delete
if doc.tensor is not None and doc.tensor.size != 0:
doc.tensor = _resize_tensor(doc.tensor, [(start, end)])
# Get LexemeC for newly merged token
new_orth = ''.join([t.text_with_ws for t in span])
if span[-1].whitespace_:
new_orth = new_orth[:-len(span[-1].whitespace_)]
cdef const LexemeC* lex = doc.vocab.get(doc.mem, new_orth)
# House the new merged token where it starts
cdef TokenC* token = &doc.c[start]
token.spacy = doc.c[end-1].spacy
for attr_name, attr_value in attributes.items():
if attr_name == "_": # Set extension attributes
for ext_attr_key, ext_attr_value in attr_value.items():
doc[start]._.set(ext_attr_key, ext_attr_value)
elif attr_name == TAG:
doc.vocab.morphology.assign_tag(token, attr_value)
else:
# Set attributes on both token and lexeme to take care of token
# attribute vs. lexical attribute without having to enumerate them.
# If an attribute name is not valid, set_struct_attr will ignore it.
Token.set_struct_attr(token, attr_name, attr_value)
Lexeme.set_struct_attr(<LexemeC*>lex, attr_name, attr_value)
# Make sure ent_iob remains consistent
if doc.c[end].ent_iob == 1 and token.ent_iob in (0, 2):
if token.ent_type == doc.c[end].ent_type:
token.ent_iob = 3
else:
# If they're not the same entity type, let them be two entities
doc.c[end].ent_iob = 3
# Begin by setting all the head indices to absolute token positions
# This is easier to work with for now than the offsets
# Before thinking of something simpler, beware the case where a
# dependency bridges over the entity. Here the alignment of the
# tokens changes.
span_root = span.root.i
token.dep = span.root.dep
# We update token.lex after keeping span root and dep, since
# setting token.lex will change span.start and span.end properties
# as it modifies the character offsets in the doc
token.lex = lex
for i in range(doc.length):
doc.c[i].head += i
# Set the head of the merged token, and its dep relation, from the Span
token.head = doc.c[span_root].head
# Adjust deps before shrinking tokens
# Tokens which point into the merged token should now point to it
# Subtract the offset from all tokens which point to >= end
offset = (end - start) - 1
for i in range(doc.length):
head_idx = doc.c[i].head
if start <= head_idx < end:
doc.c[i].head = start
elif head_idx >= end:
doc.c[i].head -= offset
# Now compress the token array
for i in range(end, doc.length):
doc.c[i - offset] = doc.c[i]
for i in range(doc.length - offset, doc.length):
memset(&doc.c[i], 0, sizeof(TokenC))
doc.c[i].lex = &EMPTY_LEXEME
doc.length -= offset
for i in range(doc.length):
# ...And, set heads back to a relative position
doc.c[i].head -= i
# Set the left/right children, left/right edges
set_children_from_heads(doc.c, doc.length)
# Return the merged Python object
return doc[start]
def _bulk_merge(Doc doc, merges):
def _merge(Doc doc, merges):
"""Retokenize the document, such that the spans described in 'merges'
are merged into a single token. This method assumes that the merges
are in the same order at which they appear in the doc, and that merges
@ -256,6 +163,26 @@ def _bulk_merge(Doc doc, merges):
spans.append(span)
# House the new merged token where it starts
token = &doc.c[start]
# Initially set attributes to attributes of span root
token.tag = doc.c[span.root.i].tag
token.pos = doc.c[span.root.i].pos
token.morph = doc.c[span.root.i].morph
token.ent_iob = doc.c[span.root.i].ent_iob
token.ent_type = doc.c[span.root.i].ent_type
merged_iob = token.ent_iob
# If span root is part of an entity, merged token is B-ENT
if token.ent_iob in (1, 3):
merged_iob = 3
# If start token is I-ENT and previous token is of the same
# type, then I-ENT (could check I-ENT from start to span root)
if doc.c[start].ent_iob == 1 and start > 0 \
and doc.c[start].ent_type == token.ent_type \
and doc.c[start - 1].ent_type == token.ent_type:
merged_iob = 1
token.ent_iob = merged_iob
# Unset attributes that don't match new token
token.lemma = 0
token.norm = 0
tokens[merge_index] = token
# Resize the doc.tensor, if it's set. Let the last row for each token stand
# for the merged region. To do this, we create a boolean array indicating
@ -351,17 +278,7 @@ def _bulk_merge(Doc doc, merges):
# Set the left/right children, left/right edges
set_children_from_heads(doc.c, doc.length)
# Make sure ent_iob remains consistent
for (span, _) in merges:
if(span.end < len(offsets)):
# If it's not the last span
token_after_span_position = offsets[span.end]
if doc.c[token_after_span_position].ent_iob == 1\
and doc.c[token_after_span_position - 1].ent_iob in (0, 2):
if doc.c[token_after_span_position - 1].ent_type == doc.c[token_after_span_position].ent_type:
doc.c[token_after_span_position - 1].ent_iob = 3
else:
# If they're not the same entity type, let them be two entities
doc.c[token_after_span_position].ent_iob = 3
make_iob_consistent(doc.c, doc.length)
# Return the merged Python object
return doc[spans[0].start]
@ -480,3 +397,12 @@ def _validate_extensions(extensions):
raise ValueError(Errors.E118.format(attr=key))
if not is_writable_attr(extension):
raise ValueError(Errors.E119.format(attr=key))
cdef make_iob_consistent(TokenC* tokens, int length):
    """Rewrite ent_iob codes in place so no token continues a missing entity.

    A token with ent_iob == 1 (the code this file's comments call I-ENT) is
    changed to 3 (B-ENT) when it is the first token, or when the preceding
    token has a different ent_type.

    tokens (TokenC*): Token array to modify in place.
    length (int): Number of tokens to examine.
    """
    cdef int idx
    # The first token has no predecessor, so it cannot continue an entity.
    if tokens[0].ent_iob == 1:
        tokens[0].ent_iob = 3
    for idx in range(1, length):
        if tokens[idx].ent_iob != 1:
            continue
        # A continuation whose predecessor has another type starts a new entity.
        if tokens[idx].ent_type != tokens[idx - 1].ent_type:
            tokens[idx].ent_iob = 3