mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-24 17:06:29 +03:00
💫 Fix interaction of lemmatizer and tokenizer exceptions (#3388)
Closes #2203. Closes #3268. Lemmas set from outside the `Morphology` class were being overwritten. The result was especially confusing when deserialising, as it meant some lemmas could change when storing and retrieving a `Doc` object. This PR applies two fixes: 1) When we go to set the lemma in the `Morphology` class, first check whether a lemma is already set. If so, don't overwrite. 2) When we load with `doc.from_array()`, take care to apply the `TAG` field first. This allows other fields to overwrite the `TAG`-implied properties, if they're provided explicitly (e.g. the `LEMMA`). ## Checklist <!--- Before you submit the PR, go over this checklist and make sure you can tick off all the boxes. [] -> [x] --> - [x] I have submitted the spaCy Contributor Agreement. - [x] I ran the tests, and all new and existing tests passed. - [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
This commit is contained in:
parent
04ca710da7
commit
80b94313b6
|
@ -110,7 +110,8 @@ cdef class Morphology:
|
||||||
analysis.lemma = self.lemmatize(analysis.tag.pos, token.lex.orth,
|
analysis.lemma = self.lemmatize(analysis.tag.pos, token.lex.orth,
|
||||||
self.tag_map.get(tag_str, {}))
|
self.tag_map.get(tag_str, {}))
|
||||||
self._cache.set(tag_id, token.lex.orth, analysis)
|
self._cache.set(tag_id, token.lex.orth, analysis)
|
||||||
token.lemma = analysis.lemma
|
if token.lemma == 0:
|
||||||
|
token.lemma = analysis.lemma
|
||||||
token.pos = analysis.tag.pos
|
token.pos = analysis.tag.pos
|
||||||
token.tag = analysis.tag.name
|
token.tag = analysis.tag.name
|
||||||
token.morph = analysis.tag.morph
|
token.morph = analysis.tag.morph
|
||||||
|
|
|
@ -2,6 +2,7 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
import numpy
|
||||||
from spacy.tokens import Doc
|
from spacy.tokens import Doc
|
||||||
from spacy.displacy import render
|
from spacy.displacy import render
|
||||||
from spacy.gold import iob_to_biluo
|
from spacy.gold import iob_to_biluo
|
||||||
|
@ -39,6 +40,26 @@ def test_issue2179():
|
||||||
assert nlp2.get_pipe("ner").labels == ("CITIZENSHIP",)
|
assert nlp2.get_pipe("ner").labels == ("CITIZENSHIP",)
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue2203(en_vocab):
|
||||||
|
"""Test that lemmas are set correctly in doc.from_array."""
|
||||||
|
words = ["I", "'ll", "survive"]
|
||||||
|
tags = ["PRP", "MD", "VB"]
|
||||||
|
lemmas = ["-PRON-", "will", "survive"]
|
||||||
|
tag_ids = [en_vocab.strings.add(tag) for tag in tags]
|
||||||
|
lemma_ids = [en_vocab.strings.add(lemma) for lemma in lemmas]
|
||||||
|
doc = Doc(en_vocab, words=words)
|
||||||
|
# Work around lemma corruption problem and set lemmas after tags
|
||||||
|
doc.from_array("TAG", numpy.array(tag_ids, dtype="uint64"))
|
||||||
|
doc.from_array("LEMMA", numpy.array(lemma_ids, dtype="uint64"))
|
||||||
|
assert [t.tag_ for t in doc] == tags
|
||||||
|
assert [t.lemma_ for t in doc] == lemmas
|
||||||
|
# We need to serialize both tag and lemma, since this is what causes the bug
|
||||||
|
doc_array = doc.to_array(["TAG", "LEMMA"])
|
||||||
|
new_doc = Doc(doc.vocab, words=words).from_array(["TAG", "LEMMA"], doc_array)
|
||||||
|
assert [t.tag_ for t in new_doc] == tags
|
||||||
|
assert [t.lemma_ for t in new_doc] == lemmas
|
||||||
|
|
||||||
|
|
||||||
def test_issue2219(en_vocab):
|
def test_issue2219(en_vocab):
|
||||||
vectors = [("a", [1, 2, 3]), ("letter", [4, 5, 6])]
|
vectors = [("a", [1, 2, 3]), ("letter", [4, 5, 6])]
|
||||||
add_vecs_to_vocab(en_vocab, vectors)
|
add_vecs_to_vocab(en_vocab, vectors)
|
||||||
|
|
|
@ -763,17 +763,18 @@ cdef class Doc:
|
||||||
attr_ids[i] = attr_id
|
attr_ids[i] = attr_id
|
||||||
if len(array.shape) == 1:
|
if len(array.shape) == 1:
|
||||||
array = array.reshape((array.size, 1))
|
array = array.reshape((array.size, 1))
|
||||||
|
# Do TAG first. This lets the subsequent loop override attributes like POS and LEMMA
|
||||||
|
if TAG in attrs:
|
||||||
|
col = attrs.index(TAG)
|
||||||
|
for i in range(length):
|
||||||
|
if array[i, col] != 0:
|
||||||
|
self.vocab.morphology.assign_tag(&tokens[i], array[i, col])
|
||||||
# Now load the data
|
# Now load the data
|
||||||
for i in range(self.length):
|
for i in range(self.length):
|
||||||
token = &self.c[i]
|
token = &self.c[i]
|
||||||
for j in range(n_attrs):
|
for j in range(n_attrs):
|
||||||
Token.set_struct_attr(token, attr_ids[j], array[i, j])
|
if attr_ids[j] != TAG:
|
||||||
# Auxiliary loading logic
|
Token.set_struct_attr(token, attr_ids[j], array[i, j])
|
||||||
for col, attr_id in enumerate(attrs):
|
|
||||||
if attr_id == TAG:
|
|
||||||
for i in range(length):
|
|
||||||
if array[i, col] != 0:
|
|
||||||
self.vocab.morphology.assign_tag(&tokens[i], array[i, col])
|
|
||||||
# Set flags
|
# Set flags
|
||||||
self.is_parsed = bool(self.is_parsed or HEAD in attrs or DEP in attrs)
|
self.is_parsed = bool(self.is_parsed or HEAD in attrs or DEP in attrs)
|
||||||
self.is_tagged = bool(self.is_tagged or TAG in attrs or POS in attrs)
|
self.is_tagged = bool(self.is_tagged or TAG in attrs or POS in attrs)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user