Fix issue #514 -- serializer fails when new entity type has been added. The fix here is quite ugly. It's best to add the entities ASAP after loading the NLP pipeline, to mitigate the brittleness.

This commit is contained in:
Matthew Honnibal 2016-10-23 17:45:44 +02:00
parent 79aa03fe98
commit 3e688e6d4b
3 changed files with 28 additions and 2 deletions

View File

@ -7,6 +7,7 @@ from .tagger import Tagger
# TODO: The disorganization here is pretty embarrassing. At least it's only
# internals.
from .syntax.parser import get_templates as get_feature_templates
from .attrs import DEP, ENT_TYPE
cdef class EntityRecognizer(Parser):
@ -14,11 +15,33 @@ cdef class EntityRecognizer(Parser):
feature_templates = get_feature_templates('ner')
def add_label(self, label):
    """Register a new entity label with the parser and the serializer.

    Adds the label to the transition system through `Parser.add_label`,
    then makes sure the ENT_TYPE frequency table in
    `self.vocab.serializer_freqs` contains an entry for it.

    label: The label to add. A string is converted to its ID through
        `self.vocab.strings` before it is recorded in the frequency table.
    """
    Parser.add_label(self, label)
    if isinstance(label, basestring):
        label = self.vocab.strings[label]
    for attr, freqs in self.vocab.serializer_freqs:
        if attr != ENT_TYPE:
            continue
        # Entries are [label, freq] pairs (that is the shape appended
        # below), so membership has to be checked against the first item of
        # each entry. Testing the bare label against the list never matches
        # and would append a duplicate entry on every call.
        # NOTE(review): assumes entries loaded elsewhere are also
        # [label, freq] pairs -- confirm.
        if label not in [entry[0] for entry in freqs]:
            freqs.append([label, 1])
    # Super hacky :( -- drop the cached serializer so that it is rebuilt
    # from the updated frequencies the next time it is requested.
    self.vocab._serializer = None
cdef class DependencyParser(Parser):
    """Parser configured with the ArcEager transition system and the
    'basic' feature templates."""
    TransitionSystem = ArcEager

    feature_templates = get_feature_templates('basic')

    def add_label(self, label):
        """Register a new dependency label with the parser and the serializer.

        Adds the label to the transition system through `Parser.add_label`,
        then makes sure the DEP frequency table in
        `self.vocab.serializer_freqs` contains an entry for it.

        label: The label to add. A string is converted to its ID through
            `self.vocab.strings` before it is recorded in the frequency
            table.
        """
        Parser.add_label(self, label)
        if isinstance(label, basestring):
            label = self.vocab.strings[label]
        for attr, freqs in self.vocab.serializer_freqs:
            if attr != DEP:
                continue
            # Entries are [label, freq] pairs (that is the shape appended
            # below), so membership has to be checked against the first item
            # of each entry. Testing the bare label against the list never
            # matches and would append a duplicate entry on every call.
            # NOTE(review): assumes entries loaded elsewhere are also
            # [label, freq] pairs -- confirm.
            if label not in [entry[0] for entry in freqs]:
                freqs.append([label, 1])
        # Super hacky :( -- drop the cached serializer so that it is rebuilt
        # from the updated frequencies the next time it is requested.
        self.vocab._serializer = None
# `__all__` must list the public names as strings; putting the class objects
# themselves in the list makes `from <module> import *` fail.
__all__ = ['Tagger', 'DependencyParser', 'EntityRecognizer']

View File

@ -92,6 +92,7 @@ cdef class Parser:
def __init__(self, Vocab vocab, TransitionSystem=None, ParserModel model=None, **cfg):
if TransitionSystem is None:
TransitionSystem = self.TransitionSystem
self.vocab = vocab
actions = TransitionSystem.get_actions(**cfg)
self.moves = TransitionSystem(vocab.strings, actions)
# TODO: Remove this when we no longer need to support old-style models
@ -226,8 +227,10 @@ cdef class Parser:
stepwise.transition(transition)
def add_label(self, label):
    """Call `self.moves.add_action` with `label` for every action type.

    This does not set the label into the serializer -- subclasses override
    the method to do that.
    """
    for action_type in self.moves.action_types:
        self.moves.add_action(action_type, label)
cdef class StepwiseState:

View File

@ -113,9 +113,9 @@ cdef class Vocab:
self._serializer = None
property serializer:
    # Having the serializer live here is super messy :(
    def __get__(self):
        """Return the Packer for this vocab, building it on first access.

        The Packer is created from `self.serializer_freqs` and cached in
        `self._serializer`. Resetting `_serializer` to None forces a rebuild
        from the current frequencies on the next access.
        """
        if self._serializer is None:
            self._serializer = Packer(self, self.serializer_freqs)
        return self._serializer