From 3e688e6d4b5d2b5c52c15531425c6ecf8f27348c Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sun, 23 Oct 2016 17:45:44 +0200
Subject: [PATCH] Fix issue #514 -- serializer fails when new entity type has
 been added. The fix here is quite ugly. It's best to add the entities ASAP
 after loading the NLP pipeline, to mitigate the brittleness.

---
Reviewer note (text in this section is ignored by `git am`):
This patch was amended during review and no longer matches the commit hash
above byte-for-byte. In both add_label() overrides the duplicate guard was
`label not in freqs`, but the very next line appends `[label, 1]`, i.e. the
list holds two-item entries. An integer label therefore never compared equal
to an entry, the guard was always true, and a duplicate entry was appended on
every call. The guard now compares the label against the first element of
each entry. The change is in-place on the same line, so hunk sizes and the
diffstat below are unchanged.

 spacy/pipeline.pyx      | 25 ++++++++++++++++++++++++-
 spacy/syntax/parser.pyx |  3 +++
 spacy/vocab.pyx         |  2 +-
 3 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx
index ac1f0d75c..02b6ecbee 100644
--- a/spacy/pipeline.pyx
+++ b/spacy/pipeline.pyx
@@ -7,6 +7,7 @@ from .tagger import Tagger
 # TODO: The disorganization here is pretty embarrassing. At least it's only
 # internals.
 from .syntax.parser import get_templates as get_feature_templates
+from .attrs import DEP, ENT_TYPE
 
 
 cdef class EntityRecognizer(Parser):
@@ -14,11 +15,33 @@ cdef class EntityRecognizer(Parser):
 
     feature_templates = get_feature_templates('ner')
 
+    def add_label(self, label):
+        for action in self.moves.action_types:
+            self.moves.add_action(action, label)
+        if isinstance(label, basestring):
+            label = self.vocab.strings[label]
+        for attr, freqs in self.vocab.serializer_freqs:
+            if attr == ENT_TYPE and not any(entry[0] == label for entry in freqs):
+                freqs.append([label, 1])
+        # Super hacky :(
+        self.vocab._serializer = None
+
 cdef class DependencyParser(Parser):
     TransitionSystem = ArcEager
 
     feature_templates = get_feature_templates('basic')
-    
+
+    def add_label(self, label):
+        for action in self.moves.action_types:
+            self.moves.add_action(action, label)
+        if isinstance(label, basestring):
+            label = self.vocab.strings[label]
+        for attr, freqs in self.vocab.serializer_freqs:
+            if attr == DEP and not any(entry[0] == label for entry in freqs):
+                freqs.append([label, 1])
+        # Super hacky :(
+        self.vocab._serializer = None
+
 
 
 __all__ = [Tagger, DependencyParser, EntityRecognizer]
diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx
index 85407b942..62b61c37b 100644
--- a/spacy/syntax/parser.pyx
+++ b/spacy/syntax/parser.pyx
@@ -92,6 +92,7 @@ cdef class Parser:
     def __init__(self, Vocab vocab, TransitionSystem=None, ParserModel model=None, **cfg):
         if TransitionSystem is None:
             TransitionSystem = self.TransitionSystem
+        self.vocab = vocab
         actions = TransitionSystem.get_actions(**cfg)
         self.moves = TransitionSystem(vocab.strings, actions)
         # TODO: Remove this when we no longer need to support old-style models
@@ -226,8 +227,10 @@ cdef class Parser:
                 stepwise.transition(transition)
 
     def add_label(self, label):
+        # Doesn't set label into serializer -- subclasses override it to do that.
         for action in self.moves.action_types:
             self.moves.add_action(action, label)
+
 
 
 cdef class StepwiseState:
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index bbfe17599..9bfd74046 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -113,9 +113,9 @@ cdef class Vocab:
         self._serializer = None
 
     property serializer:
+        # Having the serializer live here is super messy :(
         def __get__(self):
             if self._serializer is None:
-                freqs = []
                 self._serializer = Packer(self, self.serializer_freqs)
             return self._serializer
 