Merge pull request #1466 from explosion/feature/rename-pipeline

💫 Clean up dead linear model code

commit 287a3ca256

 setup.py | 5 -----
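In brief: the commit deletes the old linear-model (AveragedPerceptron) parser implementation and renames the neural pipeline components to take over the plain names. The renames, as reflected in the diff below:

    BaseThincComponent     -> Pipe
    TokenVectorEncoder     -> Tensorizer
    NeuralTagger           -> Tagger
    NeuralLabeller         -> MultitaskObjective
    NeuralDependencyParser -> DependencyParser
    NeuralEntityRecognizer -> EntityRecognizer

Deleted outright: spacy/_cfile, spacy/cfile, spacy/syntax/_parse_features, spacy/syntax/beam_parser and spacy/syntax/parser.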
setup.py

@@ -30,19 +30,14 @@ MOD_NAMES = [
     'spacy.syntax._state',
     'spacy.syntax._beam_utils',
     'spacy.tokenizer',
-    'spacy._cfile',
-    'spacy.syntax.parser',
     'spacy.syntax.nn_parser',
-    'spacy.syntax.beam_parser',
     'spacy.syntax.nonproj',
     'spacy.syntax.transition_system',
     'spacy.syntax.arc_eager',
-    'spacy.syntax._parse_features',
     'spacy.gold',
     'spacy.tokens.doc',
     'spacy.tokens.span',
     'spacy.tokens.token',
-    'spacy.cfile',
     'spacy.matcher',
     'spacy.syntax.ner',
     'spacy.symbols',
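MOD_NAMES is the list of Cython modules that setup.py compiles, so dropping a name here stops the corresponding .pyx from being built. A rough sketch of how such a list typically drives the build; the glue below is illustrative, not spaCy's exact setup code:

    from setuptools import Extension

    ext_modules = []
    for name in MOD_NAMES:  # e.g. 'spacy.tokenizer'
        mod_path = name.replace('.', '/') + '.cpp'  # Cython output, compiled as C++
        ext_modules.append(Extension(name, [mod_path], language='c++'))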
spacy/_cfile.pxd

@@ -1,26 +0,0 @@
from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
from cymem.cymem cimport Pool

cdef class CFile:
    cdef FILE* fp
    cdef bint is_open
    cdef Pool mem
    cdef int size # For compatibility with subclass
    cdef int _capacity # For compatibility with subclass

    cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1

    cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1

    cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *


cdef class StringCFile(CFile):
    cdef unsigned char* data

    cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1

    cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1

    cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *
spacy/_cfile.pyx

@@ -1,88 +0,0 @@
from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
from libc.string cimport memcpy


cdef class CFile:
    def __init__(self, loc, mode, on_open_error=None):
        if isinstance(mode, unicode):
            mode_str = mode.encode('ascii')
        else:
            mode_str = mode
        if hasattr(loc, 'as_posix'):
            loc = loc.as_posix()
        self.mem = Pool()
        cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
        self.fp = fopen(<char*>bytes_loc, mode_str)
        if self.fp == NULL:
            if on_open_error is not None:
                on_open_error()
            else:
                raise IOError("Could not open binary file %s" % bytes_loc)
        self.is_open = True

    def __dealloc__(self):
        if self.is_open:
            fclose(self.fp)

    def close(self):
        fclose(self.fp)
        self.is_open = False

    cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1:
        st = fread(dest, elem_size, number, self.fp)
        if st != number:
            raise IOError

    cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1:
        st = fwrite(src, elem_size, number, self.fp)
        if st != number:
            raise IOError

    cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *:
        cdef void* dest = mem.alloc(number, elem_size)
        self.read_into(dest, number, elem_size)
        return dest

    def write_unicode(self, unicode value):
        cdef bytes py_bytes = value.encode('utf8')
        cdef char* chars = <char*>py_bytes
        self.write(sizeof(char), len(py_bytes), chars)


cdef class StringCFile:
    def __init__(self, mode, bytes data=b'', on_open_error=None):
        self.mem = Pool()
        self.is_open = 'w' in mode
        self._capacity = max(len(data), 8)
        self.size = len(data)
        self.data = <unsigned char*>self.mem.alloc(1, self._capacity)
        for i in range(len(data)):
            self.data[i] = data[i]

    def close(self):
        self.is_open = False

    def string_data(self):
        return (self.data-self.size)[:self.size]

    cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1:
        memcpy(dest, self.data, elem_size * number)
        self.data += elem_size * number

    cdef int write_from(self, void* src, size_t elem_size, size_t number) except -1:
        write_size = number * elem_size
        if (self.size + write_size) >= self._capacity:
            self._capacity = (self.size + write_size) * 2
            self.data = <unsigned char*>self.mem.realloc(self.data, self._capacity)
        memcpy(&self.data[self.size], src, elem_size * number)
        self.size += write_size

    cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *:
        cdef void* dest = mem.alloc(number, elem_size)
        self.read_into(dest, number, elem_size)
        return dest

    def write_unicode(self, unicode value):
        cdef bytes py_bytes = value.encode('utf8')
        cdef char* chars = <char*>py_bytes
        self.write(sizeof(char), len(py_bytes), chars)
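The StringCFile above is an in-memory stand-in for CFile, so the same serialization code could target either a real file or a growable byte buffer. A pure-Python analogue of its append logic, purely illustrative (GrowBuffer is a made-up name, not spaCy API):

    class GrowBuffer(object):
        def __init__(self, data=b''):
            self._capacity = max(len(data), 8)
            self.size = len(data)
            self.data = bytearray(self._capacity)
            self.data[:self.size] = data

        def write_from(self, src):
            write_size = len(src)
            if self.size + write_size >= self._capacity:
                # Double past the required size, mirroring the realloc branch.
                self._capacity = (self.size + write_size) * 2
                new_data = bytearray(self._capacity)
                new_data[:self.size] = self.data[:self.size]
                self.data = new_data
            self.data[self.size:self.size + write_size] = src
            self.size += write_size

    buf = GrowBuffer()
    buf.write_from(b'spacy')
    assert bytes(buf.data[:buf.size]) == b'spacy'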
spacy/cfile.pxd

@@ -1,33 +0,0 @@
from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
from cymem.cymem cimport Pool

cdef class CFile:
    cdef FILE* fp
    cdef unsigned char* data
    cdef int is_open
    cdef Pool mem
    cdef int size # For compatibility with subclass
    cdef int i # For compatibility with subclass
    cdef int _capacity # For compatibility with subclass

    cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1

    cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1

    cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *


cdef class StringCFile:
    cdef unsigned char* data
    cdef int is_open
    cdef Pool mem
    cdef int size # For compatibility with subclass
    cdef int i # For compatibility with subclass
    cdef int _capacity # For compatibility with subclass

    cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1

    cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1

    cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *
spacy/cfile.pyx | 103 ----------

@@ -1,103 +0,0 @@
# coding: utf8
from __future__ import unicode_literals

from libc.stdio cimport fopen, fclose, fread, fwrite
from libc.string cimport memcpy


cdef class CFile:
    def __init__(self, loc, mode, on_open_error=None):
        if isinstance(mode, unicode):
            mode_str = mode.encode('ascii')
        else:
            mode_str = mode
        if hasattr(loc, 'as_posix'):
            loc = loc.as_posix()
        self.mem = Pool()
        cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
        self.fp = fopen(<char*>bytes_loc, mode_str)
        if self.fp == NULL:
            if on_open_error is not None:
                on_open_error()
            else:
                raise IOError("Could not open binary file %s" % bytes_loc)
        self.is_open = True

    def __dealloc__(self):
        if self.is_open:
            fclose(self.fp)

    def close(self):
        fclose(self.fp)
        self.is_open = False

    cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1:
        st = fread(dest, elem_size, number, self.fp)
        if st != number:
            raise IOError

    cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1:
        st = fwrite(src, elem_size, number, self.fp)
        if st != number:
            raise IOError

    cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *:
        cdef void* dest = mem.alloc(number, elem_size)
        self.read_into(dest, number, elem_size)
        return dest

    def write_unicode(self, unicode value):
        cdef bytes py_bytes = value.encode('utf8')
        cdef char* chars = <char*>py_bytes
        self.write(sizeof(char), len(py_bytes), chars)


cdef class StringCFile:
    def __init__(self, bytes data, mode, on_open_error=None):
        self.mem = Pool()
        self.is_open = 1 if 'w' in mode else 0
        self._capacity = max(len(data), 8)
        self.size = len(data)
        self.i = 0
        self.data = <unsigned char*>self.mem.alloc(1, self._capacity)
        for i in range(len(data)):
            self.data[i] = data[i]

    def __dealloc__(self):
        # Important to override this -- or
        # we try to close a non-existant file pointer!
        pass

    def close(self):
        self.is_open = False

    def string_data(self):
        cdef bytes byte_string = b'\0' * (self.size)
        bytes_ptr = <char*>byte_string
        for i in range(self.size):
            bytes_ptr[i] = self.data[i]
        print(byte_string)
        return byte_string

    cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1:
        if self.i+(number * elem_size) < self.size:
            memcpy(dest, &self.data[self.i], elem_size * number)
            self.i += elem_size * number

    cdef int write_from(self, void* src, size_t elem_size, size_t number) except -1:
        write_size = number * elem_size
        if (self.size + write_size) >= self._capacity:
            self._capacity = (self.size + write_size) * 2
            self.data = <unsigned char*>self.mem.realloc(self.data, self._capacity)
        memcpy(&self.data[self.size], src, write_size)
        self.size += write_size

    cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *:
        cdef void* dest = mem.alloc(number, elem_size)
        self.read_into(dest, number, elem_size)
        return dest

    def write_unicode(self, unicode value):
        cdef bytes py_bytes = value.encode('utf8')
        cdef char* chars = <char*>py_bytes
        self.write(sizeof(char), len(py_bytes), chars)
spacy/language.py

@@ -16,10 +16,9 @@ from .tokenizer import Tokenizer
 from .vocab import Vocab
 from .tagger import Tagger
 from .lemmatizer import Lemmatizer
-from .syntax.parser import get_templates
-
-from .pipeline import NeuralDependencyParser, TokenVectorEncoder, NeuralTagger
-from .pipeline import NeuralEntityRecognizer, SimilarityHook, TextCategorizer
+from .pipeline import DependencyParser, Tensorizer, Tagger
+from .pipeline import EntityRecognizer, SimilarityHook, TextCategorizer
+
 from .compat import json_dumps, izip, copy_reg
 from .scorer import Scorer
@@ -75,9 +74,6 @@ class BaseDefaults(object):
     infixes = tuple(TOKENIZER_INFIXES)
     tag_map = dict(TAG_MAP)
     tokenizer_exceptions = {}
-    parser_features = get_templates('parser')
-    entity_features = get_templates('ner')
-    tagger_features = Tagger.feature_templates # TODO -- fix this
     stop_words = set()
     lemma_rules = {}
     lemma_exc = {}
@@ -102,9 +98,9 @@ class Language(object):
     factories = {
         'tokenizer': lambda nlp: nlp.Defaults.create_tokenizer(nlp),
         'tensorizer': lambda nlp, **cfg: TokenVectorEncoder(nlp.vocab, **cfg),
-        'tagger': lambda nlp, **cfg: NeuralTagger(nlp.vocab, **cfg),
-        'parser': lambda nlp, **cfg: NeuralDependencyParser(nlp.vocab, **cfg),
-        'ner': lambda nlp, **cfg: NeuralEntityRecognizer(nlp.vocab, **cfg),
+        'tagger': lambda nlp, **cfg: Tagger(nlp.vocab, **cfg),
+        'parser': lambda nlp, **cfg: DependencyParser(nlp.vocab, **cfg),
+        'ner': lambda nlp, **cfg: EntityRecognizer(nlp.vocab, **cfg),
         'similarity': lambda nlp, **cfg: SimilarityHook(nlp.vocab, **cfg),
         'textcat': lambda nlp, **cfg: TextCategorizer(nlp.vocab, **cfg)
     }
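The factories dict is what Language consults when it assembles a pipeline from component names, so this hunk changes which classes the standard names construct. A minimal illustrative use, assuming the API at this commit:

    from spacy.language import Language

    nlp = Language()                            # builds a default Vocab
    tagger = Language.factories['tagger'](nlp)  # now constructs Tagger, not NeuralTagger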
spacy/pipeline.pxd

@@ -1,21 +0,0 @@
from .syntax.parser cimport Parser
#from .syntax.beam_parser cimport BeamParser
from .syntax.ner cimport BiluoPushDown
from .syntax.arc_eager cimport ArcEager
from .tagger cimport Tagger


cdef class EntityRecognizer(Parser):
    pass


cdef class DependencyParser(Parser):
    pass


#cdef class BeamEntityRecognizer(BeamParser):
#    pass
#
#
#cdef class BeamDependencyParser(BeamParser):
#    pass
spacy/pipeline.pyx

@@ -26,11 +26,8 @@ from thinc.neural.util import to_categorical
 from thinc.neural._classes.difference import Siamese, CauchySimilarity

 from .tokens.doc cimport Doc
-from .syntax.parser cimport Parser as LinearParser
-from .syntax.nn_parser cimport Parser as NeuralParser
+from .syntax.nn_parser cimport Parser
 from .syntax import nonproj
-from .syntax.parser import get_templates as get_feature_templates
-from .syntax.beam_parser cimport BeamParser
 from .syntax.ner cimport BiluoPushDown
 from .syntax.arc_eager cimport ArcEager
 from .tagger import Tagger
@@ -86,7 +83,7 @@ class SentenceSegmenter(object):
             yield doc[start : len(doc)]


-class BaseThincComponent(object):
+class Pipe(object):
     name = None

     @classmethod
@@ -217,7 +214,7 @@ def _load_cfg(path):
         return {}


-class TokenVectorEncoder(BaseThincComponent):
+class Tensorizer(Pipe):
     """Assign position-sensitive vectors to tokens, using a CNN or RNN."""
     name = 'tensorizer'

@@ -329,7 +326,7 @@ class TokenVectorEncoder(BaseThincComponent):
         link_vectors_to_models(self.vocab)


-class NeuralTagger(BaseThincComponent):
+class Tagger(Pipe):
     name = 'tagger'
     def __init__(self, vocab, model=True, **cfg):
         self.vocab = vocab
@@ -513,7 +510,11 @@ class NeuralTagger(BaseThincComponent):
         return self


-class NeuralLabeller(NeuralTagger):
+class MultitaskObjective(Tagger):
+    '''Assist training of a parser or tagger, by training a side-objective.
+
+    Experimental
+    '''
     name = 'nn_labeller'
     def __init__(self, vocab, model=True, target='dep_tag_offset', **cfg):
         self.vocab = vocab
@@ -532,7 +533,7 @@ class NeuralLabeller(NeuralTagger):
             self.make_label = target
         else:
             raise ValueError(
-                "NeuralLabeller target should be function or one of "
+                "MultitaskObjective target should be function or one of "
                 "['dep', 'tag', 'ent', 'dep_tag_offset', 'ent_tag']")
         self.cfg = dict(cfg)
         self.cfg.setdefault('cnn_maxout_pieces', 2)
@@ -622,7 +623,7 @@ class NeuralLabeller(NeuralTagger):
         return '%s-%s' % (tags[i], ents[i])


-class SimilarityHook(BaseThincComponent):
+class SimilarityHook(Pipe):
     """
     Experimental

@@ -674,7 +675,7 @@ class SimilarityHook(BaseThincComponent):
         link_vectors_to_models(self.vocab)


-class TextCategorizer(BaseThincComponent):
+class TextCategorizer(Pipe):
     name = 'textcat'

     @classmethod
@@ -752,45 +753,7 @@ class TextCategorizer(BaseThincComponent):
         link_vectors_to_models(self.vocab)


-cdef class EntityRecognizer(LinearParser):
-    """Annotate named entities on Doc objects."""
-    TransitionSystem = BiluoPushDown
-
-    feature_templates = get_feature_templates('ner')
-
-    def add_label(self, label):
-        LinearParser.add_label(self, label)
-        if isinstance(label, basestring):
-            label = self.vocab.strings[label]
-
-
-cdef class BeamEntityRecognizer(BeamParser):
-    """Annotate named entities on Doc objects."""
-    TransitionSystem = BiluoPushDown
-
-    feature_templates = get_feature_templates('ner')
-
-    def add_label(self, label):
-        LinearParser.add_label(self, label)
-        if isinstance(label, basestring):
-            label = self.vocab.strings[label]
-
-
-cdef class DependencyParser(LinearParser):
-    TransitionSystem = ArcEager
-    feature_templates = get_feature_templates('basic')
-
-    def add_label(self, label):
-        LinearParser.add_label(self, label)
-        if isinstance(label, basestring):
-            label = self.vocab.strings[label]
-
-    @property
-    def postprocesses(self):
-        return [nonproj.deprojectivize]
-
-
-cdef class NeuralDependencyParser(NeuralParser):
+cdef class DependencyParser(Parser):
     name = 'parser'
     TransitionSystem = ArcEager
@@ -800,17 +763,17 @@ cdef class NeuralDependencyParser(NeuralParser):

     def init_multitask_objectives(self, gold_tuples, pipeline, **cfg):
         for target in []:
-            labeller = NeuralLabeller(self.vocab, target=target)
+            labeller = MultitaskObjective(self.vocab, target=target)
             tok2vec = self.model[0]
             labeller.begin_training(gold_tuples, pipeline=pipeline, tok2vec=tok2vec)
             pipeline.append(labeller)
             self._multitasks.append(labeller)

     def __reduce__(self):
-        return (NeuralDependencyParser, (self.vocab, self.moves, self.model), None, None)
+        return (DependencyParser, (self.vocab, self.moves, self.model), None, None)


-cdef class NeuralEntityRecognizer(NeuralParser):
+cdef class EntityRecognizer(Parser):
     name = 'ner'
     TransitionSystem = BiluoPushDown
@@ -818,31 +781,14 @@ cdef class NeuralEntityRecognizer(NeuralParser):

     def init_multitask_objectives(self, gold_tuples, pipeline, **cfg):
         for target in []:
-            labeller = NeuralLabeller(self.vocab, target=target)
+            labeller = MultitaskObjective(self.vocab, target=target)
             tok2vec = self.model[0]
             labeller.begin_training(gold_tuples, pipeline=pipeline, tok2vec=tok2vec)
             pipeline.append(labeller)
             self._multitasks.append(labeller)

     def __reduce__(self):
-        return (NeuralEntityRecognizer, (self.vocab, self.moves, self.model), None, None)
+        return (EntityRecognizer, (self.vocab, self.moves, self.model), None, None)


-cdef class BeamDependencyParser(BeamParser):
-    TransitionSystem = ArcEager
-
-    feature_templates = get_feature_templates('basic')
-
-    def add_label(self, label):
-        Parser.add_label(self, label)
-        if isinstance(label, basestring):
-            label = self.vocab.strings[label]
-
-    @property
-    def postprocesses(self):
-        return [nonproj.deprojectivize]
-
-
-__all__ = ['Tagger', 'DependencyParser', 'EntityRecognizer', 'BeamDependencyParser',
-           'BeamEntityRecognizer', 'TokenVectorEnoder']
+__all__ = ['Tagger', 'DependencyParser', 'EntityRecognizer', 'Tensorizer']
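With BaseThincComponent renamed to Pipe, custom components subclass the new name but follow the same pattern. A minimal hypothetical component (the class and its behaviour are illustrative, not part of this commit):

    from spacy.pipeline import Pipe

    class LengthTagger(Pipe):
        """Toy component: records token lengths in doc.user_data."""
        name = 'length_tagger'

        def __init__(self, vocab, **cfg):
            self.vocab = vocab
            self.cfg = dict(cfg)

        def __call__(self, doc):
            doc.user_data['token_lengths'] = [len(t.text) for t in doc]
            return doc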
spacy/syntax/_parse_features.pxd

@@ -1,259 +0,0 @@
from thinc.typedefs cimport atom_t

from .stateclass cimport StateClass
from ._state cimport StateC


cdef int fill_context(atom_t* context, const StateC* state) nogil
# Context elements

# Ensure each token's attributes are listed: w, p, c, c6, c4. The order
# is referenced by incrementing the enum...

# Tokens are listed in left-to-right order.
#cdef size_t* SLOTS = [
#    S2w, S1w,
#    S0l0w, S0l2w, S0lw,
#    S0w,
#    S0r0w, S0r2w, S0rw,
#    N0l0w, N0l2w, N0lw,
#    P2w, P1w,
#    N0w, N1w, N2w, N3w, 0
#]

# NB: The order of the enum is _NOT_ arbitrary!!
cpdef enum:
    S2w
    S2W
    S2p
    S2c
    S2c4
    S2c6
    S2L
    S2_prefix
    S2_suffix
    S2_shape
    S2_ne_iob
    S2_ne_type

    S1w
    S1W
    S1p
    S1c
    S1c4
    S1c6
    S1L
    S1_prefix
    S1_suffix
    S1_shape
    S1_ne_iob
    S1_ne_type

    S1rw
    S1rW
    S1rp
    S1rc
    S1rc4
    S1rc6
    S1rL
    S1r_prefix
    S1r_suffix
    S1r_shape
    S1r_ne_iob
    S1r_ne_type

    S0lw
    S0lW
    S0lp
    S0lc
    S0lc4
    S0lc6
    S0lL
    S0l_prefix
    S0l_suffix
    S0l_shape
    S0l_ne_iob
    S0l_ne_type

    S0l2w
    S0l2W
    S0l2p
    S0l2c
    S0l2c4
    S0l2c6
    S0l2L
    S0l2_prefix
    S0l2_suffix
    S0l2_shape
    S0l2_ne_iob
    S0l2_ne_type

    S0w
    S0W
    S0p
    S0c
    S0c4
    S0c6
    S0L
    S0_prefix
    S0_suffix
    S0_shape
    S0_ne_iob
    S0_ne_type

    S0r2w
    S0r2W
    S0r2p
    S0r2c
    S0r2c4
    S0r2c6
    S0r2L
    S0r2_prefix
    S0r2_suffix
    S0r2_shape
    S0r2_ne_iob
    S0r2_ne_type

    S0rw
    S0rW
    S0rp
    S0rc
    S0rc4
    S0rc6
    S0rL
    S0r_prefix
    S0r_suffix
    S0r_shape
    S0r_ne_iob
    S0r_ne_type

    N0l2w
    N0l2W
    N0l2p
    N0l2c
    N0l2c4
    N0l2c6
    N0l2L
    N0l2_prefix
    N0l2_suffix
    N0l2_shape
    N0l2_ne_iob
    N0l2_ne_type

    N0lw
    N0lW
    N0lp
    N0lc
    N0lc4
    N0lc6
    N0lL
    N0l_prefix
    N0l_suffix
    N0l_shape
    N0l_ne_iob
    N0l_ne_type

    N0w
    N0W
    N0p
    N0c
    N0c4
    N0c6
    N0L
    N0_prefix
    N0_suffix
    N0_shape
    N0_ne_iob
    N0_ne_type

    N1w
    N1W
    N1p
    N1c
    N1c4
    N1c6
    N1L
    N1_prefix
    N1_suffix
    N1_shape
    N1_ne_iob
    N1_ne_type

    N2w
    N2W
    N2p
    N2c
    N2c4
    N2c6
    N2L
    N2_prefix
    N2_suffix
    N2_shape
    N2_ne_iob
    N2_ne_type

    P1w
    P1W
    P1p
    P1c
    P1c4
    P1c6
    P1L
    P1_prefix
    P1_suffix
    P1_shape
    P1_ne_iob
    P1_ne_type

    P2w
    P2W
    P2p
    P2c
    P2c4
    P2c6
    P2L
    P2_prefix
    P2_suffix
    P2_shape
    P2_ne_iob
    P2_ne_type

    E0w
    E0W
    E0p
    E0c
    E0c4
    E0c6
    E0L
    E0_prefix
    E0_suffix
    E0_shape
    E0_ne_iob
    E0_ne_type

    E1w
    E1W
    E1p
    E1c
    E1c4
    E1c6
    E1L
    E1_prefix
    E1_suffix
    E1_shape
    E1_ne_iob
    E1_ne_type

    # Misc features at the end
    dist
    N0lv
    S0lv
    S0rv
    S1lv
    S1rv

    S0_has_head
    S1_has_head
    S2_has_head

    CONTEXT_SIZE
spacy/syntax/_parse_features.pyx

@@ -1,419 +0,0 @@
"""
Fill an array, context, with every _atomic_ value our features reference.
We then write the _actual features_ as tuples of the atoms. The machinery
that translates from the tuples to feature-extractors (which pick the values
out of "context") is in features/extractor.pyx

The atomic feature names are listed in a big enum, so that the feature tuples
can refer to them.
"""
# coding: utf-8
from __future__ import unicode_literals

from libc.string cimport memset
from itertools import combinations
from cymem.cymem cimport Pool

from ..structs cimport TokenC
from .stateclass cimport StateClass
from ._state cimport StateC


cdef inline void fill_token(atom_t* context, const TokenC* token) nogil:
    if token is NULL:
        context[0] = 0
        context[1] = 0
        context[2] = 0
        context[3] = 0
        context[4] = 0
        context[5] = 0
        context[6] = 0
        context[7] = 0
        context[8] = 0
        context[9] = 0
        context[10] = 0
        context[11] = 0
    else:
        context[0] = token.lex.orth
        context[1] = token.lemma
        context[2] = token.tag
        context[3] = token.lex.cluster
        # We've read in the string little-endian, so now we can take & (2**n)-1
        # to get the first n bits of the cluster.
        # e.g. s = "1110010101"
        # s = ''.join(reversed(s))
        # first_4_bits = int(s, 2)
        # print first_4_bits
        # 5
        # print "{0:b}".format(prefix).ljust(4, '0')
        # 1110
        # What we're doing here is picking a number where all bits are 1, e.g.
        # 15 is 1111, 63 is 111111 and doing bitwise AND, so getting all bits in
        # the source that are set to 1.
        context[4] = token.lex.cluster & 15
        context[5] = token.lex.cluster & 63
        context[6] = token.dep if token.head != 0 else 0
        context[7] = token.lex.prefix
        context[8] = token.lex.suffix
        context[9] = token.lex.shape
        context[10] = token.ent_iob
        context[11] = token.ent_type

cdef int fill_context(atom_t* ctxt, const StateC* st) nogil:
    # Take care to fill every element of context!
    # We could memset, but this makes it very easy to have broken features that
    # make almost no impact on accuracy. If instead they're unset, the impact
    # tends to be dramatic, so we get an obvious regression to fix...
    fill_token(&ctxt[S2w], st.S_(2))
    fill_token(&ctxt[S1w], st.S_(1))
    fill_token(&ctxt[S1rw], st.R_(st.S(1), 1))
    fill_token(&ctxt[S0lw], st.L_(st.S(0), 1))
    fill_token(&ctxt[S0l2w], st.L_(st.S(0), 2))
    fill_token(&ctxt[S0w], st.S_(0))
    fill_token(&ctxt[S0r2w], st.R_(st.S(0), 2))
    fill_token(&ctxt[S0rw], st.R_(st.S(0), 1))
    fill_token(&ctxt[N0lw], st.L_(st.B(0), 1))
    fill_token(&ctxt[N0l2w], st.L_(st.B(0), 2))
    fill_token(&ctxt[N0w], st.B_(0))
    fill_token(&ctxt[N1w], st.B_(1))
    fill_token(&ctxt[N2w], st.B_(2))
    fill_token(&ctxt[P1w], st.safe_get(st.B(0)-1))
    fill_token(&ctxt[P2w], st.safe_get(st.B(0)-2))

    fill_token(&ctxt[E0w], st.E_(0))
    fill_token(&ctxt[E1w], st.E_(1))

    if st.stack_depth() >= 1 and not st.eol():
        ctxt[dist] = min_(st.B(0) - st.E(0), 5)
    else:
        ctxt[dist] = 0
    ctxt[N0lv] = min_(st.n_L(st.B(0)), 5)
    ctxt[S0lv] = min_(st.n_L(st.S(0)), 5)
    ctxt[S0rv] = min_(st.n_R(st.S(0)), 5)
    ctxt[S1lv] = min_(st.n_L(st.S(1)), 5)
    ctxt[S1rv] = min_(st.n_R(st.S(1)), 5)

    ctxt[S0_has_head] = 0
    ctxt[S1_has_head] = 0
    ctxt[S2_has_head] = 0
    if st.stack_depth() >= 1:
        ctxt[S0_has_head] = st.has_head(st.S(0)) + 1
        if st.stack_depth() >= 2:
            ctxt[S1_has_head] = st.has_head(st.S(1)) + 1
            if st.stack_depth() >= 3:
                ctxt[S2_has_head] = st.has_head(st.S(2)) + 1


cdef inline int min_(int a, int b) nogil:
    return a if a > b else b


ner = (
    (N0W,),
    (P1W,),
    (N1W,),
    (P2W,),
    (N2W,),

    (P1W, N0W,),
    (N0W, N1W),

    (N0_prefix,),
    (N0_suffix,),

    (P1_shape,),
    (N0_shape,),
    (N1_shape,),
    (P1_shape, N0_shape,),
    (N0_shape, P1_shape,),
    (P1_shape, N0_shape, N1_shape),
    (N2_shape,),
    (P2_shape,),

    #(P2_norm, P1_norm, W_norm),
    #(P1_norm, W_norm, N1_norm),
    #(W_norm, N1_norm, N2_norm)

    (P2p,),
    (P1p,),
    (N0p,),
    (N1p,),
    (N2p,),

    (P1p, N0p),
    (N0p, N1p),
    (P2p, P1p, N0p),
    (P1p, N0p, N1p),
    (N0p, N1p, N2p),

    (P2c,),
    (P1c,),
    (N0c,),
    (N1c,),
    (N2c,),

    (P1c, N0c),
    (N0c, N1c),

    (E0W,),
    (E0c,),
    (E0p,),

    (E0W, N0W),
    (E0c, N0W),
    (E0p, N0W),

    (E0p, P1p, N0p),
    (E0c, P1c, N0c),

    (E0w, P1c),
    (E0p, P1p),
    (E0c, P1c),
    (E0p, E1p),
    (E0c, P1p),

    (E1W,),
    (E1c,),
    (E1p,),

    (E0W, E1W),
    (E0W, E1p,),
    (E0p, E1W,),
    (E0p, E1W),

    (P1_ne_iob,),
    (P1_ne_iob, P1_ne_type),
    (N0w, P1_ne_iob, P1_ne_type),

    (N0_shape,),
    (N1_shape,),
    (N2_shape,),
    (P1_shape,),
    (P2_shape,),

    (N0_prefix,),
    (N0_suffix,),

    (P1_ne_iob,),
    (P2_ne_iob,),
    (P1_ne_iob, P2_ne_iob),
    (P1_ne_iob, P1_ne_type),
    (P2_ne_iob, P2_ne_type),
    (N0w, P1_ne_iob, P1_ne_type),

    (N0w, N1w),
)


unigrams = (
    (S2W, S2p),
    (S2c6, S2p),

    (S1W, S1p),
    (S1c6, S1p),

    (S0W, S0p),
    (S0c6, S0p),

    (N0W, N0p),
    (N0p,),
    (N0c,),
    (N0c6, N0p),
    (N0L,),

    (N1W, N1p),
    (N1c6, N1p),

    (N2W, N2p),
    (N2c6, N2p),

    (S0r2W, S0r2p),
    (S0r2c6, S0r2p),
    (S0r2L,),

    (S0rW, S0rp),
    (S0rc6, S0rp),
    (S0rL,),

    (S0l2W, S0l2p),
    (S0l2c6, S0l2p),
    (S0l2L,),

    (S0lW, S0lp),
    (S0lc6, S0lp),
    (S0lL,),

    (N0l2W, N0l2p),
    (N0l2c6, N0l2p),
    (N0l2L,),

    (N0lW, N0lp),
    (N0lc6, N0lp),
    (N0lL,),
)


s0_n0 = (
    (S0W, S0p, N0W, N0p),
    (S0c, S0p, N0c, N0p),
    (S0c6, S0p, N0c6, N0p),
    (S0c4, S0p, N0c4, N0p),
    (S0p, N0p),
    (S0W, N0p),
    (S0p, N0W),
    (S0W, N0c),
    (S0c, N0W),
    (S0p, N0c),
    (S0c, N0p),
    (S0W, S0rp, N0p),
    (S0p, S0rp, N0p),
    (S0p, N0lp, N0W),
    (S0p, N0lp, N0p),
    (S0L, N0p),
    (S0p, S0rL, N0p),
    (S0p, N0lL, N0p),
    (S0p, S0rv, N0p),
    (S0p, N0lv, N0p),
    (S0c6, S0rL, S0r2L, N0p),
    (S0p, N0lL, N0l2L, N0p),
)


s1_s0 = (
    (S1p, S0p),
    (S1p, S0p, S0_has_head),
    (S1W, S0p),
    (S1W, S0p, S0_has_head),
    (S1c, S0p),
    (S1c, S0p, S0_has_head),
    (S1p, S1rL, S0p),
    (S1p, S1rL, S0p, S0_has_head),
    (S1p, S0lL, S0p),
    (S1p, S0lL, S0p, S0_has_head),
    (S1p, S0lL, S0l2L, S0p),
    (S1p, S0lL, S0l2L, S0p, S0_has_head),
    (S1L, S0L, S0W),
    (S1L, S0L, S0p),
    (S1p, S1L, S0L, S0p),
    (S1p, S0p),
)


s1_n0 = (
    (S1p, N0p),
    (S1c, N0c),
    (S1c, N0p),
    (S1p, N0c),
    (S1W, S1p, N0p),
    (S1p, N0W, N0p),
    (S1c6, S1p, N0c6, N0p),
    (S1L, N0p),
    (S1p, S1rL, N0p),
    (S1p, S1rp, N0p),
)


s0_n1 = (
    (S0p, N1p),
    (S0c, N1c),
    (S0c, N1p),
    (S0p, N1c),
    (S0W, S0p, N1p),
    (S0p, N1W, N1p),
    (S0c6, S0p, N1c6, N1p),
    (S0L, N1p),
    (S0p, S0rL, N1p),
)


n0_n1 = (
    (N0W, N0p, N1W, N1p),
    (N0W, N0p, N1p),
    (N0p, N1W, N1p),
    (N0c, N0p, N1c, N1p),
    (N0c6, N0p, N1c6, N1p),
    (N0c, N1c),
    (N0p, N1c),
)

tree_shape = (
    (dist,),
    (S0p, S0_has_head, S1_has_head, S2_has_head),
    (S0p, S0lv, S0rv),
    (N0p, N0lv),
)

trigrams = (
    (N0p, N1p, N2p),
    (S0p, S0lp, S0l2p),
    (S0p, S0rp, S0r2p),
    (S0p, S1p, S2p),
    (S1p, S0p, N0p),
    (S0p, S0lp, N0p),
    (S0p, N0p, N0lp),
    (N0p, N0lp, N0l2p),

    (S0W, S0p, S0rL, S0r2L),
    (S0p, S0rL, S0r2L),

    (S0W, S0p, S0lL, S0l2L),
    (S0p, S0lL, S0l2L),

    (N0W, N0p, N0lL, N0l2L),
    (N0p, N0lL, N0l2L),
)


words = (
    S2w,
    S1w,
    S1rw,
    S0lw,
    S0l2w,
    S0w,
    S0r2w,
    S0rw,
    N0lw,
    N0l2w,
    N0w,
    N1w,
    N2w,
    P1w,
    P2w
)

tags = (
    S2p,
    S1p,
    S1rp,
    S0lp,
    S0l2p,
    S0p,
    S0r2p,
    S0rp,
    N0lp,
    N0l2p,
    N0p,
    N1p,
    N2p,
    P1p,
    P2p
)

labels = (
    S2L,
    S1L,
    S1rL,
    S0lL,
    S0l2L,
    S0L,
    S0r2L,
    S0rL,
    N0lL,
    N0l2L,
    N0L,
    N1L,
    N2L,
    P1L,
    P2L
)
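The bit trick described in fill_token's comment can be checked in plain Python. A minimal sketch, assuming the Brown cluster bit-string has been parsed little-endian (reversed) into an integer as the comment describes:

    path = "1110010101"                # example Brown cluster path
    cluster = int(path[::-1], 2)       # little-endian read -> 679
    first4 = cluster & 15              # (2**4)-1 keeps the 4 lowest bits
    first6 = cluster & 63              # (2**6)-1 keeps the 6 lowest bits
    assert format(first4, '04b')[::-1] == path[:4]   # '1110'
    assert format(first6, '06b')[::-1] == path[:6]   # '111001'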
spacy/syntax/beam_parser.pxd

@@ -1,10 +0,0 @@
from .parser cimport Parser
from ..structs cimport TokenC
from thinc.typedefs cimport weight_t


cdef class BeamParser(Parser):
    cdef public int beam_width
    cdef public weight_t beam_density

    cdef int _parseC(self, TokenC* tokens, int length, int nr_feat, int nr_class) except -1
spacy/syntax/beam_parser.pyx

@@ -1,239 +0,0 @@
"""
MALT-style dependency parser
"""
# cython: profile=True
# cython: experimental_cpp_class_def=True
# cython: cdivision=True
# cython: infer_types=True
# coding: utf-8

from __future__ import unicode_literals, print_function
cimport cython

from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
from libc.stdint cimport uint32_t, uint64_t
from libc.string cimport memset, memcpy
from libc.stdlib cimport rand
from libc.math cimport log, exp, isnan, isinf
from cymem.cymem cimport Pool, Address
from murmurhash.mrmr cimport real_hash64 as hash64
from thinc.typedefs cimport weight_t, class_t, feat_t, atom_t, hash_t
from thinc.linear.features cimport ConjunctionExtracter
from thinc.structs cimport FeatureC, ExampleC
from thinc.extra.search cimport Beam, MaxViolation
from thinc.extra.eg cimport Example
from thinc.extra.mb cimport Minibatch

from ..structs cimport TokenC
from ..tokens.doc cimport Doc
from ..strings cimport StringStore
from .transition_system cimport TransitionSystem, Transition
from ..gold cimport GoldParse
from . import _parse_features
from ._parse_features cimport CONTEXT_SIZE
from ._parse_features cimport fill_context
from .stateclass cimport StateClass
from .parser cimport Parser


DEBUG = False
def set_debug(val):
    global DEBUG
    DEBUG = val


def get_templates(name):
    pf = _parse_features
    if name == 'ner':
        return pf.ner
    elif name == 'debug':
        return pf.unigrams
    else:
        return (pf.unigrams + pf.s0_n0 + pf.s1_n0 + pf.s1_s0 + pf.s0_n1 + pf.n0_n1 + \
                pf.tree_shape + pf.trigrams)


cdef int BEAM_WIDTH = 16
cdef weight_t BEAM_DENSITY = 0.001

cdef class BeamParser(Parser):
    def __init__(self, *args, **kwargs):
        self.beam_width = kwargs.get('beam_width', BEAM_WIDTH)
        self.beam_density = kwargs.get('beam_density', BEAM_DENSITY)
        Parser.__init__(self, *args, **kwargs)

    cdef int parseC(self, TokenC* tokens, int length, int nr_feat) nogil:
        with gil:
            self._parseC(tokens, length, nr_feat, self.moves.n_moves)

    cdef int _parseC(self, TokenC* tokens, int length, int nr_feat, int nr_class) except -1:
        cdef Beam beam = Beam(self.moves.n_moves, self.beam_width, min_density=self.beam_density)
        # TODO: How do we handle new labels here? This increases nr_class
        beam.initialize(self.moves.init_beam_state, length, tokens)
        beam.check_done(_check_final_state, NULL)
        if beam.is_done:
            _cleanup(beam)
            return 0
        while not beam.is_done:
            self._advance_beam(beam, None, False)
        state = <StateClass>beam.at(0)
        self.moves.finalize_state(state.c)
        for i in range(length):
            tokens[i] = state.c._sent[i]
        _cleanup(beam)

    def update(self, Doc tokens, GoldParse gold_parse, itn=0):
        self.moves.preprocess_gold(gold_parse)
        cdef Beam pred = Beam(self.moves.n_moves, self.beam_width)
        pred.initialize(self.moves.init_beam_state, tokens.length, tokens.c)
        pred.check_done(_check_final_state, NULL)
        # Hack for NER
        for i in range(pred.size):
            stcls = <StateClass>pred.at(i)
            self.moves.initialize_state(stcls.c)

        cdef Beam gold = Beam(self.moves.n_moves, self.beam_width, min_density=0.0)
        gold.initialize(self.moves.init_beam_state, tokens.length, tokens.c)
        gold.check_done(_check_final_state, NULL)
        violn = MaxViolation()
        while not pred.is_done and not gold.is_done:
            # We search separately here, to allow for ambiguity in the gold parse.
            self._advance_beam(pred, gold_parse, False)
            self._advance_beam(gold, gold_parse, True)
            violn.check_crf(pred, gold)
            if pred.loss > 0 and pred.min_score > (gold.score + self.model.time):
                break
        else:
            # The non-monotonic oracle makes it difficult to ensure final costs are
            # correct. Therefore do final correction
            for i in range(pred.size):
                if self.moves.is_gold_parse(<StateClass>pred.at(i), gold_parse):
                    pred._states[i].loss = 0.0
                elif pred._states[i].loss == 0.0:
                    pred._states[i].loss = 1.0
            violn.check_crf(pred, gold)
        if pred.size < 1:
            raise Exception("No candidates", tokens.length)
        if gold.size < 1:
            raise Exception("No gold", tokens.length)
        if pred.loss == 0:
            self.model.update_from_histories(self.moves, tokens, [(0.0, [])])
        elif True:
            #_check_train_integrity(pred, gold, gold_parse, self.moves)
            histories = list(zip(violn.p_probs, violn.p_hist)) + \
                        list(zip(violn.g_probs, violn.g_hist))
            self.model.update_from_histories(self.moves, tokens, histories, min_grad=0.001**(itn+1))
        else:
            self.model.update_from_histories(self.moves, tokens,
                [(1.0, violn.p_hist[0]), (-1.0, violn.g_hist[0])])
        _cleanup(pred)
        _cleanup(gold)
        return pred.loss

    def _advance_beam(self, Beam beam, GoldParse gold, bint follow_gold):
        cdef atom_t[CONTEXT_SIZE] context
        cdef Pool mem = Pool()
        features = <FeatureC*>mem.alloc(self.model.nr_feat, sizeof(FeatureC))
        if False:
            mb = Minibatch(self.model.widths, beam.size)
            for i in range(beam.size):
                stcls = <StateClass>beam.at(i)
                if stcls.c.is_final():
                    nr_feat = 0
                else:
                    nr_feat = self.model.set_featuresC(context, features, stcls.c)
                    self.moves.set_valid(beam.is_valid[i], stcls.c)
                mb.c.push_back(features, nr_feat, beam.costs[i], beam.is_valid[i], 0)
            self.model(mb)
            for i in range(beam.size):
                memcpy(beam.scores[i], mb.c.scores(i), mb.c.nr_out() * sizeof(beam.scores[i][0]))
        else:
            for i in range(beam.size):
                stcls = <StateClass>beam.at(i)
                if not stcls.is_final():
                    nr_feat = self.model.set_featuresC(context, features, stcls.c)
                    self.moves.set_valid(beam.is_valid[i], stcls.c)
                    self.model.set_scoresC(beam.scores[i], features, nr_feat)
        if gold is not None:
            n_gold = 0
            lines = []
            for i in range(beam.size):
                stcls = <StateClass>beam.at(i)
                if not stcls.c.is_final():
                    self.moves.set_costs(beam.is_valid[i], beam.costs[i], stcls, gold)
                    if follow_gold:
                        for j in range(self.moves.n_moves):
                            if beam.costs[i][j] >= 1:
                                beam.is_valid[i][j] = 0
                                lines.append((stcls.B(0), stcls.B(1),
                                    stcls.B_(0).ent_iob, stcls.B_(1).ent_iob,
                                    stcls.B_(1).sent_start,
                                    j,
                                    beam.is_valid[i][j], 'set invalid',
                                    beam.costs[i][j], self.moves.c[j].move, self.moves.c[j].label))
                            n_gold += 1 if beam.is_valid[i][j] else 0
                        if follow_gold and n_gold == 0:
                            raise Exception("No gold")
        if follow_gold:
            beam.advance(_transition_state, NULL, <void*>self.moves.c)
        else:
            beam.advance(_transition_state, _hash_state, <void*>self.moves.c)
        beam.check_done(_check_final_state, NULL)


# These are passed as callbacks to thinc.search.Beam
cdef int _transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1:
    dest = <StateClass>_dest
    src = <StateClass>_src
    moves = <const Transition*>_moves
    dest.clone(src)
    moves[clas].do(dest.c, moves[clas].label)


cdef int _check_final_state(void* _state, void* extra_args) except -1:
    return (<StateClass>_state).is_final()


def _cleanup(Beam beam):
    for i in range(beam.width):
        Py_XDECREF(<PyObject*>beam._states[i].content)
        Py_XDECREF(<PyObject*>beam._parents[i].content)


cdef hash_t _hash_state(void* _state, void* _) except 0:
    state = <StateClass>_state
    if state.c.is_final():
        return 1
    else:
        return state.c.hash()


def _check_train_integrity(Beam pred, Beam gold, GoldParse gold_parse, TransitionSystem moves):
    for i in range(pred.size):
        if not pred._states[i].is_done or pred._states[i].loss == 0:
            continue
        state = <StateClass>pred.at(i)
        if moves.is_gold_parse(state, gold_parse) == True:
            for dep in gold_parse.orig_annot:
                print(dep[1], dep[3], dep[4])
            print("Cost", pred._states[i].loss)
            for j in range(gold_parse.length):
                print(gold_parse.orig_annot[j][1], state.H(j), moves.strings[state.safe_get(j).dep])
            acts = [moves.c[clas].move for clas in pred.histories[i]]
            labels = [moves.c[clas].label for clas in pred.histories[i]]
            print([moves.move_name(move, label) for move, label in zip(acts, labels)])
            raise Exception("Predicted state is gold-standard")
    for i in range(gold.size):
        if not gold._states[i].is_done:
            continue
        state = <StateClass>gold.at(i)
        if moves.is_gold(state, gold_parse) == False:
            print("Truth")
            for dep in gold_parse.orig_annot:
                print(dep[1], dep[3], dep[4])
            print("Predicted good")
            for j in range(gold_parse.length):
                print(gold_parse.orig_annot[j][1], state.H(j), moves.strings[state.safe_get(j).dep])
            raise Exception("Gold parse is not gold-standard")
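The deleted BeamParser keeps the `beam_width` best partial parses per step via thinc's Beam, and trains with a max-violation update over the predicted and gold beams. As rough orientation, a generic beam-search skeleton in plain Python; this mirrors only the shape of the _advance_beam loop, not the thinc API:

    import heapq

    def beam_search(initial, score_moves, is_final, width=16):
        beam = [(0.0, initial)]
        while not all(is_final(state) for _, state in beam):
            candidates = []
            for score, state in beam:
                if is_final(state):
                    candidates.append((score, state))
                    continue
                for move_score, next_state in score_moves(state):
                    candidates.append((score + move_score, next_state))
            # Keep the `width` highest-scoring states, as Beam.advance does.
            beam = heapq.nlargest(width, candidates, key=lambda c: c[0])
        return max(beam, key=lambda c: c[0])[1]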
spacy/syntax/nn_parser.pyx

@@ -54,9 +54,6 @@ from .._ml import link_vectors_to_models
 from .._ml import HistoryFeatures
 from ..compat import json_dumps, copy_array

-from . import _parse_features
-from ._parse_features cimport CONTEXT_SIZE
-from ._parse_features cimport fill_context
 from .stateclass cimport StateClass
 from ._state cimport StateC
 from . import nonproj
spacy/syntax/parser.pxd

@@ -1,24 +0,0 @@
from thinc.linear.avgtron cimport AveragedPerceptron
from thinc.typedefs cimport atom_t
from thinc.structs cimport FeatureC

from .stateclass cimport StateClass
from .arc_eager cimport TransitionSystem
from ..vocab cimport Vocab
from ..tokens.doc cimport Doc
from ..structs cimport TokenC
from ._state cimport StateC


cdef class ParserModel(AveragedPerceptron):
    cdef int set_featuresC(self, atom_t* context, FeatureC* features,
                           const StateC* state) nogil


cdef class Parser:
    cdef readonly Vocab vocab
    cdef readonly ParserModel model
    cdef readonly TransitionSystem moves
    cdef readonly object cfg

    cdef int parseC(self, TokenC* tokens, int length, int nr_feat) nogil
@ -1,526 +0,0 @@
|
|||
"""
|
||||
MALT-style dependency parser
|
||||
"""
|
||||
# coding: utf-8
|
||||
# cython: infer_types=True
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from collections import Counter
|
||||
import ujson
|
||||
|
||||
cimport cython
|
||||
cimport cython.parallel
|
||||
|
||||
import numpy.random
|
||||
|
||||
from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
|
||||
from cpython.exc cimport PyErr_CheckSignals
|
||||
from libc.stdint cimport uint32_t, uint64_t
|
||||
from libc.string cimport memset, memcpy
|
||||
from libc.stdlib cimport malloc, calloc, free
|
||||
from thinc.typedefs cimport weight_t, class_t, feat_t, atom_t, hash_t
|
||||
from thinc.linear.avgtron cimport AveragedPerceptron
|
||||
from thinc.linalg cimport VecVec
|
||||
from thinc.structs cimport SparseArrayC, FeatureC, ExampleC
|
||||
from thinc.extra.eg cimport Example
|
||||
from cymem.cymem cimport Pool, Address
|
||||
from murmurhash.mrmr cimport hash64
|
||||
from preshed.maps cimport MapStruct
|
||||
from preshed.maps cimport map_get
|
||||
|
||||
from . import _parse_features
|
||||
from ._parse_features cimport CONTEXT_SIZE
|
||||
from ._parse_features cimport fill_context
|
||||
from .stateclass cimport StateClass
|
||||
from ._state cimport StateC
|
||||
from .transition_system import OracleError
|
||||
from .transition_system cimport TransitionSystem, Transition
|
||||
from ..structs cimport TokenC
|
||||
from ..tokens.doc cimport Doc
|
||||
from ..strings cimport StringStore
|
||||
from ..gold cimport GoldParse
|
||||
|
||||
|
||||
USE_FTRL = True
|
||||
DEBUG = False
|
||||
def set_debug(val):
|
||||
global DEBUG
|
||||
DEBUG = val
|
||||
|
||||
|
||||
def get_templates(name):
|
||||
pf = _parse_features
|
||||
if name == 'ner':
|
||||
return pf.ner
|
||||
elif name == 'debug':
|
||||
return pf.unigrams
|
||||
elif name.startswith('embed'):
|
||||
return (pf.words, pf.tags, pf.labels)
|
||||
else:
|
||||
return (pf.unigrams + pf.s0_n0 + pf.s1_n0 + pf.s1_s0 + pf.s0_n1 + pf.n0_n1 + \
|
||||
pf.tree_shape + pf.trigrams)
|
||||
|
||||
|
||||
cdef class ParserModel(AveragedPerceptron):
|
||||
cdef int set_featuresC(self, atom_t* context, FeatureC* features,
|
||||
const StateC* state) nogil:
|
||||
fill_context(context, state)
|
||||
nr_feat = self.extracter.set_features(features, context)
|
||||
return nr_feat
|
||||
|
||||
def update(self, Example eg, itn=0):
|
||||
"""
|
||||
Does regression on negative cost. Sort of cute?
|
||||
"""
|
||||
self.time += 1
|
||||
cdef int best = arg_max_if_gold(eg.c.scores, eg.c.costs, eg.c.nr_class)
|
||||
cdef int guess = eg.guess
|
||||
if guess == best or best == -1:
|
||||
return 0.0
|
||||
cdef FeatureC feat
|
||||
cdef int clas
|
||||
cdef weight_t gradient
|
||||
if USE_FTRL:
|
||||
for feat in eg.c.features[:eg.c.nr_feat]:
|
||||
for clas in range(eg.c.nr_class):
|
||||
if eg.c.is_valid[clas] and eg.c.scores[clas] >= eg.c.scores[best]:
|
||||
gradient = eg.c.scores[clas] + eg.c.costs[clas]
|
||||
self.update_weight_ftrl(feat.key, clas, feat.value * gradient)
|
||||
else:
|
||||
for feat in eg.c.features[:eg.c.nr_feat]:
|
||||
self.update_weight(feat.key, guess, feat.value * eg.c.costs[guess])
|
||||
self.update_weight(feat.key, best, -feat.value * eg.c.costs[guess])
|
||||
return eg.c.costs[guess]
|
||||
|
||||
def update_from_histories(self, TransitionSystem moves, Doc doc, histories, weight_t min_grad=0.0):
|
||||
cdef Pool mem = Pool()
|
||||
features = <FeatureC*>mem.alloc(self.nr_feat, sizeof(FeatureC))
|
||||
|
||||
cdef StateClass stcls
|
||||
|
||||
cdef class_t clas
|
||||
self.time += 1
|
||||
cdef atom_t[CONTEXT_SIZE] atoms
|
||||
histories = [(grad, hist) for grad, hist in histories if abs(grad) >= min_grad and hist]
|
||||
if not histories:
|
||||
return None
|
||||
gradient = [Counter() for _ in range(max([max(h)+1 for _, h in histories]))]
|
||||
for d_loss, history in histories:
|
||||
stcls = StateClass.init(doc.c, doc.length)
|
||||
moves.initialize_state(stcls.c)
|
||||
for clas in history:
|
||||
nr_feat = self.set_featuresC(atoms, features, stcls.c)
|
||||
clas_grad = gradient[clas]
|
||||
for feat in features[:nr_feat]:
|
||||
clas_grad[feat.key] += d_loss * feat.value
|
||||
moves.c[clas].do(stcls.c, moves.c[clas].label)
|
||||
cdef feat_t key
|
||||
cdef weight_t d_feat
|
||||
for clas, clas_grad in enumerate(gradient):
|
||||
for key, d_feat in clas_grad.items():
|
||||
if d_feat != 0:
|
||||
self.update_weight_ftrl(key, clas, d_feat)
|
||||
|
||||
|
||||
cdef class Parser:
|
||||
"""
|
||||
Base class of the DependencyParser and EntityRecognizer.
|
||||
"""
|
||||
@classmethod
|
||||
def load(cls, path, Vocab vocab, TransitionSystem=None, require=False, **cfg):
|
||||
"""
|
||||
Load the statistical model from the supplied path.
|
||||
|
||||
Arguments:
|
||||
path (Path):
|
||||
The path to load from.
|
||||
vocab (Vocab):
|
||||
The vocabulary. Must be shared by the documents to be processed.
|
||||
require (bool):
|
||||
Whether to raise an error if the files are not found.
|
||||
Returns (Parser):
|
||||
The newly constructed object.
|
||||
"""
|
||||
with (path / 'config.json').open() as file_:
|
||||
cfg = ujson.load(file_)
|
||||
# TODO: remove this shim when we don't have to support older data
|
||||
if 'labels' in cfg and 'actions' not in cfg:
|
||||
cfg['actions'] = cfg.pop('labels')
|
||||
# TODO: remove this shim when we don't have to support older data
|
||||
for action_name, labels in dict(cfg.get('actions', {})).items():
|
||||
# We need this to be sorted
|
||||
if isinstance(labels, dict):
|
||||
labels = list(sorted(labels.keys()))
|
||||
cfg['actions'][action_name] = labels
|
||||
self = cls(vocab, TransitionSystem=TransitionSystem, model=None, **cfg)
|
||||
if (path / 'model').exists():
|
||||
self.model.load(str(path / 'model'))
|
||||
elif require:
|
||||
raise IOError(
|
||||
"Required file %s/model not found when loading" % str(path))
|
||||
return self
|
||||
|
||||
def __init__(self, Vocab vocab, TransitionSystem=None, ParserModel model=None, **cfg):
|
||||
"""
|
||||
Create a Parser.
|
||||
|
||||
Arguments:
|
||||
vocab (Vocab):
|
||||
The vocabulary object. Must be shared with documents to be processed.
|
||||
model (thinc.linear.AveragedPerceptron):
|
||||
The statistical model.
|
||||
Returns (Parser):
|
||||
The newly constructed object.
|
||||
"""
|
||||
if TransitionSystem is None:
|
||||
TransitionSystem = self.TransitionSystem
|
||||
self.vocab = vocab
|
||||
cfg['actions'] = TransitionSystem.get_actions(**cfg)
|
||||
self.moves = TransitionSystem(vocab.strings, cfg['actions'])
|
||||
# TODO: Remove this when we no longer need to support old-style models
|
||||
if isinstance(cfg.get('features'), basestring):
|
||||
cfg['features'] = get_templates(cfg['features'])
|
||||
elif 'features' not in cfg:
|
||||
cfg['features'] = self.feature_templates
|
||||
|
||||
self.model = ParserModel(cfg['features'])
|
||||
self.model.l1_penalty = cfg.get('L1', 0.0)
|
||||
self.model.learn_rate = cfg.get('learn_rate', 0.001)
|
||||
|
||||
self.cfg = cfg
|
||||
# TODO: This is a pretty hacky fix to the problem of adding more
|
||||
# labels. The issue is they come in out of order, if labels are
|
||||
# added during training
|
||||
for label in cfg.get('extra_labels', []):
|
||||
self.add_label(label)
|
||||
|
||||
def __reduce__(self):
|
||||
return (Parser, (self.vocab, self.moves, self.model), None, None)
|
||||
|
||||
def __call__(self, Doc tokens):
|
||||
"""
|
||||
Apply the entity recognizer, setting the annotations onto the Doc object.
|
||||
|
||||
Arguments:
|
||||
doc (Doc): The document to be processed.
|
||||
Returns:
|
||||
None
|
||||
"""
|
||||
cdef int nr_feat = self.model.nr_feat
|
||||
with nogil:
|
||||
status = self.parseC(tokens.c, tokens.length, nr_feat)
|
||||
# Check for KeyboardInterrupt etc. Untested
|
||||
PyErr_CheckSignals()
|
||||
if status != 0:
|
||||
raise ParserStateError(tokens)
|
||||
self.moves.finalize_doc(tokens)
|
||||
|
||||
def pipe(self, stream, int batch_size=1000, int n_threads=2):
|
||||
"""
|
||||
Process a stream of documents.
|
||||
|
||||
Arguments:
|
||||
stream: The sequence of documents to process.
|
||||
batch_size (int):
|
||||
The number of documents to accumulate into a working set.
|
||||
n_threads (int):
|
||||
The number of threads with which to work on the buffer in parallel.
|
||||
Yields (Doc): Documents, in order.
|
||||
"""
|
||||
cdef Pool mem = Pool()
|
||||
cdef TokenC** doc_ptr = <TokenC**>mem.alloc(batch_size, sizeof(TokenC*))
|
||||
cdef int* lengths = <int*>mem.alloc(batch_size, sizeof(int))
|
||||
cdef Doc doc
|
||||
cdef int i
|
||||
cdef int nr_feat = self.model.nr_feat
|
||||
cdef int status
|
||||
queue = []
|
||||
for doc in stream:
|
||||
doc_ptr[len(queue)] = doc.c
|
||||
lengths[len(queue)] = doc.length
|
||||
queue.append(doc)
|
||||
if len(queue) == batch_size:
|
||||
with nogil:
|
||||
for i in cython.parallel.prange(batch_size, num_threads=n_threads):
|
||||
status = self.parseC(doc_ptr[i], lengths[i], nr_feat)
|
||||
if status != 0:
|
||||
with gil:
|
||||
raise ParserStateError(queue[i])
|
||||
PyErr_CheckSignals()
|
||||
for doc in queue:
|
||||
self.moves.finalize_doc(doc)
|
||||
yield doc
|
||||
queue = []
|
||||
batch_size = len(queue)
|
||||
with nogil:
|
||||
for i in cython.parallel.prange(batch_size, num_threads=n_threads):
|
||||
status = self.parseC(doc_ptr[i], lengths[i], nr_feat)
|
||||
if status != 0:
|
||||
with gil:
|
||||
raise ParserStateError(queue[i])
|
||||
PyErr_CheckSignals()
|
||||
for doc in queue:
|
||||
self.moves.finalize_doc(doc)
|
||||
yield doc
|
||||
|
||||
cdef int parseC(self, TokenC* tokens, int length, int nr_feat) nogil:
|
||||
state = new StateC(tokens, length)
|
||||
# NB: This can change self.moves.n_moves!
|
||||
# I think this causes memory errors if called by .pipe()
|
||||
self.moves.initialize_state(state)
|
||||
nr_class = self.moves.n_moves
|
||||
|
||||
cdef ExampleC eg
|
||||
eg.nr_feat = nr_feat
|
||||
eg.nr_atom = CONTEXT_SIZE
|
||||
eg.nr_class = nr_class
|
||||
eg.features = <FeatureC*>calloc(sizeof(FeatureC), nr_feat)
|
||||
eg.atoms = <atom_t*>calloc(sizeof(atom_t), CONTEXT_SIZE)
|
||||
eg.scores = <weight_t*>calloc(sizeof(weight_t), nr_class)
|
||||
eg.is_valid = <int*>calloc(sizeof(int), nr_class)
|
||||
cdef int i
|
||||
while not state.is_final():
|
||||
eg.nr_feat = self.model.set_featuresC(eg.atoms, eg.features, state)
|
||||
self.moves.set_valid(eg.is_valid, state)
|
||||
self.model.set_scoresC(eg.scores, eg.features, eg.nr_feat)
|
||||
|
||||
guess = VecVec.arg_max_if_true(eg.scores, eg.is_valid, eg.nr_class)
|
||||
if guess < 0:
|
||||
return 1
|
||||
|
||||
action = self.moves.c[guess]
|
||||
|
||||
action.do(state, action.label)
|
||||
memset(eg.scores, 0, sizeof(eg.scores[0]) * eg.nr_class)
|
||||
for i in range(eg.nr_class):
|
||||
eg.is_valid[i] = 1
|
||||
self.moves.finalize_state(state)
|
||||
for i in range(length):
|
||||
tokens[i] = state._sent[i]
|
||||
del state
|
||||
free(eg.features)
|
||||
free(eg.atoms)
|
||||
free(eg.scores)
|
||||
free(eg.is_valid)
|
||||
return 0
|
||||
|
||||
def update(self, Doc tokens, GoldParse gold, itn=0, double drop=0.0):
|
||||
"""
|
||||
Update the statistical model.
|
||||
|
||||
Arguments:
|
||||
doc (Doc):
|
||||
The example document for the update.
|
||||
gold (GoldParse):
|
||||
The gold-standard annotations, to calculate the loss.
|
||||
Returns (float):
|
||||
The loss on this example.
|
||||
"""
|
||||
self.moves.preprocess_gold(gold)
|
||||
cdef StateClass stcls = StateClass.init(tokens.c, tokens.length)
|
||||
self.moves.initialize_state(stcls.c)
|
||||
cdef Pool mem = Pool()
|
||||
cdef Example eg = Example(
|
||||
nr_class=self.moves.n_moves,
|
||||
nr_atom=CONTEXT_SIZE,
|
||||
nr_feat=self.model.nr_feat)
|
||||
cdef weight_t loss = 0
|
||||
cdef Transition action
|
||||
cdef double dropout_rate = self.cfg.get('dropout', drop)
|
||||
while not stcls.is_final():
|
||||
eg.c.nr_feat = self.model.set_featuresC(eg.c.atoms, eg.c.features,
|
||||
stcls.c)
|
||||
dropout(eg.c.features, eg.c.nr_feat, dropout_rate)
|
||||
self.moves.set_costs(eg.c.is_valid, eg.c.costs, stcls, gold)
|
||||
self.model.set_scoresC(eg.c.scores, eg.c.features, eg.c.nr_feat)
|
||||
guess = VecVec.arg_max_if_true(eg.c.scores, eg.c.is_valid, eg.c.nr_class)
|
||||
self.model.update(eg)
|
||||
|
||||
action = self.moves.c[guess]
|
||||
action.do(stcls.c, action.label)
|
||||
loss += eg.costs[guess]
|
||||
eg.fill_scores(0, eg.c.nr_class)
|
||||
eg.fill_costs(0, eg.c.nr_class)
|
||||
eg.fill_is_valid(1, eg.c.nr_class)
|
||||
|
||||
self.moves.finalize_state(stcls.c)
|
||||
return loss
|
||||
|
||||
    def step_through(self, Doc doc, GoldParse gold=None):
        """
        Set up a stepwise state, to introspect and control the transition
        sequence.

        Arguments:
            doc (Doc): The document to step through.
            gold (GoldParse): Optional gold parse.
        Returns (StepwiseState):
            A state object, to step through the annotation process.
        """
        return StepwiseState(self, doc, gold=gold)

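    # Hedged usage sketch (`parser` and `doc` are assumed to exist): the
    # returned StepwiseState (defined below) is a context manager, and on
    # exit finish() writes the current analysis back to the doc:
    #
    #     with parser.step_through(doc) as state:
    #         while not state.is_final:
    #             name = state.predict()    # highest-scoring valid move
    #             state.transition(name)    # or pass 'S', 'D', 'L' or 'R'
    #     print(state.heads, state.deps)
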
    def from_transition_sequence(self, Doc doc, sequence):
        """Control the annotations on a document by specifying a transition
        sequence to follow.

        Arguments:
            doc (Doc): The document to annotate.
            sequence: A sequence of action names, as unicode strings.
        Returns: None
        """
        with self.step_through(doc) as stepwise:
            for transition in sequence:
                stepwise.transition(transition)

    def add_label(self, label):
        # Doesn't set label into serializer -- subclasses override it to do that.
        for action in self.moves.action_types:
            added = self.moves.add_action(action, label)
            if added:
                # Important that the labels be stored as a list! We need the
                # order, or the model goes out of sync.
                self.cfg.setdefault('extra_labels', []).append(label)

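    # Hedged sketch, mirroring the test fixtures later in this commit:
    # labels are added before the model is created, so the transition
    # system and the model's class count stay in sync.
    #
    #     parser = DependencyParser(en_vocab)
    #     parser.add_label('nsubj')
    #     parser.model, cfg = parser.Model(parser.moves.n_moves)
    #     parser.cfg.update(cfg)
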
cdef int dropout(FeatureC* feats, int nr_feat, float prob) except -1:
    if prob <= 0 or prob >= 1.:
        return 0
    cdef double[::1] py_probs = numpy.random.uniform(0., 1., nr_feat)
    cdef double* probs = &py_probs[0]
    for i in range(nr_feat):
        if probs[i] >= prob:
            # Inverted dropout: rescale survivors by the keep probability,
            # so the expected feature value is unchanged.
            feats[i].value /= (1. - prob)
        else:
            feats[i].value = 0.

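# A runnable numpy illustration of the inverted-dropout scaling used in
# dropout() above (illustrative names, not spaCy API): drop each value
# with probability `prob`, rescale survivors by the keep probability.
import numpy

def dropout_values(values, prob):
    if prob <= 0.0 or prob >= 1.0:
        return values
    keep = numpy.random.uniform(0.0, 1.0, values.shape) >= prob
    return numpy.where(keep, values / (1.0 - prob), 0.0)

print(dropout_values(numpy.ones(8), 0.5))   # mean stays ~1.0 in expectation
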
cdef class StepwiseState:
    cdef readonly StateClass stcls
    cdef readonly Example eg
    cdef readonly Doc doc
    cdef readonly GoldParse gold
    cdef readonly Parser parser

    def __init__(self, Parser parser, Doc doc, GoldParse gold=None):
        self.parser = parser
        self.doc = doc
        if gold is not None:
            self.gold = gold
            self.parser.moves.preprocess_gold(self.gold)
        else:
            self.gold = GoldParse(doc)
        self.stcls = StateClass.init(doc.c, doc.length)
        self.parser.moves.initialize_state(self.stcls.c)
        self.eg = Example(
            nr_class=self.parser.moves.n_moves,
            nr_atom=CONTEXT_SIZE,
            nr_feat=self.parser.model.nr_feat)

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        self.finish()

    @property
    def is_final(self):
        return self.stcls.is_final()

    @property
    def stack(self):
        return self.stcls.stack

    @property
    def queue(self):
        return self.stcls.queue

    @property
    def heads(self):
        return [self.stcls.H(i) for i in range(self.stcls.c.length)]

    @property
    def deps(self):
        return [self.doc.vocab.strings[self.stcls.c._sent[i].dep]
                for i in range(self.stcls.c.length)]

    @property
    def costs(self):
        """
        Find the action-costs for the current state.
        """
        if not self.gold:
            raise ValueError("Can't set costs: No GoldParse provided")
        self.parser.moves.set_costs(self.eg.c.is_valid, self.eg.c.costs,
                                    self.stcls, self.gold)
        costs = {}
        for i in range(self.parser.moves.n_moves):
            if not self.eg.c.is_valid[i]:
                continue
            transition = self.parser.moves.c[i]
            name = self.parser.moves.move_name(transition.move,
                                               transition.label)
            costs[name] = self.eg.c.costs[i]
        return costs

    def predict(self):
        self.eg.reset()
        self.eg.c.nr_feat = self.parser.model.set_featuresC(
            self.eg.c.atoms, self.eg.c.features, self.stcls.c)
        self.parser.moves.set_valid(self.eg.c.is_valid, self.stcls.c)
        self.parser.model.set_scoresC(self.eg.c.scores,
                                      self.eg.c.features, self.eg.c.nr_feat)

        cdef Transition action = self.parser.moves.c[self.eg.guess]
        return self.parser.moves.move_name(action.move, action.label)

    def transition(self, action_name=None):
        if action_name is None:
            action_name = self.predict()
        moves = {'S': 0, 'D': 1, 'L': 2, 'R': 3}
        if action_name == '_':
            action_name = self.predict()
            action = self.parser.moves.lookup_transition(action_name)
        elif action_name == 'L' or action_name == 'R':
            self.predict()
            move = moves[action_name]
            clas = _arg_max_clas(self.eg.c.scores, move, self.parser.moves.c,
                                 self.eg.c.nr_class)
            action = self.parser.moves.c[clas]
        else:
            action = self.parser.moves.lookup_transition(action_name)
        action.do(self.stcls.c, action.label)

    def finish(self):
        if self.stcls.is_final():
            self.parser.moves.finalize_state(self.stcls.c)
        self.doc.set_parse(self.stcls.c._sent)
        self.parser.moves.finalize_doc(self.doc)

class ParserStateError(ValueError):
    def __init__(self, doc):
        ValueError.__init__(self,
            "Error analysing doc -- no valid actions available. This should "
            "never happen, so please report the error on the issue tracker. "
            "Here's the thread to do so --- reopen it if it's closed:\n"
            "https://github.com/spacy-io/spaCy/issues/429\n"
            "Please include the text that the parser failed on, which is:\n"
            "%s" % repr(doc.text))

cdef int arg_max_if_gold(const weight_t* scores, const weight_t* costs, int n) nogil:
    cdef int best = -1
    cdef int i
    for i in range(n):
        if costs[i] <= 0:
            if best == -1 or scores[i] > scores[best]:
                best = i
    return best

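# A runnable pure-Python rendering of arg_max_if_gold above: among the
# zero-cost (gold-consistent) moves, return the one the model currently
# scores highest. Names here are illustrative, not spaCy API.
def arg_max_if_gold_py(scores, costs):
    best = -1
    for i, cost in enumerate(costs):
        if cost <= 0 and (best == -1 or scores[i] > scores[best]):
            best = i
    return best

assert arg_max_if_gold_py([0.2, 0.9, 0.4], [0, 1, 0]) == 2
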
cdef int _arg_max_clas(const weight_t* scores, int move, const Transition* actions,
                       int nr_class) except -1:
    cdef weight_t score = 0
    cdef int mode = -1
    cdef int i
    for i in range(nr_class):
        if actions[i].move == move and (mode == -1 or scores[i] >= score):
            mode = i
            score = scores[i]
    return mode

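# A minimal, runnable pure-Python sketch of the greedy decoding loop that
# Parser.parseC above implements: score the moves, mask out invalid ones,
# apply the argmax, and repeat until the state is final. The toy
# arc-standard state and random scorer here are stand-ins, not spaCy API.
import random

ACTIONS = ('SHIFT', 'LEFT', 'RIGHT')

def _valid(state, action):
    # SHIFT needs a non-empty buffer; LEFT/RIGHT attach the top two stack items.
    if action == 'SHIFT':
        return bool(state['buffer'])
    return len(state['stack']) >= 2

def _apply(state, action):
    if action == 'SHIFT':
        state['stack'].append(state['buffer'].pop(0))
    elif action == 'LEFT':
        dep = state['stack'].pop(-2)      # second-from-top becomes a dependent
        state['arcs'].append((state['stack'][-1], dep))
    else:                                 # RIGHT: top becomes a dependent
        dep = state['stack'].pop()
        state['arcs'].append((state['stack'][-1], dep))

def parse_greedy(n_words, score=lambda state, action: random.random()):
    state = {'stack': [], 'buffer': list(range(n_words)), 'arcs': []}
    while state['buffer'] or len(state['stack']) > 1:
        choices = [a for a in ACTIONS if _valid(state, a)]
        if not choices:                   # mirrors the `guess < 0` error path
            raise ValueError('no valid actions')
        _apply(state, max(choices, key=lambda a: score(state, a)))
    return state['arcs']

print(parse_greedy(4))                    # e.g. [(1, 0), (2, 1), (2, 3)]
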
@@ -10,7 +10,8 @@ import pytest
 def test_doc_add_entities_set_ents_iob(en_vocab):
     text = ["This", "is", "a", "lion"]
     doc = get_doc(en_vocab, text)
-    ner = EntityRecognizer(en_vocab, features=[(2,), (3,)])
+    ner = EntityRecognizer(en_vocab)
+    ner.begin_training([])
     ner(doc)

     assert len(list(doc.ents)) == 0

@@ -9,7 +9,7 @@ from ...attrs import NORM
 from ...gold import GoldParse
 from ...vocab import Vocab
 from ...tokens import Doc
-from ...pipeline import NeuralDependencyParser
+from ...pipeline import DependencyParser

 numpy.random.seed(0)

@@ -21,7 +21,7 @@ def vocab():

 @pytest.fixture
 def parser(vocab):
-    parser = NeuralDependencyParser(vocab)
+    parser = DependencyParser(vocab)
     parser.cfg['token_vector_width'] = 8
     parser.cfg['hidden_width'] = 30
     parser.cfg['hist_size'] = 0

@@ -6,7 +6,7 @@ import numpy

 from ..._ml import chain, Tok2Vec, doc2feats
 from ...vocab import Vocab
-from ...pipeline import TokenVectorEncoder
+from ...pipeline import Tensorizer
 from ...syntax.arc_eager import ArcEager
 from ...syntax.nn_parser import Parser
 from ...tokens.doc import Doc

@@ -8,7 +8,7 @@ from ...attrs import NORM
 from ...gold import GoldParse
 from ...vocab import Vocab
 from ...tokens import Doc
-from ...pipeline import NeuralDependencyParser
+from ...pipeline import DependencyParser

 @pytest.fixture
 def vocab():

@@ -16,7 +16,7 @@ def vocab():

 @pytest.fixture
 def parser(vocab):
-    parser = NeuralDependencyParser(vocab)
+    parser = DependencyParser(vocab)
     parser.cfg['token_vector_width'] = 4
     parser.cfg['hidden_width'] = 32
     #parser.add_label('right')

@@ -1,11 +1,11 @@
 import pytest

-from ...pipeline import NeuralDependencyParser
+from ...pipeline import DependencyParser


 @pytest.fixture
 def parser(en_vocab):
-    parser = NeuralDependencyParser(en_vocab)
+    parser = DependencyParser(en_vocab)
     parser.add_label('nsubj')
     parser.model, cfg = parser.Model(parser.moves.n_moves)
     parser.cfg.update(cfg)

@@ -14,7 +14,7 @@ def parser(en_vocab):

 @pytest.fixture
 def blank_parser(en_vocab):
-    parser = NeuralDependencyParser(en_vocab)
+    parser = DependencyParser(en_vocab)
     return parser

@@ -2,8 +2,8 @@
 from __future__ import unicode_literals

 from ..util import make_tempdir
-from ...pipeline import NeuralDependencyParser as DependencyParser
-from ...pipeline import NeuralEntityRecognizer as EntityRecognizer
+from ...pipeline import DependencyParser
+from ...pipeline import EntityRecognizer

 import pytest

@@ -2,7 +2,7 @@
 from __future__ import unicode_literals

 from ..util import make_tempdir
-from ...pipeline import NeuralTagger as Tagger
+from ...pipeline import Tagger

 import pytest

@@ -2,7 +2,7 @@
 from __future__ import unicode_literals

 from ..util import make_tempdir
-from ...pipeline import TokenVectorEncoder as Tensorizer
+from ...pipeline import Tensorizer

 import pytest

@@ -16,7 +16,6 @@ from .lexeme cimport EMPTY_LEXEME
 from .lexeme cimport Lexeme
 from .strings cimport hash_string
 from .typedefs cimport attr_t
-from .cfile cimport CFile
 from .tokens.token cimport Token
 from .attrs cimport PROB, LANG
 from .structs cimport SerializedLexemeC