mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 10:16:27 +03:00
* Tmp
This commit is contained in:
parent
d11c1edf8c
commit
e1c1a4b868
135
spacy/en.pxd
135
spacy/en.pxd
|
@ -1,135 +0,0 @@
|
||||||
from thinc.typedefs cimport atom_t
|
|
||||||
|
|
||||||
from .lang cimport Language
|
|
||||||
from .tokens cimport Tokens
|
|
||||||
from .tokens cimport TokenC
|
|
||||||
|
|
||||||
|
|
||||||
# Grammatical person values. NO_PERSON is listed first, so it has the value 0.
cpdef enum en_person_t:
    NO_PERSON
    FIRST
    SECOND
    THIRD
    NON_THIRD
|
|
||||||
|
|
||||||
|
|
||||||
# Grammatical number values. NO_NUMBER is listed first, so it has the value 0.
cpdef enum en_number_t:
    NO_NUMBER
    SINGULAR
    PLURAL
    MASS
|
|
||||||
|
|
||||||
|
|
||||||
# Grammatical gender values. NO_GENDER is listed first, so it has the value 0.
cpdef enum en_gender_t:
    NO_GENDER
    MASCULINE
    FEMININE
    NEUTER
|
|
||||||
|
|
||||||
|
|
||||||
# Case values. NO_CASE is listed first, so it has the value 0.
cpdef enum en_case_t:
    NO_CASE
    NOMINATIVE
    GENITIVE
    ACCUSATIVE
    REFLEXIVE
    DEMONYM
|
|
||||||
|
|
||||||
|
|
||||||
# Combined tense/aspect values. NO_TENSE is listed first, so it has the value 0.
cpdef enum en_tenspect_t:
    NO_TENSE
    BASE_VERB
    PRESENT
    PAST
    PASSIVE
    ING
    MODAL
|
|
||||||
|
|
||||||
|
|
||||||
# Miscellaneous morphological values. NO_MISC is listed first, so it has the
# value 0.
cpdef enum misc_t:
    NO_MISC
    COMPARATIVE
    SUPERLATIVE
    RELATIVE
    NAME
|
|
||||||
|
|
||||||
|
|
||||||
# Flags
|
|
||||||
# Bit positions of the boolean lexeme flags. English.set_flags shifts each
# orth predicate result left by the corresponding member to build the flags
# bit field, so the declaration order fixes the bit layout.
cpdef enum FlagID:
    IS_ALPHA
    IS_ASCII
    IS_DIGIT
    IS_LOWER
    IS_PUNCT
    IS_SPACE
    IS_TITLE
    IS_UPPER

    LIKE_URL
    LIKE_NUMBER

    OFT_LOWER
    OFT_TITLE
    OFT_UPPER

    IN_MALES
    IN_FEMALES
    IN_SURNAMES
    IN_PLACES
    IN_GAMES
    IN_CELEBS
    IN_NAMES
|
|
||||||
|
|
||||||
|
|
||||||
# Indices into the POS tagger's context array. There is one group of eight
# fields per token slot (P2, P1, W, N1, N2); within a group the order must
# match the order in which _fill_from_token writes context[0]..context[7].
# N_CONTEXT_FIELDS comes last, so it equals the total number of fields.
cpdef enum:
    P2_sic
    P2_cluster
    P2_shape
    P2_prefix
    P2_suffix
    P2_pos
    P2_lemma
    P2_pos_type

    P1_sic
    P1_cluster
    P1_shape
    P1_prefix
    P1_suffix
    P1_pos
    P1_lemma
    P1_pos_type

    W_sic
    W_cluster
    W_shape
    W_prefix
    W_suffix
    W_pos
    W_lemma
    W_pos_type

    N1_sic
    N1_cluster
    N1_shape
    N1_prefix
    N1_suffix
    N1_pos
    N1_lemma
    N1_pos_type

    N2_sic
    N2_cluster
    N2_shape
    N2_prefix
    N2_suffix
    N2_pos
    N2_lemma
    N2_pos_type

    N_CONTEXT_FIELDS
|
|
||||||
|
|
||||||
|
|
||||||
# C-level declaration of the English language class; the method bodies live
# in en.pyx. Both methods return an int and use -1 as the error sentinel.
cdef class English(Language):
    cdef int is_base_np_end(self, const TokenC* token) except -1
    cdef int is_outside_base_np(self, const TokenC* token) except -1
|
|
213
spacy/en.pyx
213
spacy/en.pyx
|
@ -1,213 +0,0 @@
|
||||||
# cython: profile=True
|
|
||||||
# cython: embedsignature=True
|
|
||||||
'''Tokenize English text, using a scheme that differs from the Penn Treebank 3
|
|
||||||
scheme in several important respects:
|
|
||||||
|
|
||||||
* Whitespace is added as tokens, except for single spaces. e.g.,
|
|
||||||
|
|
||||||
>>> [w.string for w in EN.tokenize(u'\\nHello \\tThere')]
|
|
||||||
[u'\\n', u'Hello', u' ', u'\\t', u'There']
|
|
||||||
|
|
||||||
* Contractions are normalized, e.g.
|
|
||||||
|
|
||||||
>>> [w.string for w in EN.tokenize(u"isn't ain't won't he's")]
|
|
||||||
[u'is', u'not', u'are', u'not', u'will', u'not', u'he', u"__s"]
|
|
||||||
|
|
||||||
* Hyphenated words are split, with the hyphen preserved, e.g.:
|
|
||||||
|
|
||||||
>>> [w.string for w in EN.tokenize(u'New York-based')]
|
|
||||||
[u'New', u'York', u'-', u'based']
|
|
||||||
|
|
||||||
Other improvements:
|
|
||||||
|
|
||||||
* Email addresses, URLs, European-formatted dates and other numeric entities not
|
|
||||||
found in the PTB are tokenized correctly
|
|
||||||
* Heuristic handling of word-final periods (PTB expects sentence boundary detection
|
|
||||||
as a pre-process before tokenization.)
|
|
||||||
|
|
||||||
Take care to ensure your training and run-time data is tokenized according to the
|
|
||||||
same scheme. Tokenization problems are a major cause of poor performance for
|
|
||||||
NLP tools. If you're using a pre-trained model, the :py:mod:`spacy.ptb3` module
|
|
||||||
provides a fully Penn Treebank 3-compliant tokenizer.
|
|
||||||
'''
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
from murmurhash.mrmr cimport hash64
|
|
||||||
|
|
||||||
cimport lang
|
|
||||||
from .typedefs cimport hash_t, id_t, flags_t
|
|
||||||
import orth
|
|
||||||
from .morphology cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT, VERB
|
|
||||||
from .morphology cimport X, PUNCT, EOL
|
|
||||||
|
|
||||||
from .tokens cimport Morphology
|
|
||||||
|
|
||||||
|
|
||||||
# Compile-time constant (Cython DEF).
# NOTE(review): no use of this constant is visible in this module -- confirm
# whether it is still needed.
DEF USE_POS_CACHE = True
|
|
||||||
|
|
||||||
|
|
||||||
# Maps each fine-grained tag string to a pair: (coarse tag constant, dict of
# morphological properties). The property keys ('misc', 'number', 'case',
# 'tenspect', 'person') take values from the enums declared in en.pxd.
POS_TAGS = {
    'NULL': (NO_TAG, {}),
    'EOL': (EOL, {}),

    # Word-class tags.
    'CC': (CONJ, {}),
    'CD': (NUM, {}),
    'DT': (DET, {}),
    'EX': (DET, {}),
    'FW': (X, {}),
    'IN': (ADP, {}),
    'JJ': (ADJ, {}),
    'JJR': (ADJ, {'misc': COMPARATIVE}),
    'JJS': (ADJ, {'misc': SUPERLATIVE}),
    'LS': (X, {}),
    'MD': (VERB, {'tenspect': MODAL}),
    'NN': (NOUN, {}),
    'NNS': (NOUN, {'number': PLURAL}),
    'NNP': (NOUN, {'misc': NAME}),
    'NNPS': (NOUN, {'misc': NAME, 'number': PLURAL}),
    'PDT': (DET, {}),
    'POS': (PRT, {'case': GENITIVE}),
    'PRP': (NOUN, {}),
    'PRP$': (NOUN, {'case': GENITIVE}),
    'RB': (ADV, {}),
    'RBR': (ADV, {'misc': COMPARATIVE}),
    'RBS': (ADV, {'misc': SUPERLATIVE}),
    'RP': (PRT, {}),
    'SYM': (X, {}),
    'TO': (PRT, {}),
    'UH': (X, {}),
    'VB': (VERB, {}),
    'VBD': (VERB, {'tenspect': PAST}),
    'VBG': (VERB, {'tenspect': ING}),
    'VBN': (VERB, {'tenspect': PASSIVE}),
    'VBP': (VERB, {'tenspect': PRESENT}),
    'VBZ': (VERB, {'tenspect': PRESENT, 'person': THIRD}),
    'WDT': (DET, {'misc': RELATIVE}),
    'WP': (PRON, {'misc': RELATIVE}),
    'WP$': (PRON, {'misc': RELATIVE, 'case': GENITIVE}),
    'WRB': (ADV, {'misc': RELATIVE}),

    # Punctuation and symbol tags.
    '!': (PUNCT, {}),
    '#': (PUNCT, {}),
    '$': (PUNCT, {}),
    "''": (PUNCT, {}),
    "(": (PUNCT, {}),
    ")": (PUNCT, {}),
    "-LRB-": (PUNCT, {}),
    "-RRB-": (PUNCT, {}),
    ".": (PUNCT, {}),
    ",": (PUNCT, {}),
    "``": (PUNCT, {}),
    ":": (PUNCT, {}),
    "?": (PUNCT, {}),
}
|
|
||||||
|
|
||||||
|
|
||||||
# Feature templates for the POS tagger. Each template is a tuple of indices
# into the context array (see the context-field enum in en.pxd).
POS_TEMPLATES = (
    (W_sic,),
    (P1_lemma, P1_pos),
    (P2_lemma, P2_pos),
    (N1_sic,),
    (N2_sic,),

    (W_suffix,),
    (W_prefix,),

    (P1_pos,),
    (P2_pos,),
    (P1_pos, P2_pos),
    (P1_pos, W_sic),
    (P1_suffix,),
    (N1_suffix,),

    (W_shape,),
    (W_cluster,),
    (N1_cluster,),
    (N2_cluster,),
    (P1_cluster,),
    (P2_cluster,),

    (W_pos_type,),
    (N1_pos_type,),
    # NOTE(review): the next template repeats the previous one; one of the
    # two was possibly meant to use a different token slot. Left unchanged
    # because altering the template set changes the model's features.
    (N1_pos_type,),
    (P1_pos, W_pos_type, N1_pos_type),
)
|
|
||||||
|
|
||||||
|
|
||||||
cdef class English(Language):
    """English tokenizer, tightly coupled to lexicon.

    Attributes:
        name (unicode): The two letter code used by Wikipedia for the language.
        lexicon (Lexicon): The lexicon. Exposes the lookup method.
    """
    def get_props(self, unicode string):
        """Return the lexical properties computed for `string`.

        The result is a dict with two keys: 'flags' (the bit field built by
        set_flags) and 'dense' (the result of orth.word_shape).
        """
        return {'flags': self.set_flags(string), 'dense': orth.word_shape(string)}

    def set_flags(self, unicode string):
        """Build the boolean-flag bit field for `string`.

        Each orth predicate result is shifted to the bit position named by
        the matching FlagID member and OR-ed into the result.
        """
        cdef flags_t flags = 0
        flags |= orth.is_alpha(string) << IS_ALPHA
        flags |= orth.is_ascii(string) << IS_ASCII
        flags |= orth.is_digit(string) << IS_DIGIT
        flags |= orth.is_lower(string) << IS_LOWER
        flags |= orth.is_punct(string) << IS_PUNCT
        flags |= orth.is_space(string) << IS_SPACE
        flags |= orth.is_title(string) << IS_TITLE
        flags |= orth.is_upper(string) << IS_UPPER

        flags |= orth.like_url(string) << LIKE_URL
        flags |= orth.like_number(string) << LIKE_NUMBER
        return flags

    def set_pos(self, Tokens tokens):
        """Predict a POS tag for every token in `tokens`, in place.

        For each position the context features are filled, the tagger's
        prediction is stored in the token's `pos` field, and the
        morphologizer is asked to set the token's morphology.
        """
        cdef int i
        cdef atom_t[N_CONTEXT_FIELDS] context
        cdef TokenC* t = tokens.data
        assert self.morphologizer is not None
        for i in range(tokens.length):
            fill_pos_context(context, i, t)
            t[i].pos = self.pos_tagger.predict(context)
            self.morphologizer.set_morph(i, t)

    def train_pos(self, Tokens tokens, golds):
        """Run one training pass of the POS tagger over `tokens`.

        `golds` is indexed by token position; golds[i] is passed to the
        tagger alongside the context for token i. Returns the number of
        tokens whose predicted tag equals golds[i].
        """
        cdef int i
        cdef atom_t[N_CONTEXT_FIELDS] context
        c = 0
        cdef TokenC* t = tokens.data
        for i in range(tokens.length):
            fill_pos_context(context, i, t)
            t[i].pos = self.pos_tagger.predict(context, [golds[i]])
            self.morphologizer.set_morph(i, t)
            c += t[i].pos == golds[i]
        return c

    cdef int is_base_np_end(self, const TokenC* token) except -1:
        # Placeholder: no logic is implemented yet.
        pass

    cdef int is_outside_base_np(self, const TokenC* token) except -1:
        # Placeholder: no logic is implemented yet.
        pass
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
cdef int fill_pos_context(atom_t* context, const int i, const TokenC* tokens) except -1:
    """Fill `context` with the features of the five-token window around i.

    The window covers tokens[i-2] through tokens[i+2]; each token's fields
    are written at the offset of its slot (P2, P1, W, N1, N2).
    NOTE(review): this reads two entries before and after position i, so the
    token buffer is presumably padded on both sides -- confirm against Tokens.
    """
    # Previous two tokens.
    _fill_from_token(&context[P2_sic], &tokens[i-2])
    _fill_from_token(&context[P1_sic], &tokens[i-1])
    # Current token.
    _fill_from_token(&context[W_sic], &tokens[i])
    # Next two tokens.
    _fill_from_token(&context[N1_sic], &tokens[i+1])
    _fill_from_token(&context[N2_sic], &tokens[i+2])
    return 0
|
|
||||||
|
|
||||||
|
|
||||||
cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
    # Writes the eight per-token features into context[0..7]. The order has
    # to match the field order of each token group in the context enum
    # (sic, cluster, shape, prefix, suffix, pos, lemma, pos_type).
    # Lexeme-level attributes:
    context[0] = t.lex.sic
    context[1] = t.lex.cluster
    context[2] = t.lex.shape
    context[3] = t.lex.prefix
    context[4] = t.lex.suffix
    # Token-level attributes:
    context[5] = t.pos
    context[6] = t.lemma
    # Lexeme-level again:
    context[7] = t.lex.pos_type
|
|
||||||
|
|
||||||
|
|
||||||
# Module-level English instance, constructed with the name 'en' when the
# module is imported.
EN = English('en')
|
|
|
@ -1,44 +0,0 @@
|
||||||
from libcpp.vector cimport vector
|
|
||||||
from libcpp.pair cimport pair
|
|
||||||
|
|
||||||
from preshed.counter cimport count_t
|
|
||||||
from preshed.maps cimport PreshMap
|
|
||||||
from preshed.counter cimport PreshCounter
|
|
||||||
from cymem.cymem cimport Pool
|
|
||||||
|
|
||||||
from .lang cimport Lexicon
|
|
||||||
from .tokens cimport Tokens, TokenC
|
|
||||||
from .typedefs cimport id_t
|
|
||||||
from .lexeme cimport attr_id_t
|
|
||||||
from .typedefs cimport attr_t
|
|
||||||
from .typedefs cimport hash_t
|
|
||||||
|
|
||||||
from murmurhash.mrmr cimport hash64
|
|
||||||
|
|
||||||
|
|
||||||
# A vector of (id, count) pairs; Index.count builds one of these per call.
ctypedef vector[pair[id_t, count_t]] count_vector_t
|
|
||||||
|
|
||||||
|
|
||||||
# C-level declaration of Index; the implementation lives in index.pyx.
cdef class Index:
    # Which lexeme attribute is counted.
    cdef attr_id_t attr_id
    # Largest attribute value seen so far (readable from Python).
    cdef readonly attr_t max_value
    # One count vector is appended per call to count().
    cdef vector[count_vector_t] counts

    cpdef int count(self, Tokens tokens) except -1
|
|
||||||
|
|
||||||
|
|
||||||
# C-level declaration of DecisionMemory; the implementation lives in index.pyx.
cdef class DecisionMemory:
    cdef int n_classes
    cdef Pool mem
    # Frequency of each context key.
    cdef PreshCounter _counts
    # Frequency of each hashed (context key, class) pair.
    cdef PreshCounter _class_counts
    # Memoised decisions; load() stores class + 1 under each context key.
    cdef PreshMap memos
    cdef list class_names

    cdef int inc(self, hash_t context_key, hash_t clas, count_t inc) except -1
    cdef int find_best_class(self, count_t* counts, hash_t context_key) except -1

    cdef inline int get(self, hash_t context_key) nogil:
        # Undo the +1 offset applied by load().
        # NOTE(review): presumably a missing key yields NULL (0) here, which
        # makes the result -1 -- confirm against PreshMap.get.
        return <int><size_t>self.memos.get(context_key) - 1
|
|
||||||
|
|
||||||
|
|
120
spacy/index.pyx
120
spacy/index.pyx
|
@ -1,120 +0,0 @@
|
||||||
"""Create a term-document matrix"""
|
|
||||||
cimport cython
|
|
||||||
from libc.stdint cimport int64_t
|
|
||||||
from libc.string cimport memmove
|
|
||||||
|
|
||||||
from cymem.cymem cimport Address
|
|
||||||
|
|
||||||
from .lexeme cimport Lexeme, get_attr
|
|
||||||
from .tokens cimport TokenC
|
|
||||||
from .typedefs cimport hash_t
|
|
||||||
|
|
||||||
from preshed.maps cimport MapStruct, Cell, map_get, map_set, map_init
|
|
||||||
from murmurhash.mrmr cimport hash64
|
|
||||||
|
|
||||||
|
|
||||||
cdef class Index:
    """Collect per-call counts of one lexeme attribute over token sequences."""
    def __init__(self, attr_id_t attr_id):
        self.attr_id = attr_id
        self.max_value = 0

    cpdef int count(self, Tokens tokens) except -1:
        """Count the attribute values in `tokens` and store the result.

        The counts for this call are appended to self.counts as a vector of
        (value, frequency) pairs; self.max_value is raised when a larger
        attribute value is seen.
        """
        cdef PreshCounter term_freqs = PreshCounter(2 ** 8)
        cdef attr_id_t attr_id = self.attr_id
        cdef attr_t term
        cdef count_t freq
        cdef count_vector_t doc_counts
        cdef int idx
        # First pass: tally every token's attribute value.
        for idx in range(tokens.length):
            term = get_attr(tokens.data[idx].lex, attr_id)
            term_freqs.inc(term, 1)
            if term > self.max_value:
                self.max_value = term
        # Second pass: flatten the counter into a vector of pairs.
        for term, freq in term_freqs:
            doc_counts.push_back(pair[id_t, count_t](term, freq))
        self.counts.push_back(doc_counts)
|
|
||||||
|
|
||||||
|
|
||||||
cdef class DecisionMemory:
    """Count how often each class is chosen in each context, and memoise
    the decisions loaded from a file.

    Contexts are identified by a hash key. Per-class counts are stored under
    the hash of the (context key, class index) pair.
    """
    def __init__(self, class_names):
        self.class_names = class_names
        self.n_classes = len(class_names)
        self.mem = Pool()
        self._counts = PreshCounter()
        self._class_counts = PreshCounter()
        self.memos = PreshMap()

    def load(self, loc, thresh=50):
        """Load memoised decisions from the file at `loc`.

        Each line holds three whitespace-separated integers: frequency,
        context key and class. An entry is kept when `thresh` is 0 or its
        frequency is at least `thresh`. The class is stored with a +1
        offset, which `get` removes again.
        """
        cdef:
            count_t freq
            hash_t key
            int clas
        # The with-block closes the file even if a line fails to parse.
        with open(loc) as file_:
            for line in file_:
                freq, key, clas = [int(p) for p in line.split()]
                if thresh == 0 or freq >= thresh:
                    self.memos.set(key, <void*>(clas+1))

    def __getitem__(self, ids):
        """Return a dict mapping each class name to its count in a context.

        `ids` must supply two ids (ids[0] and ids[1]); together they are
        hashed into the context key.
        """
        cdef id_t[2] context
        # Read the ids from the argument. The previous code assigned the
        # uninitialised array to itself and ignored `ids`.
        context[0] = ids[0]
        context[1] = ids[1]
        cdef hash_t context_key = hash64(context, 2 * sizeof(id_t), 0)
        cdef hash_t[2] class_context
        class_context[0] = context_key
        counts = {}
        cdef id_t i
        for i, clas in enumerate(self.class_names):
            class_context[1] = <hash_t>i
            # Same (context key, class) hashing as inc().
            key = hash64(class_context, sizeof(hash_t) * 2, 0)
            count = self._class_counts[key]
            counts[clas] = count
        return counts

    @cython.cdivision(True)
    def iter_contexts(self, float min_acc=0.99, count_t min_freq=10):
        """Yield (best class count, context key, best class) triples.

        A context is yielded when its frequency is at least `min_freq` and
        the share of its most frequent class is at least `min_acc`.
        """
        cdef Address counts_addr = Address(self.n_classes, sizeof(count_t))
        cdef count_t* counts = <count_t*>counts_addr.ptr
        cdef MapStruct* context_counts = self._counts.c_map
        cdef hash_t context_key
        cdef count_t context_freq
        cdef int best_class
        cdef float acc

        cdef int i
        for i in range(context_counts.length):
            context_key = context_counts.cells[i].key
            context_freq = <count_t>context_counts.cells[i].value
            # Key 0 is skipped.
            if context_key != 0 and context_freq >= min_freq:
                best_class = self.find_best_class(counts, context_key)
                # Cast before dividing: both operands are C integers, so
                # without the cast the quotient is truncated to 0 or 1.
                acc = <float>counts[best_class] / context_freq
                if acc >= min_acc:
                    yield counts[best_class], context_key, best_class

    cdef int inc(self, hash_t context_key, hash_t clas, count_t inc) except -1:
        """Add `inc` to the count of the context and of the (context, class) pair."""
        cdef hash_t context_and_class_key
        cdef hash_t[2] context_and_class
        context_and_class[0] = context_key
        context_and_class[1] = clas
        context_and_class_key = hash64(context_and_class, 2 * sizeof(hash_t), 0)
        self._counts.inc(context_key, inc)
        self._class_counts.inc(context_and_class_key, inc)

    cdef int find_best_class(self, count_t* counts, hash_t context_key) except -1:
        """Fill `counts` with the per-class counts and return the best class.

        `counts` must have room for n_classes entries. On ties the class
        with the highest index wins, because the comparison is `>=`.
        """
        cdef hash_t[2] unhashed_key
        unhashed_key[0] = context_key

        cdef hash_t key
        cdef int clas
        cdef count_t count
        # Initialised so that the return value is defined even when there
        # are no classes.
        cdef int best = 0
        cdef count_t mode = 0
        for clas in range(self.n_classes):
            unhashed_key[1] = <hash_t>clas
            key = hash64(unhashed_key, sizeof(hash_t) * 2, 0)
            count = self._class_counts[key]
            counts[clas] = count
            if count >= mode:
                mode = count
                best = clas
        return best
|
|
|
@ -1,90 +0,0 @@
|
||||||
from os import path
|
|
||||||
|
|
||||||
|
|
||||||
# Suffix rewrite rules for nouns: (suffix to strip, replacement) pairs that
# lemmatize() tries in order.
NOUN_RULES = (
    ('s', ''),
    ('ses', 's'),
    ('ves', 'f'),
    ('xes', 'x'),
    ('zes', 'z'),
    ('ches', 'ch'),
    ('shes', 'sh'),
    ('men', 'man'),
    ('ies', 'y'),
)
|
|
||||||
|
|
||||||
|
|
||||||
# Suffix rewrite rules for verbs: (suffix to strip, replacement) pairs that
# lemmatize() tries in order.
VERB_RULES = (
    ("s", ""),
    ("ies", "y"),
    ("es", "e"),
    ("es", ""),
    ("ed", "e"),
    ("ed", ""),
    ("ing", "e"),
    ("ing", ""),
)
|
|
||||||
|
|
||||||
|
|
||||||
# Suffix rewrite rules for adjectives: (suffix to strip, replacement) pairs
# that lemmatize() tries in order.
ADJ_RULES = (
    ("er", ""),
    ("est", ""),
    ("er", "e"),
    ("est", "e"),
)
|
|
||||||
|
|
||||||
|
|
||||||
class Lemmatizer(object):
    """Rule-based lemmatizer backed by index and exception files.

    For each of 'adj', 'adv', 'noun' and 'verb', the constructor reads
    `index.<pos>` and `<pos>.exc` from `wn_dict_dir`.
    """
    def __init__(self, wn_dict_dir):
        self.index = {}
        self.exc = {}
        for pos in ['adj', 'adv', 'noun', 'verb']:
            index_loc = path.join(wn_dict_dir, 'index.%s' % pos)
            exc_loc = path.join(wn_dict_dir, '%s.exc' % pos)
            self.index[pos] = read_index(index_loc)
            self.exc[pos] = read_exc(exc_loc)

    def _lemmatize(self, pos, string, rules):
        # Shared implementation of the per-category methods below.
        return lemmatize(string, self.index[pos], self.exc[pos], rules)

    def noun(self, string):
        """Return the set of candidate lemmas of `string` as a noun."""
        return self._lemmatize('noun', string, NOUN_RULES)

    def verb(self, string):
        """Return the set of candidate lemmas of `string` as a verb."""
        return self._lemmatize('verb', string, VERB_RULES)

    def adj(self, string):
        """Return the set of candidate lemmas of `string` as an adjective."""
        return self._lemmatize('adj', string, ADJ_RULES)
|
|
||||||
|
|
||||||
|
|
||||||
def lemmatize(string, index, exceptions, rules):
    """Return the set of candidate lemmas for `string`.

    The string is lower-cased first. Candidates are: the string itself when
    it is in `index`, every form listed for it in `exceptions`, and every
    form produced by a matching (suffix, replacement) rule that is in
    `index`. When nothing qualifies, the lower-cased string is the only
    candidate.
    """
    word = string.lower()
    candidates = set()
    if word in index:
        candidates.add(word)
    candidates.update(exceptions.get(word, ()))
    for suffix, replacement in rules:
        if not word.endswith(suffix):
            continue
        stem = word[:len(word) - len(suffix)]
        rewritten = stem + replacement
        if rewritten in index:
            candidates.add(rewritten)
    if not candidates:
        candidates.add(word)
    return candidates
|
|
||||||
|
|
||||||
|
|
||||||
def read_index(loc):
    """Read the set of single-word entries from the index file at `loc`.

    Lines that start with a space are skipped, and so are blank lines. The
    first whitespace-separated field of every other line is the word; words
    containing an underscore are left out.
    """
    index = set()
    # The with-block guarantees the file is closed.
    with open(loc) as file_:
        for line in file_:
            if line.startswith(' '):
                continue
            pieces = line.split()
            if not pieces:
                # A blank line has no fields; pieces[0] would raise.
                continue
            word = pieces[0]
            if '_' not in word:
                index.add(word)
    return index
|
|
||||||
|
|
||||||
|
|
||||||
def read_exc(loc):
    """Read the exceptions file at `loc` into a dict.

    Lines that start with a space are skipped, and so are blank lines. On
    every other line the first whitespace-separated field is the key and
    the remaining fields, as a tuple, are the value.
    """
    exceptions = {}
    # The with-block guarantees the file is closed.
    with open(loc) as file_:
        for line in file_:
            if line.startswith(' '):
                continue
            pieces = line.split()
            if not pieces:
                # A blank line has no fields; pieces[0] would raise.
                continue
            exceptions[pieces[0]] = tuple(pieces[1:])
    return exceptions
|
|
|
@ -36,11 +36,11 @@ cdef struct _Cached:
|
||||||
cdef class Morphologizer:
|
cdef class Morphologizer:
|
||||||
"""Given a POS tag and a Lexeme, find its lemma and morphological analysis.
|
"""Given a POS tag and a Lexeme, find its lemma and morphological analysis.
|
||||||
"""
|
"""
|
||||||
def __init__(self, StringStore strings, object lemmatizer, **kwargs):
|
def __init__(self, StringStore strings, object lemmatizer,
|
||||||
|
irregulars=None, tag_map=None, tag_names=None):
|
||||||
self.mem = Pool()
|
self.mem = Pool()
|
||||||
self.strings = strings
|
self.strings = strings
|
||||||
tag_map = kwargs['tag_map']
|
self.tag_names = tag_names
|
||||||
self.tag_names = kwargs['tag_names']
|
|
||||||
self.lemmatizer = lemmatizer
|
self.lemmatizer = lemmatizer
|
||||||
self._cache = PreshMapArray(len(self.tag_names))
|
self._cache = PreshMapArray(len(self.tag_names))
|
||||||
self.tags = <PosTag*>self.mem.alloc(len(self.tag_names), sizeof(PosTag))
|
self.tags = <PosTag*>self.mem.alloc(len(self.tag_names), sizeof(PosTag))
|
||||||
|
@ -55,9 +55,16 @@ cdef class Morphologizer:
|
||||||
self.tags[i].morph.person = props.get('person', 0)
|
self.tags[i].morph.person = props.get('person', 0)
|
||||||
self.tags[i].morph.case = props.get('case', 0)
|
self.tags[i].morph.case = props.get('case', 0)
|
||||||
self.tags[i].morph.misc = props.get('misc', 0)
|
self.tags[i].morph.misc = props.get('misc', 0)
|
||||||
#if path.exists(path.join(data_dir, 'morphs.json')):
|
if irregulars is not None:
|
||||||
# with open(path.join(data_dir, 'morphs.json')) as file_:
|
self.load_exceptions(irregulars)
|
||||||
# self.load_exceptions(json.load(file_))
|
|
||||||
|
@classmethod
|
||||||
|
def from_dir(cls, StringStore strings, object lemmatizer, data_dir):
|
||||||
|
tag_map = None
|
||||||
|
irregulars = None
|
||||||
|
tag_names = None
|
||||||
|
return cls(strings, lemmatizer, tag_map=tag_map, irregulars=irregulars,
|
||||||
|
tag_names=tag_names)
|
||||||
|
|
||||||
cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1:
|
cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1:
|
||||||
if self.lemmatizer is None:
|
if self.lemmatizer is None:
|
||||||
|
@ -86,7 +93,6 @@ cdef class Morphologizer:
|
||||||
cached.lemma = self.lemmatize(tag.pos, tokens[i].lex)
|
cached.lemma = self.lemmatize(tag.pos, tokens[i].lex)
|
||||||
cached.morph = tag.morph
|
cached.morph = tag.morph
|
||||||
self._cache.set(tag.id, tokens[i].lex.sic, <void*>cached)
|
self._cache.set(tag.id, tokens[i].lex.sic, <void*>cached)
|
||||||
|
|
||||||
tokens[i].lemma = cached.lemma
|
tokens[i].lemma = cached.lemma
|
||||||
tokens[i].morph = cached.morph
|
tokens[i].morph = cached.morph
|
||||||
|
|
||||||
|
|
|
@ -1,169 +0,0 @@
|
||||||
from spacy.context cimport FIELD_IDS, Token
|
|
||||||
|
|
||||||
|
|
||||||
# Short module-level aliases for the token slots of FIELD_IDS, used by the
# feature templates below.
# NOTE(review): presumably P1-P4 are preceding tokens, N0 the current token
# and N1-N4 following tokens -- confirm against spacy.context.
cdef Token P4 = FIELD_IDS.P4
cdef Token P3 = FIELD_IDS.P3
cdef Token P2 = FIELD_IDS.P2
cdef Token P1 = FIELD_IDS.P1

cdef Token N0 = FIELD_IDS.N0

cdef Token N1 = FIELD_IDS.N1
cdef Token N2 = FIELD_IDS.N2
cdef Token N3 = FIELD_IDS.N3
cdef Token N4 = FIELD_IDS.N4
|
|
||||||
|
|
||||||
"""
|
|
||||||
TEMPLATES = (
|
|
||||||
(N0.sic,),
|
|
||||||
(N0.cluster,),
|
|
||||||
|
|
||||||
(P1.pos,),
|
|
||||||
(P1.sic,),
|
|
||||||
|
|
||||||
(N1.norm,),
|
|
||||||
(N1.pos,),
|
|
||||||
|
|
||||||
(P1.ner,),
|
|
||||||
(P2.ner,),
|
|
||||||
|
|
||||||
(N0.cluster,),
|
|
||||||
(P1.cluster,),
|
|
||||||
(N1.cluster,),
|
|
||||||
|
|
||||||
(N0.is_alpha,),
|
|
||||||
(N0.is_digit,),
|
|
||||||
(N0.is_title,),
|
|
||||||
(N0.is_upper,),
|
|
||||||
|
|
||||||
(N0.is_title, N0.oft_title),
|
|
||||||
(N0.is_upper, N0.oft_upper),
|
|
||||||
|
|
||||||
(P1.cluster, N0.norm),
|
|
||||||
(N0.norm, N1.cluster),
|
|
||||||
|
|
||||||
(P1.ner, N0.pos),
|
|
||||||
(P2.ner, P1.ner, N0.pos),
|
|
||||||
|
|
||||||
(P2.pos, P1.pos, N0.sic),
|
|
||||||
(N0.sic, N1.pos, N2.pos)
|
|
||||||
)
|
|
||||||
"""
|
|
||||||
|
|
||||||
# Templates over the word form, prefix/suffix, shape and norm of the tokens
# in the window. Each template is a tuple of field ids.
LOCAL = (
    # Word forms.
    (N0.sic,),
    (P1.sic,),
    (N1.sic,),
    (P2.sic,),
    (N2.sic,),
    (P3.sic,),
    (N3.sic,),
    (P4.sic,),
    (N4.sic,),

    (P1.sic, N0.sic,),
    (N0.sic, N1.sic),

    # Affixes of N0.
    (N0.prefix,),
    (N0.suffix,),

    # Shapes.
    (P1.shape,),
    (N0.shape,),
    (N1.shape,),
    (P1.shape, N0.shape,),
    (N0.shape, P1.shape,),
    (P1.shape, N0.shape, N1.shape),
    (N2.shape,),
    (P2.shape,),
    (P3.shape,),
    (N3.shape,),
    (P4.shape,),
    (N4.shape,),

    # Norm trigrams.
    (P2.norm, P1.norm, N0.norm),
    (P1.norm, N0.norm, N1.norm),
    (N0.norm, N1.norm, N2.norm),
)
|
|
||||||
|
|
||||||
# Templates over boolean flags; currently only the is_title field of N0.
BOOLS = ((N0.is_title,),)
|
|
||||||
|
|
||||||
|
|
||||||
# Templates that use the `ner` field of preceding tokens, alone or combined
# with word-form and POS fields.
HISTORY = (
    (P1.ner,),
    (P1.ner, N0.sic,),
    (P2.ner,),
    (P2.ner, P1.ner),
    (P2.ner, P1.ner, N0.sic),
    (P2.pos, P1.ner, N0.pos),
    (P2.ner, P1.pos, N0.pos),
    (P3.ner,),
    (P4.ner,),
)
|
|
||||||
|
|
||||||
# Templates over the `pos` field: one unigram per token slot, then bigrams
# and trigrams around N0.
POS = (
    (P4.pos,),
    (P3.pos,),
    (P2.pos,),
    (P1.pos,),
    (N0.pos,),
    (N1.pos,),
    (N2.pos,),
    (N3.pos,),
    (N4.pos,),

    (P1.pos, N0.pos),
    (N0.pos, N1.pos),
    (P2.pos, P1.pos, N0.pos),
    (P1.pos, N0.pos, N1.pos),
    (N0.pos, N1.pos, N2.pos),
)
|
|
||||||
|
|
||||||
# Templates over the `cluster` field: one unigram per token slot, then the
# two bigrams that include N0.
CLUSTERS = (
    (P4.cluster,),
    (P3.cluster,),
    (P2.cluster,),
    (P1.cluster,),
    (N0.cluster,),
    (N1.cluster,),
    (N2.cluster,),
    (N3.cluster,),
    (N4.cluster,),

    (P1.cluster, N0.cluster),
    (N0.cluster, N1.cluster),
)
|
|
||||||
|
|
||||||
|
|
||||||
# Templates that pair a `cluster` field with a `pos` field of adjacent tokens.
CLUSTER_POS = (
    (P1.cluster, N0.pos),
    (N0.pos, P1.cluster),
    (N0.cluster, N1.pos),
    (N0.pos, N1.cluster),
)
|
|
||||||
|
|
||||||
|
|
||||||
# Templates over the in_* fields (in_males, in_females, in_surnames,
# in_places, in_games, in_celebs, in_names) of N0, P1 and N1.
GAZ = (
    # Current token.
    (N0.in_males,),
    (N0.in_females,),
    (N0.in_surnames,),
    (N0.in_places,),
    (N0.in_games,),
    (N0.in_celebs,),
    (N0.in_names,),
    # Previous token.
    (P1.in_males,),
    (P1.in_females,),
    (P1.in_surnames,),
    (P1.in_places,),
    (P1.in_games,),
    (P1.in_celebs,),
    (P1.in_names,),
    # Next token.
    (N1.in_males,),
    (N1.in_females,),
    (N1.in_surnames,),
    (N1.in_places,),
    (N1.in_games,),
    (N1.in_celebs,),
    (N1.in_names,),
)
|
|
||||||
|
|
||||||
# The full template set: the concatenation of all template groups above.
TEMPLATES = LOCAL + HISTORY + CLUSTERS + POS + CLUSTER_POS + GAZ + BOOLS
|
|
|
@ -1,15 +0,0 @@
|
||||||
from cymem.cymem cimport Pool
|
|
||||||
from .structs cimport State, Entity, Move
|
|
||||||
|
|
||||||
# C-level declarations of the state helpers implemented in the matching
# .pyx file. Functions declared `except -1` use -1 as their error sentinel;
# init_state uses NULL.

# Entity bookkeeping.
cdef int begin_entity(State* s, label) except -1
cdef int end_entity(State* s) except -1

# State allocation and copying.
cdef State* init_state(Pool mem, int sent_length) except NULL
cdef int copy_state(Pool mem, State* dest, State* source) except -1

# State queries.
cdef bint entity_is_open(State *s) except -1
cdef int entity_is_sunk(State *s, Move* golds) except -1
cdef int is_done(State* s) except -1
|
|
|
@ -1,54 +0,0 @@
|
||||||
from libc.string cimport memcpy
|
|
||||||
|
|
||||||
|
|
||||||
cdef int begin_entity(State* s, label) except -1:
    """Advance to the next entity slot and start an entity at position s.i.

    The new entity gets tag `label`, starts at s.i and ends at s.i + 1.
    """
    s.j += 1
    cdef Entity* ent = &s.ents[s.j]
    ent.start = s.i
    ent.tag = label
    ent.end = s.i + 1
    return 0
|
|
||||||
|
|
||||||
|
|
||||||
cdef int end_entity(State* s) except -1:
    """Set the end of the current entity (slot s.j) to s.i + 1."""
    cdef Entity* ent = &s.ents[s.j]
    ent.end = s.i + 1
    return 0
|
|
||||||
|
|
||||||
|
|
||||||
cdef State* init_state(Pool mem, int sent_length) except NULL:
    """Allocate a State for a sentence of `sent_length` tokens from `mem`.

    The entity and tag arrays each get `sent_length` entries. Returns the
    new state; NULL is the error sentinel.
    """
    cdef State* s = <State*>mem.alloc(1, sizeof(State))
    s.ents = <Entity*>mem.alloc(sent_length, sizeof(Entity))
    s.tags = <int*>mem.alloc(sent_length, sizeof(int))
    s.length = sent_length
    # The state has to be returned explicitly. Without this the function
    # falls off the end and yields NULL, its declared error value.
    return s
|
|
||||||
|
|
||||||
|
|
||||||
cdef bint entity_is_open(State *s) except -1:
    """Return whether the current entity slot (s.j) has a non-zero start."""
    cdef Entity* ent = &s.ents[s.j]
    return ent.start != 0
|
|
||||||
|
|
||||||
|
|
||||||
cdef int entity_is_sunk(State *s, Move* golds) except -1:
    """Return False when no entity is open; otherwise raise.

    The check for an open entity is not implemented yet, so that case
    raises NotImplementedError.
    """
    if not entity_is_open(s):
        return False
    # NotImplementedError replaces StandardError, which does not exist on
    # Python 3. On Python 2 it is a subclass of StandardError, so existing
    # handlers still catch it.
    raise NotImplementedError("entity_is_sunk is not implemented for open entities")
    # Sketch of the intended logic, kept from the original:
    #cdef Entity* ent = &s.ents[s.j]
    #cdef Move* gold = &golds[ent.start]
    #if gold.action != BEGIN and gold.action != UNIT:
    #    return True
    #elif gold.label != ent.label:
    #    return True
    #else:
    #    return False
|
|
||||||
|
|
||||||
|
|
||||||
cdef int copy_state(Pool mem, State* dest, State* source) except -1:
    '''Copy state source into state dest.

    The arrays of dest are grown through `mem` when source is longer; then
    the entity and tag arrays and the scalar fields are copied over.
    '''
    cdef size_t n_ent_bytes = source.length * sizeof(Entity)
    cdef size_t n_tag_bytes = source.length * sizeof(int)
    # Grow the destination buffers only when they are too small.
    if source.length > dest.length:
        dest.ents = <Entity*>mem.realloc(dest.ents, n_ent_bytes)
        dest.tags = <int*>mem.realloc(dest.tags, n_tag_bytes)
    memcpy(dest.ents, source.ents, n_ent_bytes)
    memcpy(dest.tags, source.tags, n_tag_bytes)
    # Scalar fields.
    dest.length = source.length
    dest.i = source.i
    dest.j = source.j
    dest.curr = source.curr
    return 0
|
|
||||||
|
|
||||||
|
|
||||||
cdef int is_done(State* s) except -1:
    """Return whether position s.i has reached s.length with no entity open."""
    if s.i < s.length:
        return False
    return not entity_is_open(s)
|
|
|
@ -1,8 +0,0 @@
|
||||||
from cymem.cymem cimport Pool
|
|
||||||
|
|
||||||
# C-level declaration of NERAnnotation; the implementation lives in the
# matching .pyx file.
cdef class NERAnnotation:
    # Owns the three arrays below.
    cdef Pool mem
    # Per-token arrays: start, end and label of the entity covering the
    # token; the constructor fills uncovered positions with -1.
    cdef int* starts
    cdef int* ends
    cdef int* labels
    # The entities passed to the constructor (readable from Python).
    cdef readonly list entities
|
|
|
@ -1,94 +0,0 @@
|
||||||
from libc.string cimport memset
|
|
||||||
|
|
||||||
|
|
||||||
cdef class NERAnnotation:
    """Entity annotation of one sentence, stored as per-token arrays.

    For every token covered by an entity, `starts`, `ends` and `labels`
    hold the entity's start, end and label; all other positions hold -1.
    """
    def __init__(self, entities, length, entity_types):
        """Build the per-token arrays.

        entities: iterable of (start, end, label) integer triples; an
            entity covers the positions range(start, end).
        length: number of tokens.
        entity_types: not used by the constructor.
        """
        self.mem = Pool()
        self.starts = <int*>self.mem.alloc(length, sizeof(int))
        self.ends = <int*>self.mem.alloc(length, sizeof(int))
        self.labels = <int*>self.mem.alloc(length, sizeof(int))
        self.entities = entities
        # Setting every byte to 0xFF makes every int equal to -1.
        memset(self.starts, -1, sizeof(int) * length)
        memset(self.ends, -1, sizeof(int) * length)
        memset(self.labels, -1, sizeof(int) * length)

        cdef int start, end, label
        for start, end, label in entities:
            for i in range(start, end):
                self.starts[i] = start
                self.ends[i] = end
                self.labels[i] = label

    @classmethod
    def from_bilous(cls, tag_strs, entity_types):
        """Build an annotation from a sequence of BILOU tag strings.

        Tags 'O' and '-' are skipped. Every other tag has the form
        '<move>-<label>'. A label that is not yet in `entity_types` is
        appended to that list, so the list is modified in place.
        """
        entities = []
        start = None
        for i, tag_str in enumerate(tag_strs):
            if tag_str == 'O' or tag_str == '-':
                continue
            # Split on the first hyphen only, so labels may contain '-'.
            move, label_str = tag_str.split('-', 1)
            # list.index raises ValueError for a missing item; it never
            # returns -1. Register the label string itself as the new type.
            try:
                label = entity_types.index(label_str)
            except ValueError:
                label = len(entity_types)
                entity_types.append(label_str)
            if move == 'U':
                assert start is None
                entities.append((i, i+1, label))
            elif move == 'B':
                assert start is None
                start = i
            elif move == 'L':
                assert start is not None
                entities.append((start, i+1, label))
                start = None
        return cls(entities, len(tag_strs), entity_types)
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def read_iob(file_, entity_types, create_tokens):
    """Read sentences with IOB entity tags from `file_`.

    Sentences are separated by blank lines; a sentence that starts with
    '-DOCSTART-' is skipped. Every token line must hold exactly four
    whitespace-separated fields: word, pos, chunk and ner. Returns a list
    of (tokens, annotation) pairs, where tokens is `create_tokens(words)`.
    """
    sents = []
    for sent_str in file_.read().strip().split('\n\n'):
        if sent_str.startswith('-DOCSTART-'):
            continue
        words = []
        iob = []
        rows = [token_str.split() for token_str in sent_str.split('\n')]
        for word, pos, chunk, ner in rows:
            words.append(word)
            iob.append(ner)
        bilou = iob_to_bilou(iob)
        tokens = create_tokens(words)
        annot = NERAnnotation.from_bilous(bilou, entity_types)
        sents.append((tokens, annot))
    return sents
|
|
||||||
|
|
||||||
|
|
||||||
def iob_to_bilou(tags):
    """Convert a sequence of IOB tags into the equivalent BILOU tags.

    The input is copied first, so the caller's sequence is not modified.
    Each pass consumes a run of 'O' tags and then one entity.

    Args:
        tags: Iterable of IOB tag strings ('O', 'B-<label>', 'I-<label>').

    Returns:
        List of BILOU tag strings with the same length as the input.
    """
    out = []
    tags = list(tags)
    while tags:
        out.extend(_consume_os(tags))
        out.extend(_consume_ent(tags))
    return out
|
|
||||||
|
|
||||||
def _consume_os(tags):
|
|
||||||
while tags and tags[0] == 'O':
|
|
||||||
yield tags.pop(0)
|
|
||||||
|
|
||||||
def _consume_ent(tags):
|
|
||||||
if not tags:
|
|
||||||
return []
|
|
||||||
target = tags.pop(0).replace('B', 'I')
|
|
||||||
length = 1
|
|
||||||
while tags and tags[0] == target:
|
|
||||||
length += 1
|
|
||||||
tags.pop(0)
|
|
||||||
label = target[2:]
|
|
||||||
if length == 1:
|
|
||||||
return ['U-' + label]
|
|
||||||
else:
|
|
||||||
start = 'B-' + label
|
|
||||||
end = 'L-' + label
|
|
||||||
middle = ['I-%s' % label for _ in range(1, length - 1)]
|
|
||||||
return [start] + middle + [end]
|
|
|
@ -1,27 +0,0 @@
|
||||||
from cymem.cymem cimport Pool
|
|
||||||
|
|
||||||
from thinc.typedefs cimport class_t
|
|
||||||
from thinc.typedefs cimport weight_t
|
|
||||||
|
|
||||||
from .structs cimport State, Move
|
|
||||||
|
|
||||||
|
|
||||||
cpdef enum ActionType:
|
|
||||||
MISSING
|
|
||||||
BEGIN
|
|
||||||
IN
|
|
||||||
LAST
|
|
||||||
UNIT
|
|
||||||
OUT
|
|
||||||
N_ACTIONS
|
|
||||||
|
|
||||||
|
|
||||||
cdef int set_accept_if_oracle(Move* moves, Move* golds, int n, State* s) except 0
|
|
||||||
|
|
||||||
cdef int set_accept_if_valid(Move* moves, int n, State* s) except 0
|
|
||||||
|
|
||||||
cdef Move* best_accepted(Move* moves, weight_t* scores, int n) except NULL
|
|
||||||
|
|
||||||
cdef int transition(State *s, Move* m) except -1
|
|
||||||
|
|
||||||
cdef int fill_moves(Move* moves, list tag_names) except -1
|
|
|
@ -1,207 +0,0 @@
|
||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
from ._state cimport begin_entity
|
|
||||||
from ._state cimport end_entity
|
|
||||||
from ._state cimport entity_is_open
|
|
||||||
from ._state cimport entity_is_sunk
|
|
||||||
|
|
||||||
|
|
||||||
ACTION_NAMES = ['' for _ in range(N_ACTIONS)]
|
|
||||||
ACTION_NAMES[<int>MISSING] = '?'
|
|
||||||
ACTION_NAMES[<int>BEGIN] = 'B'
|
|
||||||
ACTION_NAMES[<int>IN] = 'I'
|
|
||||||
ACTION_NAMES[<int>LAST] = 'L'
|
|
||||||
ACTION_NAMES[<int>UNIT] = 'U'
|
|
||||||
ACTION_NAMES[<int>OUT] = 'O'
|
|
||||||
|
|
||||||
|
|
||||||
cdef bint can_begin(State* s, int label):
|
|
||||||
return not entity_is_open(s)
|
|
||||||
|
|
||||||
|
|
||||||
cdef bint can_in(State* s, int label):
|
|
||||||
return entity_is_open(s) and s.curr.label == label
|
|
||||||
|
|
||||||
|
|
||||||
cdef bint can_last(State* s, int label):
|
|
||||||
return entity_is_open(s) and s.curr.label == label
|
|
||||||
|
|
||||||
|
|
||||||
cdef bint can_unit(State* s, int label):
|
|
||||||
return not entity_is_open(s)
|
|
||||||
|
|
||||||
|
|
||||||
cdef bint can_out(State* s, int label):
|
|
||||||
return not entity_is_open(s)
|
|
||||||
|
|
||||||
|
|
||||||
cdef bint is_oracle(ActionType act, int tag, ActionType g_act, int g_tag,
                    ActionType next_act, bint is_sunk):
    # Decide whether taking action `act` with label `tag` is acceptable given
    # the gold action `g_act` / gold label `g_tag` for the current token.
    # `next_act` is the gold action of the following token and `is_sunk` is
    # supplied by the caller from entity_is_sunk().
    #
    # Decision table (rows: proposed action, columns: gold action):
    #        B            I        L                 O          U
    #   B    tag match    False    False             False      False
    #   I    True         True     sunk & next O/?   next O/?   next O
    #   L    True         sunk     True              True       True
    #   O    False        True     True              True       False
    #   U    False        False    False             False      tag match
    # A MISSING gold action accepts everything.
    cdef bint next_is_outside = next_act == OUT or next_act == MISSING
    if g_act == MISSING:
        return True
    if act == BEGIN:
        return g_act == BEGIN and tag == g_tag
    if act == UNIT:
        return g_act == UNIT and tag == g_tag
    if act == IN:
        if g_act == BEGIN or g_act == IN:
            # The label is forced by the already-open entity.
            return True
        if g_act == LAST:
            return is_sunk and next_is_outside
        if g_act == OUT:
            return next_is_outside
        if g_act == UNIT:
            # Unlike the rows above, a MISSING next action is not accepted here.
            return next_act == OUT
        return False
    if act == LAST:
        if g_act == IN:
            return is_sunk
        return g_act == BEGIN or g_act == LAST or g_act == OUT or g_act == UNIT
    if act == OUT:
        return g_act == IN or g_act == LAST or g_act == OUT
    # Any other proposed action (e.g. MISSING) is never an oracle move.
    return False
|
|
||||||
|
|
||||||
|
|
||||||
cdef int set_accept_if_valid(Move* moves, int n_classes, State* s) except 0:
|
|
||||||
cdef int n_accept = 0
|
|
||||||
cdef Move* m
|
|
||||||
moves[0].accept = False
|
|
||||||
for i in range(1, n_classes):
|
|
||||||
m = &moves[i]
|
|
||||||
if m.action == BEGIN:
|
|
||||||
m.accept = can_begin(s, m.label)
|
|
||||||
elif m.action == IN:
|
|
||||||
m.accept = can_in(s, m.label)
|
|
||||||
elif m.action == LAST:
|
|
||||||
m.accept = can_last(s, m.label)
|
|
||||||
elif m.action == UNIT:
|
|
||||||
m.accept = can_unit(s, m.label)
|
|
||||||
elif m.action == OUT:
|
|
||||||
m.accept = can_out(s, m.label)
|
|
||||||
n_accept += m.accept
|
|
||||||
assert n_accept != 0
|
|
||||||
return n_accept
|
|
||||||
|
|
||||||
|
|
||||||
cdef int set_accept_if_oracle(Move* moves, Move* golds, int n_classes, State* s) except 0:
    # Mark in moves[1..n_classes-1] the moves that are both valid in state `s`
    # and approved by is_oracle() against the gold move for token s.i.
    # Returns the number of accepted moves; asserts that at least one remains.
    cdef Move* g = &golds[s.i]
    # Look ahead only when a following token exists. The previous guard
    # (s.i < s.length) read golds[s.length] on the last token, one element
    # past the gold array.
    # NOTE(review): assumes golds holds s.length entries - confirm at callers.
    cdef ActionType next_act = <ActionType>golds[s.i+1].action if (s.i + 1) < s.length else OUT
    cdef bint is_sunk = entity_is_sunk(s, golds)
    cdef Move* m
    cdef int i
    cdef int n_accept = 0
    # Start from validity, then narrow down to the oracle moves.
    set_accept_if_valid(moves, n_classes, s)
    for i in range(1, n_classes):
        m = &moves[i]
        if not m.accept:
            continue
        m.accept = is_oracle(<ActionType>m.action, m.label, <ActionType>g.action,
                             g.label, next_act, is_sunk)
        n_accept += m.accept
    assert n_accept != 0
    return n_accept
|
|
||||||
|
|
||||||
|
|
||||||
cdef Move* best_accepted(Move* moves, weight_t* scores, int n) except NULL:
    # Return a pointer to the highest-scoring move whose `accept` flag is set.
    # moves[0] is never considered, and the score of moves[i] is read from
    # scores[i-1]. Ties keep the lowest index.
    # Raises ValueError when no move in moves[1..n-1] is accepted.
    cdef int first_accept
    cdef int best
    cdef weight_t score
    cdef int i
    for first_accept in range(1, n):
        if moves[first_accept].accept:
            break
    else:
        # ValueError exists on Python 2 and 3; on Python 2 it is a subclass
        # of the StandardError raised before, so existing handlers still match.
        raise ValueError("No accepted move among %d classes" % n)
    best = first_accept
    score = scores[first_accept - 1]
    for i in range(first_accept + 1, n):
        if moves[i].accept and scores[i - 1] > score:
            best = i
            score = scores[i - 1]
    return &moves[best]
|
|
||||||
|
|
||||||
|
|
||||||
cdef int transition(State *s, Move* move) except -1:
    # Apply `move` to the state: open and/or close an entity as the action
    # requires, record the move's class for the current token, and advance.
    cdef int act = move.action
    # BEGIN opens an entity; UNIT opens and immediately closes one.
    if act == BEGIN or act == UNIT:
        begin_entity(s, move.label)
    # LAST closes the open entity. IN and OUT leave the entity state alone.
    if act == LAST or act == UNIT:
        end_entity(s)
    s.tags[s.i] = move.clas
    s.i += 1
|
|
||||||
|
|
||||||
|
|
||||||
def get_n_moves(n_tags):
    """Return the number of moves for `n_tags` labels: four per label plus one."""
    return 4 * n_tags + 1
|
|
||||||
|
|
||||||
|
|
||||||
cdef int fill_moves(Move* moves, list tag_names) except -1:
    # Fill moves[i] from tag_names[i].
    #   '<action>-<label>'  -> that action, with a per-label integer id
    #   'O'                 -> the OUT action, label '-'
    #   'NULL' / 'EOL'      -> the MISSING action ('?'), label '-'
    # Label ids are handed out in first-seen order; '-' is always 0.
    # Raises ValueError for any other tag name, or for an action letter that
    # is not in ACTION_NAMES.
    # NOTE(review): `moves` must have room for len(tag_names) entries; that
    # cannot be checked from here.
    cdef Move* m
    cdef int i
    label_names = {'-': 0}
    for i, tag_name in enumerate(tag_names):
        m = &moves[i]
        if '-' in tag_name:
            # Split once so that labels may contain hyphens themselves.
            action_str, label = tag_name.split('-', 1)
        elif tag_name == 'O':
            action_str = 'O'
            label = '-'
        elif tag_name == 'NULL' or tag_name == 'EOL':
            action_str = '?'
            label = '-'
        else:
            # ValueError works on Python 2 and 3, and on Python 2 it is a
            # subclass of the StandardError raised before.
            raise ValueError(tag_name)
        m.action = ACTION_NAMES.index(action_str)
        m.label = label_names.setdefault(label, len(label_names))
        m.clas = i
    return 0
|
|
|
@ -1,155 +0,0 @@
|
||||||
from thinc.typedefs cimport atom_t
|
|
||||||
from ..typedefs cimport hash_t
|
|
||||||
from ..tokens cimport Tokens
|
|
||||||
from ..lexeme cimport Lexeme
|
|
||||||
from .structs cimport State
|
|
||||||
|
|
||||||
|
|
||||||
cpdef enum:
|
|
||||||
T_sic
|
|
||||||
T_cluster
|
|
||||||
T_norm
|
|
||||||
T_shape
|
|
||||||
T_asciied
|
|
||||||
T_prefix
|
|
||||||
T_suffix
|
|
||||||
T_length
|
|
||||||
T_postype
|
|
||||||
T_nertype
|
|
||||||
T_sensetype
|
|
||||||
T_is_alpha
|
|
||||||
T_is_ascii
|
|
||||||
T_is_digit
|
|
||||||
T_is_lower
|
|
||||||
T_is_punct
|
|
||||||
T_is_space
|
|
||||||
T_is_title
|
|
||||||
T_is_upper
|
|
||||||
T_like_url
|
|
||||||
T_like_number
|
|
||||||
T_oft_lower
|
|
||||||
T_oft_title
|
|
||||||
T_oft_upper
|
|
||||||
T_in_males
|
|
||||||
T_in_females
|
|
||||||
T_in_surnames
|
|
||||||
T_in_places
|
|
||||||
T_in_celebs
|
|
||||||
T_in_names
|
|
||||||
T_pos
|
|
||||||
T_sense
|
|
||||||
T_ner
|
|
||||||
|
|
||||||
|
|
||||||
cpdef enum:
|
|
||||||
P2_sic
|
|
||||||
P2_cluster
|
|
||||||
P2_norm
|
|
||||||
P2_shape
|
|
||||||
P2_prefix
|
|
||||||
P2_suffix
|
|
||||||
P2_length
|
|
||||||
P2_postype
|
|
||||||
P2_is_alpha
|
|
||||||
P2_is_digit
|
|
||||||
P2_is_lower
|
|
||||||
P2_is_punct
|
|
||||||
P2_is_title
|
|
||||||
P2_is_upper
|
|
||||||
P2_like_number
|
|
||||||
P2_pos
|
|
||||||
|
|
||||||
P1_sic
|
|
||||||
P1_cluster
|
|
||||||
P1_norm
|
|
||||||
P1_shape
|
|
||||||
P1_prefix
|
|
||||||
P1_suffix
|
|
||||||
P1_length
|
|
||||||
P1_postype
|
|
||||||
P1_is_alpha
|
|
||||||
P1_is_digit
|
|
||||||
P1_is_lower
|
|
||||||
P1_is_punct
|
|
||||||
P1_is_title
|
|
||||||
P1_is_upper
|
|
||||||
P1_like_number
|
|
||||||
P1_pos
|
|
||||||
|
|
||||||
W_sic
|
|
||||||
W_cluster
|
|
||||||
W_norm
|
|
||||||
W_shape
|
|
||||||
W_prefix
|
|
||||||
W_suffix
|
|
||||||
W_length
|
|
||||||
W_postype
|
|
||||||
W_is_alpha
|
|
||||||
W_is_digit
|
|
||||||
W_is_lower
|
|
||||||
W_is_punct
|
|
||||||
W_is_space
|
|
||||||
W_is_title
|
|
||||||
W_is_upper
|
|
||||||
W_like_number
|
|
||||||
W_pos
|
|
||||||
|
|
||||||
N1_sic
|
|
||||||
N1_cluster
|
|
||||||
N1_norm
|
|
||||||
N1_shape
|
|
||||||
N1_prefix
|
|
||||||
N1_suffix
|
|
||||||
N1_length
|
|
||||||
N1_postype
|
|
||||||
N1_is_alpha
|
|
||||||
N1_is_ascii
|
|
||||||
N1_is_digit
|
|
||||||
N1_is_lower
|
|
||||||
N1_is_punct
|
|
||||||
N1_is_space
|
|
||||||
N1_is_title
|
|
||||||
N1_is_upper
|
|
||||||
N1_like_number
|
|
||||||
N1_pos
|
|
||||||
|
|
||||||
N2_sic
|
|
||||||
N2_cluster
|
|
||||||
N2_norm
|
|
||||||
N2_shape
|
|
||||||
N2_asciied
|
|
||||||
N2_prefix
|
|
||||||
N2_suffix
|
|
||||||
N2_length
|
|
||||||
N2_postype
|
|
||||||
N2_is_alpha
|
|
||||||
N2_is_digit
|
|
||||||
N2_is_lower
|
|
||||||
N2_is_punct
|
|
||||||
N2_is_space
|
|
||||||
N2_is_title
|
|
||||||
N2_is_upper
|
|
||||||
N2_like_number
|
|
||||||
N2_pos
|
|
||||||
N2_sense
|
|
||||||
|
|
||||||
E_label
|
|
||||||
|
|
||||||
E0_sic
|
|
||||||
E0_cluster
|
|
||||||
E0_pos
|
|
||||||
|
|
||||||
E1_sic
|
|
||||||
E1_cluster
|
|
||||||
E1_pos
|
|
||||||
|
|
||||||
E_last_sic
|
|
||||||
E_last_cluster
|
|
||||||
E_last_pos
|
|
||||||
|
|
||||||
N_FIELDS
|
|
||||||
|
|
||||||
|
|
||||||
cdef int fill_context(atom_t* context, State* s, Tokens tokens) except -1
|
|
||||||
|
|
||||||
|
|
|
@ -1,77 +0,0 @@
|
||||||
from libc.string cimport memset
|
|
||||||
|
|
||||||
from murmurhash.mrmr cimport hash64
|
|
||||||
from ._state cimport entity_is_open
|
|
||||||
from ..lexeme cimport *
|
|
||||||
|
|
||||||
|
|
||||||
cdef int _fill_token(atom_t* c, Lexeme* lex, atom_t pos):
|
|
||||||
c[T_sic] = lex.sic
|
|
||||||
c[T_cluster] = lex.cluster
|
|
||||||
c[T_norm] = lex.norm if (lex.prob != 0 and lex.prob >= -10) else lex.shape
|
|
||||||
c[T_shape] = lex.shape
|
|
||||||
c[T_asciied] = lex.asciied
|
|
||||||
c[T_prefix] = lex.prefix
|
|
||||||
c[T_suffix] = lex.suffix
|
|
||||||
c[T_length] = lex.length
|
|
||||||
|
|
||||||
c[T_postype] = lex.postype
|
|
||||||
c[T_nertype] = 0
|
|
||||||
c[T_sensetype] = 0
|
|
||||||
|
|
||||||
c[T_is_alpha] = lex.flags & (1 << IS_ALPHA)
|
|
||||||
c[T_is_digit] = lex.flags & (1 << IS_DIGIT)
|
|
||||||
c[T_is_lower] = lex.flags & (1 << IS_LOWER)
|
|
||||||
c[T_is_punct] = lex.flags & (1 << IS_PUNCT)
|
|
||||||
c[T_is_space] = lex.flags & (1 << IS_SPACE)
|
|
||||||
c[T_is_title] = lex.flags & (1 << IS_TITLE)
|
|
||||||
c[T_is_upper] = lex.flags & (1 << IS_UPPER)
|
|
||||||
c[T_like_url] = lex.flags & (1 << LIKE_URL)
|
|
||||||
c[T_like_number] = lex.flags & (1 << LIKE_NUMBER)
|
|
||||||
c[T_oft_lower] = lex.flags & (1 << OFT_LOWER)
|
|
||||||
c[T_oft_title] = lex.flags & (1 << OFT_TITLE)
|
|
||||||
c[T_oft_upper] = lex.flags & (1 << OFT_UPPER)
|
|
||||||
|
|
||||||
c[T_in_males] = lex.flags & (1 << IN_MALES)
|
|
||||||
c[T_in_females] = lex.flags & (1 << IN_FEMALES)
|
|
||||||
c[T_in_surnames] = lex.flags & (1 << IN_SURNAMES)
|
|
||||||
c[T_in_places] = lex.flags & (1 << IN_PLACES)
|
|
||||||
c[T_in_celebs] = lex.flags & (1 << IN_CELEBS)
|
|
||||||
c[T_in_names] = lex.flags & (1 << IN_NAMES)
|
|
||||||
|
|
||||||
c[T_pos] = pos
|
|
||||||
c[T_sense] = 0
|
|
||||||
|
|
||||||
|
|
||||||
cdef int _fill_outer_token(atom_t* c, Lexeme* lex, atom_t pos):
|
|
||||||
c[0] = lex.sic
|
|
||||||
c[1] = lex.cluster
|
|
||||||
c[2] = lex.shape
|
|
||||||
c[3] = pos
|
|
||||||
|
|
||||||
|
|
||||||
cdef int fill_context(atom_t* context, State* s, Tokens tokens) except -1:
    # Populate the N_FIELDS-long `context` array for the token at s.i:
    # a five-token window (two before, current, two after), plus features of
    # the currently open entity if there is one. Always returns 1.
    cdef int i
    for i in range(N_FIELDS):
        context[i] = 0
    i = s.i
    # NOTE(review): positions i-2 .. i+2 are read unconditionally; presumably
    # Tokens pads both ends of lex/pos - confirm before relying on this.
    _fill_token(&context[P2_sic], tokens.lex[i-2], tokens.pos[i-2])
    _fill_token(&context[P1_sic], tokens.lex[i-1], tokens.pos[i-1])
    _fill_token(&context[W_sic], tokens.lex[i], tokens.pos[i])
    _fill_token(&context[N1_sic], tokens.lex[i+1], tokens.pos[i+1])
    _fill_token(&context[N2_sic], tokens.lex[i+2], tokens.pos[i+2])

    if entity_is_open(s):
        context[E_label] = s.curr.label
        # First token of the open entity.
        context[E0_sic] = tokens.lex[s.curr.start].sic
        context[E0_cluster] = tokens.lex[s.curr.start].cluster
        context[E0_pos] = tokens.pos[s.curr.start]
        # Token just before the current position.
        context[E_last_sic] = tokens.lex[s.i-1].sic
        context[E_last_cluster] = tokens.lex[s.i-1].cluster
        context[E_last_pos] = tokens.pos[s.i-1]
        # Second entity token, only once the entity spans at least two tokens.
        if (s.curr.start + 1) < s.i:
            context[E1_sic] = tokens.lex[s.curr.start+1].sic
            context[E1_cluster] = tokens.lex[s.curr.start+1].cluster
            context[E1_pos] = tokens.pos[s.curr.start+1]
    return 1
|
|
|
@ -1,107 +0,0 @@
|
||||||
from .context import *
|
|
||||||
|
|
||||||
|
|
||||||
LOCAL = (
|
|
||||||
(W_sic,),
|
|
||||||
(P1_sic,),
|
|
||||||
(N1_sic,),
|
|
||||||
(P2_sic,),
|
|
||||||
(N2_sic,),
|
|
||||||
|
|
||||||
(P1_sic, W_sic,),
|
|
||||||
(W_sic, N1_sic),
|
|
||||||
|
|
||||||
(W_prefix,),
|
|
||||||
(W_suffix,),
|
|
||||||
|
|
||||||
(P1_shape,),
|
|
||||||
(W_shape,),
|
|
||||||
(N1_shape,),
|
|
||||||
(P1_shape, W_shape,),
|
|
||||||
(W_shape, P1_shape,),
|
|
||||||
(P1_shape, W_shape, N1_shape),
|
|
||||||
(N2_shape,),
|
|
||||||
(P2_shape,),
|
|
||||||
|
|
||||||
(P2_norm, P1_norm, W_norm),
|
|
||||||
(P1_norm, W_norm, N1_norm),
|
|
||||||
(W_norm, N1_norm, N2_norm)
|
|
||||||
)
|
|
||||||
|
|
||||||
POS = (
|
|
||||||
(P2_pos,),
|
|
||||||
(P1_pos,),
|
|
||||||
(W_pos,),
|
|
||||||
(N1_pos,),
|
|
||||||
(N2_pos,),
|
|
||||||
|
|
||||||
(P1_pos, W_pos),
|
|
||||||
(W_pos, N1_pos),
|
|
||||||
(P2_pos, P1_pos, W_pos),
|
|
||||||
(P1_pos, W_pos, N1_pos),
|
|
||||||
(W_pos, N1_pos, N2_pos)
|
|
||||||
)
|
|
||||||
|
|
||||||
CLUSTERS = (
|
|
||||||
(P2_cluster,),
|
|
||||||
(P1_cluster,),
|
|
||||||
(W_cluster,),
|
|
||||||
(N1_cluster,),
|
|
||||||
(N2_cluster,),
|
|
||||||
|
|
||||||
(P1_cluster, W_cluster),
|
|
||||||
(W_cluster, N1_cluster),
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
CLUSTER_POS = (
|
|
||||||
(P1_cluster, W_pos),
|
|
||||||
(W_pos, P1_cluster),
|
|
||||||
(W_cluster, N1_pos),
|
|
||||||
(W_pos, N1_cluster)
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
STATE = (
|
|
||||||
(E0_sic,),
|
|
||||||
(E0_cluster,),
|
|
||||||
(E0_pos,),
|
|
||||||
(E_last_sic,),
|
|
||||||
(E_last_cluster,),
|
|
||||||
(E_last_pos,),
|
|
||||||
|
|
||||||
(E0_sic, W_sic),
|
|
||||||
(E0_cluster, W_cluster),
|
|
||||||
(E0_pos, W_pos),
|
|
||||||
(E_last_sic, W_sic),
|
|
||||||
(E_last_pos, W_pos),
|
|
||||||
|
|
||||||
(E0_pos, E_last_pos, W_pos),
|
|
||||||
(E0_cluster, E_last_cluster, W_cluster),
|
|
||||||
|
|
||||||
(E0_sic, E_last_sic),
|
|
||||||
(E0_pos, E_last_pos),
|
|
||||||
(E0_cluster, E_last_cluster),
|
|
||||||
(E0_pos, E_last_cluster),
|
|
||||||
(E0_cluster, E_last_pos),
|
|
||||||
|
|
||||||
(E1_sic,),
|
|
||||||
(E1_cluster,),
|
|
||||||
(E1_pos,),
|
|
||||||
|
|
||||||
(E0_sic, E1_sic),
|
|
||||||
(E0_sic, E1_pos,),
|
|
||||||
(E0_pos, E1_sic,),
|
|
||||||
(E0_pos, E1_pos),
|
|
||||||
|
|
||||||
(E_label,),
|
|
||||||
(E_label, W_sic),
|
|
||||||
(E_label, W_pos),
|
|
||||||
(E_label, W_cluster),
|
|
||||||
(E_label, W_shape),
|
|
||||||
(E_label, E_last_sic),
|
|
||||||
(E_label, E0_pos, E_last_pos),
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
TEMPLATES = LOCAL + CLUSTERS + POS + CLUSTER_POS + STATE
|
|
|
@ -1,29 +0,0 @@
|
||||||
from cymem.cymem cimport Pool
|
|
||||||
from thinc.features cimport Extractor
|
|
||||||
from thinc.learner cimport LinearModel
|
|
||||||
from thinc.typedefs cimport *
|
|
||||||
|
|
||||||
from ..tokens cimport Tokens
|
|
||||||
from ..typedefs cimport *
|
|
||||||
|
|
||||||
from .structs cimport Move
|
|
||||||
from .annot cimport NERAnnotation
|
|
||||||
|
|
||||||
|
|
||||||
cdef class NERParser:
|
|
||||||
cdef Pool mem
|
|
||||||
cdef Extractor extractor
|
|
||||||
cdef LinearModel model
|
|
||||||
cdef readonly list tag_names
|
|
||||||
cdef readonly list entity_types
|
|
||||||
cdef readonly int n_classes
|
|
||||||
|
|
||||||
cdef Move* _moves
|
|
||||||
cdef atom_t* _context
|
|
||||||
cdef feat_t* _feats
|
|
||||||
cdef weight_t* _values
|
|
||||||
cdef weight_t* _scores
|
|
||||||
|
|
||||||
|
|
||||||
cpdef list train(self, Tokens tokens, NERAnnotation annot)
|
|
||||||
cpdef list set_tags(self, Tokens tokens)
|
|
|
@ -1,81 +0,0 @@
|
||||||
cimport cython
|
|
||||||
import random
|
|
||||||
import os
|
|
||||||
from os import path
|
|
||||||
import shutil
|
|
||||||
import json
|
|
||||||
|
|
||||||
from thinc.features cimport ConjFeat
|
|
||||||
|
|
||||||
from ..context cimport fill_context
|
|
||||||
from ..context cimport N_FIELDS
|
|
||||||
from .moves cimport Move
|
|
||||||
from .moves cimport fill_moves, transition, best_accepted
|
|
||||||
from .moves cimport set_accept_if_valid, set_accept_if_oracle
|
|
||||||
from .moves import get_n_moves
|
|
||||||
from ._state cimport State
|
|
||||||
from ._state cimport init_state
|
|
||||||
|
|
||||||
|
|
||||||
cdef class NERParser:
    # Greedy transition-based entity recogniser: at every step the linear
    # model scores all moves and the best valid one is applied to the state.
    #
    # NOTE(review): the signatures of train() and set_tags() here differ from
    # the declarations in the matching .pxd (`cpdef list train(self, Tokens,
    # NERAnnotation)` and `cpdef list set_tags(self, Tokens)`); they are kept
    # as written in this file and still need to be reconciled.
    def __init__(self, model_dir):
        # Load config.json (keys 'templates' and 'entity_types') from
        # model_dir, build the move table and scratch buffers, and load the
        # weights from model_dir/model if that path exists.
        self.mem = Pool()
        # Close the config file deterministically instead of leaking the handle.
        with open(path.join(model_dir, 'config.json')) as file_:
            cfg = json.load(file_)
        templates = cfg['templates']
        self.entity_types = cfg['entity_types']
        self.extractor = Extractor(templates, [ConjFeat] * len(templates))
        self.n_classes = get_n_moves(len(self.entity_types))
        self._moves = <Move*>self.mem.alloc(self.n_classes, sizeof(Move))
        # fill_moves is declared as (Move* moves, int n, list entity_types).
        fill_moves(self._moves, self.n_classes, self.entity_types)
        # self.tag_names is never assigned, so len(self.tag_names) could not
        # work; size the model by the number of move classes instead.
        # NOTE(review): confirm n_classes is the intended model width.
        self.model = LinearModel(self.n_classes)
        if path.exists(path.join(model_dir, 'model')):
            self.model.load(path.join(model_dir, 'model'))

        self._context = <atom_t*>self.mem.alloc(N_FIELDS, sizeof(atom_t))
        self._feats = <feat_t*>self.mem.alloc(self.extractor.n+1, sizeof(feat_t))
        self._values = <weight_t*>self.mem.alloc(self.extractor.n+1, sizeof(weight_t))
        self._scores = <weight_t*>self.mem.alloc(self.model.nr_class, sizeof(weight_t))

    cpdef int train(self, Tokens tokens, gold_classes):
        # Run one training pass over `tokens`, following the model's own
        # predictions and updating the weights whenever the predicted move
        # differs from the best oracle move. `gold_classes` holds one move
        # class id per token. Returns 0.
        cdef Pool mem = Pool()
        cdef State* s = init_state(mem, tokens.length)
        cdef Move* golds = <Move*>mem.alloc(len(gold_classes), sizeof(Move))
        cdef Move* guess
        cdef Move* gold
        for i, clas in enumerate(gold_classes):
            # fill_moves stores moves[k].clas = k, so the move for class
            # `clas` sits at index `clas`; only then can the assertion hold.
            golds[i] = self._moves[clas]
            assert golds[i].clas == clas
        while s.i < tokens.length:
            fill_context(self._context, s, tokens)
            self.extractor.extract(self._feats, self._values, self._context, NULL)
            self.model.score(self._scores, self._feats, self._values)

            set_accept_if_valid(self._moves, self.n_classes, s)
            guess = best_accepted(self._moves, self._scores, self.n_classes)

            # TODO (carried over from the original): this call does not match
            # the oracle signature in the cimported .moves module, which
            # takes (moves, n, s, g_starts, g_ends, g_labels).
            set_accept_if_oracle(self._moves, golds, self.n_classes, s)
            gold = best_accepted(self._moves, self._scores, self.n_classes)

            if guess.clas == gold.clas:
                self.model.update({})
            else:
                counts = {guess.clas: {}, gold.clas: {}}
                self.extractor.count(counts[gold.clas], self._feats, 1)
                self.extractor.count(counts[guess.clas], self._feats, -1)
                self.model.update(counts)

            # Always advance. Returning on a correct guess, as before, ended
            # the pass after the first correct prediction.
            transition(s, guess)
            tokens.ner[s.i-1] = s.tags[s.i-1]
        return 0

    cpdef int set_tags(self, Tokens tokens) except -1:
        # Predict greedily: apply the best valid move until all tokens are
        # consumed, copying each recorded tag into tokens.ner.
        cdef Pool mem = Pool()
        cdef State* s = init_state(mem, tokens.length)
        cdef Move* move
        while s.i < tokens.length:
            fill_context(self._context, s, tokens)
            self.extractor.extract(self._feats, self._values, self._context, NULL)
            self.model.score(self._scores, self._feats, self._values)
            set_accept_if_valid(self._moves, self.n_classes, s)
            move = best_accepted(self._moves, self._scores, self.n_classes)
            transition(s, move)
            tokens.ner[s.i-1] = s.tags[s.i-1]
        return 0
|
|
|
@ -1,26 +0,0 @@
|
||||||
from cymem.cymem cimport Pool
|
|
||||||
|
|
||||||
from thinc.typedefs cimport class_t
|
|
||||||
from thinc.typedefs cimport weight_t
|
|
||||||
|
|
||||||
from .structs cimport State, Move
|
|
||||||
|
|
||||||
|
|
||||||
cpdef enum ActionType:
|
|
||||||
MISSING
|
|
||||||
SHIFT
|
|
||||||
REDUCE
|
|
||||||
OUT
|
|
||||||
N_ACTIONS
|
|
||||||
|
|
||||||
|
|
||||||
cdef int set_accept_if_oracle(Move* moves, int n, State* s,
|
|
||||||
int* g_starts, int* g_ends, int* g_labels) except 0
|
|
||||||
|
|
||||||
cdef int set_accept_if_valid(Move* moves, int n, State* s) except 0
|
|
||||||
|
|
||||||
cdef Move* best_accepted(Move* moves, weight_t* scores, int n) except NULL
|
|
||||||
|
|
||||||
cdef int transition(State *s, Move* m) except -1
|
|
||||||
|
|
||||||
cdef int fill_moves(Move* moves, int n, list entity_types) except -1
|
|
|
@ -1,161 +0,0 @@
|
||||||
from __future__ import unicode_literals
|
|
||||||
from cymem.cymem cimport Pool
|
|
||||||
|
|
||||||
from thinc.typedefs cimport class_t
|
|
||||||
from thinc.typedefs cimport weight_t
|
|
||||||
|
|
||||||
from ._state cimport begin_entity
|
|
||||||
from ._state cimport end_entity
|
|
||||||
from ._state cimport entity_is_open
|
|
||||||
|
|
||||||
|
|
||||||
ACTION_NAMES = ['' for _ in range(N_ACTIONS)]
|
|
||||||
ACTION_NAMES[<int>MISSING] = '?'
|
|
||||||
ACTION_NAMES[<int>SHIFT] = 'S'
|
|
||||||
ACTION_NAMES[<int>REDUCE] = 'R'
|
|
||||||
ACTION_NAMES[<int>OUT] = 'O'
|
|
||||||
|
|
||||||
|
|
||||||
cdef int set_accept_if_oracle(Move* moves, int n, State* s,
                              int* g_starts, int* g_ends, int* g_labels) except 0:
    # Flag the moves that agree with the gold entities, which are given as
    # per-token arrays of start, end and label. Returns the number of
    # accepted moves and asserts that at least one is accepted.
    #
    # Entity open (OUT is never accepted):
    #   - gold entity starts where ours did and ends at s.i  -> REDUCE only
    #   - gold entity starts where ours did and ends later   -> SHIFT only
    #   - otherwise the open entity is already wrong:
    #       - no input left, or a gold entity starts at s.i  -> REDUCE only
    #       - else                                           -> REDUCE or SHIFT
    # No entity open (REDUCE is never accepted):
    #   - a gold entity starts at s.i                        -> SHIFT, gold label
    #   - else                                               -> OUT
    cdef Move* m
    cdef int i
    cdef int g_start
    cdef int g_end
    cdef int n_accept = 0
    cdef int s_label = -1
    cdef bint accept_s = False
    cdef bint accept_r = False
    cdef bint accept_o = False
    # The gold arrays hold one entry per token, so index s.i is readable only
    # while input remains.
    cdef bint has_input = s.i < s.length
    if entity_is_open(s):
        g_start = g_starts[s.curr.start]
        g_end = g_ends[s.curr.start]
        if g_start == s.curr.start and g_end == s.i:
            accept_r = True
        elif g_start == s.curr.start and g_end > s.i:
            accept_s = True
            s_label = s.curr.label
        elif not has_input or g_starts[s.i] == s.i:
            accept_r = True
        else:
            accept_r = True
            accept_s = True
            s_label = s.curr.label
    elif has_input:
        if g_starts[s.i] == s.i:
            accept_s = True
            s_label = g_labels[s.i]
        else:
            accept_o = True
    moves[0].accept = False
    for i in range(1, n):
        m = &moves[i]
        if m.action == SHIFT:
            m.accept = accept_s and m.label == s_label
        elif m.action == REDUCE:
            m.accept = accept_r
        elif m.action == OUT:
            m.accept = accept_o
        n_accept += m.accept
    assert n_accept != 0
    return n_accept
|
|
||||||
|
|
||||||
|
|
||||||
cdef int set_accept_if_valid(Move* moves, int n, State* s) except 0:
    # Flag every move that may legally be applied in state `s` and return how
    # many there are. moves[0] is never valid.
    #   SHIFT  - needs remaining input and, if an entity is open, its label
    #   REDUCE - needs an open entity
    #   OUT    - needs remaining input and no open entity
    # NOTE(review): nothing is accepted when the input is exhausted and no
    # entity is open, so 0 is returned; with `except 0` that value is also
    # the error marker.
    cdef bint has_open = entity_is_open(s)
    cdef bint has_input = s.i < s.length
    cdef int accepted = 0
    cdef int k
    cdef Move* mv
    moves[0].accept = False
    for k in range(1, n):
        mv = &moves[k]
        if mv.action == SHIFT:
            mv.accept = has_input and not (has_open and mv.label != s.curr.label)
        elif mv.action == REDUCE:
            mv.accept = has_open
        elif mv.action == OUT:
            mv.accept = has_input and not has_open
        accepted += mv.accept
    return accepted
|
|
||||||
|
|
||||||
|
|
||||||
cdef Move* best_accepted(Move* moves, weight_t* scores, int n) except NULL:
    # Return a pointer to the highest-scoring move whose `accept` flag is set.
    # moves[0] is never considered, and the score of moves[i] is read from
    # scores[i-1]. Ties keep the lowest index.
    # Raises ValueError when no move in moves[1..n-1] is accepted.
    cdef int first_accept
    cdef int best
    cdef weight_t score
    cdef int i
    for first_accept in range(1, n):
        if moves[first_accept].accept:
            break
    else:
        # ValueError exists on Python 2 and 3; on Python 2 it is a subclass
        # of the StandardError raised before, so existing handlers still match.
        raise ValueError("No accepted move among %d classes" % n)
    best = first_accept
    score = scores[first_accept - 1]
    for i in range(first_accept + 1, n):
        if moves[i].accept and scores[i - 1] > score:
            best = i
            score = scores[i - 1]
    return &moves[best]
|
|
||||||
|
|
||||||
|
|
||||||
cdef int transition(State *s, Move* move) except -1:
|
|
||||||
s.tags[s.i] = move.clas
|
|
||||||
if move.action == OUT:
|
|
||||||
s.i += 1
|
|
||||||
elif move.action == SHIFT:
|
|
||||||
if not entity_is_open(s):
|
|
||||||
s.curr.start = s.i
|
|
||||||
s.curr.label = move.label
|
|
||||||
s.i += 1
|
|
||||||
elif move.action == REDUCE:
|
|
||||||
s.curr.end = s.i
|
|
||||||
s.ents[s.j] = s.curr
|
|
||||||
s.j += 1
|
|
||||||
s.curr.start = 0
|
|
||||||
s.curr.label = -1
|
|
||||||
s.curr.end = 0
|
|
||||||
else:
|
|
||||||
raise ValueError(move.action)
|
|
||||||
|
|
||||||
|
|
||||||
def get_n_moves(n_tags):
    """Return the number of moves for `n_tags` labels: one per label plus three."""
    return n_tags + 3
|
|
||||||
|
|
||||||
|
|
||||||
cdef int fill_moves(Move* moves, int n, list entity_types) except -1:
    # Lay out the move table:
    #   moves[0]        reserved class, action MISSING
    #   moves[1..k]     one SHIFT per entry of entity_types (k = its length)
    #   moves[k+1]      OUT
    #   moves[k+2]      REDUCE
    # `n` is the capacity of `moves`. Raises ValueError when the table would
    # not fit, instead of writing past the end of the buffer.
    cdef int n_needed = len(entity_types) + 3
    cdef int i = 0
    if n < n_needed:
        raise ValueError(
            "Move table has room for %d moves but %d are required" % (n, n_needed))
    # Label id 0 is kept for moves that carry no entity label.
    label_names = {'-': 0}
    # Reserve class 0
    moves[i].clas = i
    moves[i].action = MISSING
    moves[i].label = 0
    i += 1
    for entity_type in entity_types:
        moves[i].action = SHIFT
        moves[i].label = label_names.setdefault(entity_type, len(label_names))
        moves[i].clas = i
        i += 1
    moves[i].clas = i
    moves[i].action = OUT
    moves[i].label = 0
    i += 1
    moves[i].action = REDUCE
    moves[i].clas = i
    moves[i].label = 0
    i += 1
    return 0
|
|
||||||
|
|
||||||
|
|
||||||
cdef bint is_final(State* s):
|
|
||||||
return s.i == s.length and not entity_is_open(s)
|
|
|
@ -1,16 +0,0 @@
|
||||||
from cymem.cymem cimport Pool
|
|
||||||
|
|
||||||
from .structs cimport Move, State
|
|
||||||
|
|
||||||
|
|
||||||
cdef class PyState:
|
|
||||||
cdef Pool mem
|
|
||||||
cdef readonly list tag_names
|
|
||||||
cdef readonly int n_classes
|
|
||||||
cdef readonly dict moves_by_name
|
|
||||||
|
|
||||||
cdef Move* _moves
|
|
||||||
cdef Move* _golds
|
|
||||||
cdef State* _s
|
|
||||||
|
|
||||||
cdef Move* _get_move(self, unicode move_name) except NULL
|
|
|
@ -1,60 +0,0 @@
|
||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
from ._state cimport init_state
|
|
||||||
from ._state cimport entity_is_open
|
|
||||||
from .bilou_moves cimport fill_moves
|
|
||||||
from .bilou_moves cimport transition
|
|
||||||
from .bilou_moves cimport set_accept_if_valid, set_accept_if_oracle
|
|
||||||
from .bilou_moves import get_n_moves
|
|
||||||
from .bilou_moves import ACTION_NAMES
|
|
||||||
|
|
||||||
|
|
||||||
cdef class PyState:
    """Python-visible handle on a BILOU ``State`` and its table of moves.

    Lets the C-level helpers cimported from ``bilou_moves`` be driven by
    move name, which is mainly useful for exercising them from Python.
    """
    def __init__(self, tag_names, n_tokens):
        """Build the move table, a fresh state and an empty gold buffer.

        tag_names -- sequence of move names, one Move is created per entry;
                     must not be empty.
        n_tokens  -- handed to init_state, and also the number of gold
                     Move slots that get allocated.
        """
        self.mem = Pool()
        self.tag_names = tag_names
        self.n_classes = len(tag_names)
        assert self.n_classes != 0
        self._moves = <Move*>self.mem.alloc(self.n_classes, sizeof(Move))
        fill_moves(self._moves, tag_names)
        self._s = init_state(self.mem, n_tokens)
        self._golds = <Move*>self.mem.alloc(n_tokens, sizeof(Move))

    cdef Move* _get_move(self, unicode move_name) except NULL:
        # The move for a name lives at that name's position in tag_names;
        # .index() raises ValueError when the name is unknown.
        cdef Py_ssize_t slot = self.tag_names.index(move_name)
        return &self._moves[slot]

    def set_golds(self, list gold_names):
        """Copy the Move for each name in gold_names into the gold buffer.

        NOTE(review): nothing here checks len(gold_names) against the
        n_tokens slots allocated in __init__ -- confirm callers respect it.
        """
        cdef Move* gold_move
        for position, gold_name in enumerate(gold_names):
            gold_move = self._get_move(gold_name)
            self._golds[position] = gold_move[0]

    def transition(self, unicode move_name):
        """Apply the named move to the wrapped state."""
        cdef Move* chosen = self._get_move(move_name)
        # Resolves to the module-level ``transition`` cimported from
        # bilou_moves, not to this method.
        transition(self._s, chosen)

    def is_valid(self, unicode move_name):
        """Run set_accept_if_valid over the move table and return the named
        move's ``accept`` flag."""
        cdef Move* chosen = self._get_move(move_name)
        set_accept_if_valid(self._moves, self.n_classes, self._s)
        return chosen.accept

    def is_gold(self, unicode move_name):
        """Run set_accept_if_oracle (using the golds from set_golds) over the
        move table and return the named move's ``accept`` flag."""
        cdef Move* chosen = self._get_move(move_name)
        set_accept_if_oracle(self._moves, self._golds, self.n_classes, self._s)
        return chosen.accept

    property ent:
        # The state's ``curr`` entity struct.
        def __get__(self):
            return self._s.curr

    property n_ents:
        # The state's ``j`` counter.
        def __get__(self):
            return self._s.j

    property i:
        # The state's ``i`` counter.
        def __get__(self):
            return self._s.i

    property open_entity:
        # Whatever entity_is_open reports for the wrapped state.
        def __get__(self):
            return entity_is_open(self._s)
|
|
|
@ -1,23 +0,0 @@
|
||||||
from thinc.typedefs cimport class_t
|
|
||||||
|
|
||||||
|
|
||||||
# One entity span.
# NOTE(review): start/end look like token offsets and label like a label id;
# confirm against the code that fills them in.
cdef struct Entity:
    int start
    int end
    int label
|
|
||||||
|
|
||||||
|
|
||||||
# Working state for the entity recogniser.
cdef struct State:
    Entity curr     # entity currently being tracked
    Entity* ents    # array of Entity records
    int* tags       # array of per-token tag values -- TODO confirm indexing
    int i           # presumably the current token position -- confirm
    int j           # presumably the number of entries used in ents -- confirm
    int length      # presumably the number of tokens -- confirm
|
|
||||||
|
|
||||||
|
|
||||||
# One transition the recogniser can take.
cdef struct Move:
    class_t clas    # class id (thinc class_t) for this move
    int action      # action code -- presumably one of the BILOU actions
    int label       # label id attached to the action
    bint accept     # flag recording whether the move is currently accepted
|
|
|
@ -1,41 +0,0 @@
|
||||||
from spacy.context cimport FIELD_IDS, Token
|
|
||||||
|
|
||||||
|
|
||||||
# Context slots, taken from FIELD_IDS.
# NOTE(review): P* / N* presumably mean previous / next tokens relative to
# N0 -- confirm in spacy.context.
cpdef Token P2 = FIELD_IDS.P2
cpdef Token P1 = FIELD_IDS.P1
cpdef Token N0 = FIELD_IDS.N0
cpdef Token N1 = FIELD_IDS.N1
cpdef Token N2 = FIELD_IDS.N2


# Feature templates: each entry is a tuple of fields that together form one
# feature.  The order of the entries is preserved exactly.
TEMPLATES = (
    # Word-form and POS features.
    (N0.sic,),
    (N0.norm,),
    (N0.suffix,),
    (N0.prefix,),
    (P1.pos,),
    (P2.pos,),
    (P1.pos, P2.pos),
    (P1.pos, N0.norm),
    (P1.norm,),
    (P1.suffix,),
    (P2.norm,),
    (N1.norm,),
    (N1.suffix,),
    (N2.norm,),

    # Shape, cluster and casing-frequency features.
    (N0.shape,),
    (N0.cluster,),
    (N1.cluster,),
    (N2.cluster,),
    (P1.cluster,),
    (P2.cluster,),
    (N0.oft_upper,),
    (N0.oft_title,),

    (N0.postype,),

    # URL / number look-alike flags on the neighbouring tokens.
    (P1.like_url,),
    (N1.like_number,),
    (N1.like_url,),
)
|
|
|
@ -1,153 +0,0 @@
|
||||||
from __future__ import unicode_literals
|
|
||||||
from . import util
|
|
||||||
from . import tokens
|
|
||||||
from .en import EN
|
|
||||||
|
|
||||||
|
|
||||||
def read_gold(file_, tag_list, col):
    """Read gold-standard tags and align them to spaCy's own tokenization.

    The input consists of paragraphs separated by blank lines.  In each
    paragraph the first line is the raw text, the second line is the gold
    tokenization (not used here), and every remaining line is a
    whitespace-separated row whose column 0 is the character offset of a
    gold token, column 1 its text, and column ``col`` the tag to read.

    file_    -- open file-like object; it is read in full.
    tag_list -- list of known tags; unseen tags are appended to it (see
                _encode_pos), so the caller's list is mutated.
    col      -- index of the row column that holds the tag.

    Returns a list of ``(tokens, tag_ids)`` pairs, one per paragraph, where
    ``tokens`` is the result of ``EN.tokenize(raw)`` and ``tag_ids`` has one
    integer per token.  A token with no gold row starting at its offset is
    tagged 'NULL'.
    """
    tag_ids = dict((tag, i) for i, tag in enumerate(tag_list))
    golds = []
    for para in file_.read().strip().split('\n\n'):
        para = para.strip()
        if not para:
            continue
        lines = para.split('\n')
        raw = lines[0]
        # lines[1] is the gold tokenization, which this reader ignores.
        doc = EN.tokenize(raw)
        conll_toks = []
        for line in lines[2:]:
            pieces = line.split()
            conll_toks.append((int(pieces[0]), len(pieces[1]), pieces[col]))

        tags = []
        cursor = 0  # index of the first gold row not yet consumed
        n_rows = len(conll_toks)
        for token in doc:
            # Drop gold rows that start before this token: they can never
            # match a later token, and leaving them at the front would
            # block every subsequent alignment.
            while cursor < n_rows and conll_toks[cursor][0] < token.idx:
                cursor += 1
            if cursor < n_rows and conll_toks[cursor][0] == token.idx:
                tags.append(conll_toks[cursor][2])
                cursor += 1
            else:
                tags.append('NULL')
        # Exactly one tag is appended per token above.
        assert len(tags) == len(doc)
        tags = [_encode_pos(t, tag_ids, tag_list) for t in tags]
        golds.append((doc, tags))
    return golds
|
|
||||||
|
|
||||||
def _encode_pos(tag, tag_ids, tag_list):
|
|
||||||
if tag == '-':
|
|
||||||
return 0
|
|
||||||
if tag not in tag_ids:
|
|
||||||
tag_ids[tag] = len(tag_list)
|
|
||||||
tag_list.append(tag)
|
|
||||||
return tag_ids[tag]
|
|
||||||
|
|
||||||
|
|
||||||
# Fine-grained tag -> coarse universal tag, one "SOURCE TARGET" pair per line.
# The table is parsed once at import time instead of on every lookup.
# NOTE: '!', '#', '$' and ',' are listed twice; as with any dict built from
# pairs, the later entry wins ('!' -> PRT, '#' -> X, '$' -> NUM, ',' -> '.').
_PTB_TO_UNIV = dict(tuple(line.split()) for line in """
NULL NULL
HYPH .
ADD X
NFP .
AFX X
XX X
BES VERB
HVS VERB
GW X
! .
# .
$ .
'' .
( .
) .
, .
-LRB- .
-RRB- .
. .
: .
? .
CC CONJ
CD NUM
CD|RB X
DT DET
EX DET
FW X
IN ADP
IN|RP ADP
JJ ADJ
JJR ADJ
JJRJR ADJ
JJS ADJ
JJ|RB ADJ
JJ|VBG ADJ
LS X
MD VERB
NN NOUN
NNP NOUN
NNPS NOUN
NNS NOUN
NN|NNS NOUN
NN|SYM NOUN
NN|VBG NOUN
NP NOUN
PDT DET
POS PRT
PRP PRON
PRP$ PRON
PRP|VBP PRON
PRT PRT
RB ADV
RBR ADV
RBS ADV
RB|RP ADV
RB|VBG ADV
RN X
RP PRT
SYM X
TO PRT
UH X
VB VERB
VBD VERB
VBD|VBN VERB
VBG VERB
VBG|NN VERB
VBN VERB
VBP VERB
VBP|TO VERB
VBZ VERB
VP VERB
WDT DET
WH X
WP PRON
WP$ PRON
WRB ADV
! PRT
# X
$ NUM
& CONJ
, .
@ X
A ADJ
D DET
E X
G X
L PRT
M PRT
N NOUN
O PRON
P ADP
R ADV
S NOUN
T PRT
U X
V VERB
X PRT
Y PRT
Z NOUN
^ NOUN
~ X
`` .
EOL EOL""".strip().split('\n'))


def ptb_to_univ(tag):
    """Map a fine-grained POS tag to its coarse universal tag.

    tag -- a tag string listed in the table above.

    Returns the universal tag string.  Raises KeyError for a tag that is
    not in the table.
    """
    return _PTB_TO_UNIV[tag]
|
|
||||||
|
|
|
@ -8,7 +8,7 @@ from .structs cimport Utf8Str, UniStr
|
||||||
cdef inline void slice_unicode(UniStr* s, Py_UNICODE* chars, int start, int end) nogil:
|
cdef inline void slice_unicode(UniStr* s, Py_UNICODE* chars, int start, int end) nogil:
|
||||||
s.chars = &chars[start]
|
s.chars = &chars[start]
|
||||||
s.n = end - start
|
s.n = end - start
|
||||||
s.key = hash64(s.chars, s.n * sizeof(Py_UNICODE), 0)
|
s.key = hash64(s.chars, <int>(s.n * sizeof(Py_UNICODE)), 0)
|
||||||
|
|
||||||
|
|
||||||
cdef class StringStore:
|
cdef class StringStore:
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
from libc.stdint cimport uint8_t, uint32_t
|
from libc.stdint cimport uint8_t, uint32_t
|
||||||
|
|
||||||
from .typedefs cimport flags_t, attr_t, id_t, hash_t
|
from .typedefs cimport flags_t, attr_t, id_t, hash_t, univ_tag_t
|
||||||
|
|
||||||
|
|
||||||
cdef struct Lexeme:
|
cdef struct Lexeme:
|
||||||
|
@ -34,7 +34,7 @@ cdef struct Morphology:
|
||||||
cdef struct PosTag:
|
cdef struct PosTag:
|
||||||
Morphology morph
|
Morphology morph
|
||||||
int id
|
int id
|
||||||
int pos
|
univ_tag_t pos
|
||||||
|
|
||||||
|
|
||||||
cdef struct TokenC:
|
cdef struct TokenC:
|
||||||
|
|
|
@ -2,7 +2,7 @@ from libc.stdint cimport uint32_t
|
||||||
|
|
||||||
from cymem.cymem cimport Pool
|
from cymem.cymem cimport Pool
|
||||||
|
|
||||||
from ..tokens cimport TokenC
|
from ..structs cimport TokenC
|
||||||
|
|
||||||
|
|
||||||
cdef struct State:
|
cdef struct State:
|
||||||
|
@ -20,7 +20,8 @@ cdef int pop_stack(State *s) except -1
|
||||||
cdef int push_stack(State *s) except -1
|
cdef int push_stack(State *s) except -1
|
||||||
|
|
||||||
|
|
||||||
cdef bint has_head(const TokenC* t) nogil
|
cdef inline bint has_head(const TokenC* t) nogil:
|
||||||
|
return t.head != 0
|
||||||
|
|
||||||
|
|
||||||
cdef inline int get_idx(const State* s, const TokenC* t) nogil:
|
cdef inline int get_idx(const State* s, const TokenC* t) nogil:
|
||||||
|
@ -70,29 +71,14 @@ cdef inline bint is_final(const State *s) nogil:
|
||||||
return at_eol(s) # The stack will be attached to root anyway
|
return at_eol(s) # The stack will be attached to root anyway
|
||||||
|
|
||||||
|
|
||||||
cdef int children_in_buffer(const State *s, const int head, const int* gold) except -1
|
cdef int children_in_buffer(const State *s, const int head, int* gold) except -1
|
||||||
cdef int head_in_buffer(const State *s, const int child, const int* gold) except -1
|
cdef int head_in_buffer(const State *s, const int child, int* gold) except -1
|
||||||
cdef int children_in_stack(const State *s, const int head, const int* gold) except -1
|
cdef int children_in_stack(const State *s, const int head, int* gold) except -1
|
||||||
cdef int head_in_stack(const State *s, const int child, const int* gold) except -1
|
cdef int head_in_stack(const State *s, const int child, int* gold) except -1
|
||||||
|
|
||||||
cdef State* init_state(Pool mem, TokenC* sent, const int sent_length) except NULL
|
cdef State* init_state(Pool mem, TokenC* sent, const int sent_length) except NULL
|
||||||
|
|
||||||
|
|
||||||
cdef int count_left_kids(const TokenC* head) nogil
|
|
||||||
|
|
||||||
|
|
||||||
cdef int count_right_kids(const TokenC* head) nogil
|
|
||||||
|
|
||||||
|
|
||||||
# From https://en.wikipedia.org/wiki/Hamming_weight
|
|
||||||
cdef inline uint32_t _popcount(uint32_t x) nogil:
|
|
||||||
"""Find number of non-zero bits."""
|
|
||||||
cdef int count = 0
|
|
||||||
while x != 0:
|
|
||||||
x &= x - 1
|
|
||||||
count += 1
|
|
||||||
return count
|
|
||||||
|
|
||||||
|
|
||||||
cdef inline uint32_t _nth_significant_bit(uint32_t bits, int n) nogil:
|
cdef inline uint32_t _nth_significant_bit(uint32_t bits, int n) nogil:
|
||||||
cdef int i
|
cdef int i
|
||||||
|
|
|
@ -3,31 +3,23 @@ from libc.string cimport memmove
|
||||||
from cymem.cymem cimport Pool
|
from cymem.cymem cimport Pool
|
||||||
|
|
||||||
from ..lexeme cimport EMPTY_LEXEME
|
from ..lexeme cimport EMPTY_LEXEME
|
||||||
from ..tokens cimport TokenC
|
|
||||||
|
|
||||||
|
|
||||||
DEF PADDING = 5
|
|
||||||
DEF NON_MONOTONIC = True
|
|
||||||
|
|
||||||
|
|
||||||
cdef int add_dep(State *s, int head, int child, int label) except -1:
|
cdef int add_dep(State *s, int head, int child, int label) except -1:
|
||||||
cdef int dist = head - child
|
s.sent[child].head = head - child
|
||||||
s.sent[child].head = dist
|
|
||||||
s.sent[child].dep_tag = label
|
s.sent[child].dep_tag = label
|
||||||
# Keep a bit-vector tracking child dependencies. If a word has a child at
|
# Keep a bit-vector tracking child dependencies. If a word has a child at
|
||||||
# offset i from it, set that bit (tracking left and right separately)
|
# offset i from it, set that bit (tracking left and right separately)
|
||||||
if child > head:
|
if child > head:
|
||||||
s.sent[head].r_kids |= 1 << (-dist)
|
s.sent[head].r_kids |= 1 << (-s.sent[child].head)
|
||||||
else:
|
else:
|
||||||
s.sent[head].l_kids |= 1 << dist
|
s.sent[head].l_kids |= 1 << s.sent[child].head
|
||||||
|
|
||||||
|
|
||||||
cdef int pop_stack(State *s) except -1:
|
cdef int pop_stack(State *s) except -1:
|
||||||
assert s.stack_len >= 1
|
assert s.stack_len >= 1
|
||||||
s.stack_len -= 1
|
s.stack_len -= 1
|
||||||
s.stack -= 1
|
s.stack -= 1
|
||||||
if s.stack_len == 0 and not at_eol(s):
|
|
||||||
push_stack(s)
|
|
||||||
|
|
||||||
|
|
||||||
cdef int push_stack(State *s) except -1:
|
cdef int push_stack(State *s) except -1:
|
||||||
|
@ -36,14 +28,9 @@ cdef int push_stack(State *s) except -1:
|
||||||
s.stack[0] = s.i
|
s.stack[0] = s.i
|
||||||
s.stack_len += 1
|
s.stack_len += 1
|
||||||
s.i += 1
|
s.i += 1
|
||||||
if at_eol(s):
|
|
||||||
while s.stack_len != 0:
|
|
||||||
if not has_head(get_s0(s)):
|
|
||||||
get_s0(s).dep_tag = 0
|
|
||||||
pop_stack(s)
|
|
||||||
|
|
||||||
|
|
||||||
cdef int children_in_buffer(const State *s, int head, const int* gold) except -1:
|
cdef int children_in_buffer(const State *s, int head, int* gold) except -1:
|
||||||
# Golds holds an array of head offsets --- the head of word i is i - golds[i]
|
# Golds holds an array of head offsets --- the head of word i is i - golds[i]
|
||||||
# Iterate over the tokens of the queue, and check whether their gold head is
|
# Iterate over the tokens of the queue, and check whether their gold head is
|
||||||
# our target
|
# our target
|
||||||
|
@ -55,21 +42,20 @@ cdef int children_in_buffer(const State *s, int head, const int* gold) except -1
|
||||||
return n
|
return n
|
||||||
|
|
||||||
|
|
||||||
cdef int head_in_buffer(const State *s, const int child, const int* gold) except -1:
|
cdef int head_in_buffer(const State *s, const int child, int* gold) except -1:
|
||||||
return gold[child] >= s.i
|
return gold[child] >= s.i
|
||||||
|
|
||||||
|
|
||||||
cdef int children_in_stack(const State *s, const int head, const int* gold) except -1:
|
cdef int children_in_stack(const State *s, const int head, int* gold) except -1:
|
||||||
cdef int i
|
cdef int i
|
||||||
cdef int n = 0
|
cdef int n = 0
|
||||||
for i in range(s.stack_len):
|
for i in range(s.stack_len):
|
||||||
if gold[s.stack[-i]] == head:
|
if gold[s.stack[-i]] == head:
|
||||||
if NON_MONOTONIC or not has_head(get_s0(s)):
|
n += 1
|
||||||
n += 1
|
|
||||||
return n
|
return n
|
||||||
|
|
||||||
|
|
||||||
cdef int head_in_stack(const State *s, const int child, const int* gold) except -1:
|
cdef int head_in_stack(const State *s, const int child, int* gold) except -1:
|
||||||
cdef int i
|
cdef int i
|
||||||
for i in range(s.stack_len):
|
for i in range(s.stack_len):
|
||||||
if gold[child] == s.stack[-i]:
|
if gold[child] == s.stack[-i]:
|
||||||
|
@ -86,7 +72,7 @@ cdef const TokenC* get_left(const State* s, const TokenC* head, const int idx) n
|
||||||
if child >= s.sent:
|
if child >= s.sent:
|
||||||
return child
|
return child
|
||||||
else:
|
else:
|
||||||
return NULL
|
return s.sent - 1
|
||||||
|
|
||||||
|
|
||||||
cdef const TokenC* get_right(const State* s, const TokenC* head, const int idx) nogil:
|
cdef const TokenC* get_right(const State* s, const TokenC* head, const int idx) nogil:
|
||||||
|
@ -98,20 +84,10 @@ cdef const TokenC* get_right(const State* s, const TokenC* head, const int idx)
|
||||||
if child < (s.sent + s.sent_len):
|
if child < (s.sent + s.sent_len):
|
||||||
return child
|
return child
|
||||||
else:
|
else:
|
||||||
return NULL
|
return s.sent - 1
|
||||||
|
|
||||||
|
|
||||||
cdef bint has_head(const TokenC* t) nogil:
|
DEF PADDING = 5
|
||||||
return t.head != 0
|
|
||||||
|
|
||||||
|
|
||||||
cdef int count_left_kids(const TokenC* head) nogil:
|
|
||||||
return _popcount(head.l_kids)
|
|
||||||
|
|
||||||
|
|
||||||
cdef int count_right_kids(const TokenC* head) nogil:
|
|
||||||
return _popcount(head.r_kids)
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
cdef State* init_state(Pool mem, TokenC* sent, const int sent_length) except NULL:
|
cdef State* init_state(Pool mem, TokenC* sent, const int sent_length) except NULL:
|
||||||
|
@ -126,5 +102,4 @@ cdef State* init_state(Pool mem, TokenC* sent, const int sent_length) except NUL
|
||||||
s.stack_len = 0
|
s.stack_len = 0
|
||||||
s.i = 0
|
s.i = 0
|
||||||
s.sent_len = sent_length
|
s.sent_len = sent_length
|
||||||
push_stack(s)
|
|
||||||
return s
|
return s
|
||||||
|
|
|
@ -7,11 +7,8 @@ from ._state cimport State
|
||||||
|
|
||||||
|
|
||||||
cdef struct Transition:
|
cdef struct Transition:
|
||||||
int clas
|
|
||||||
int move
|
int move
|
||||||
int label
|
int label
|
||||||
int cost
|
|
||||||
weight_t score
|
|
||||||
|
|
||||||
|
|
||||||
cdef class TransitionSystem:
|
cdef class TransitionSystem:
|
||||||
|
@ -21,8 +18,7 @@ cdef class TransitionSystem:
|
||||||
|
|
||||||
cdef const Transition* _moves
|
cdef const Transition* _moves
|
||||||
|
|
||||||
cdef Transition best_valid(self, const weight_t* scores, const State* s) except *
|
cdef Transition best_valid(self, const weight_t* scores, const State* s) except -1
|
||||||
cdef Transition best_gold(self, Transition* guess, const weight_t* scores,
|
cdef Transition best_gold(self, const weight_t* scores, const State* s,
|
||||||
const State* s,
|
int* gold_heads, int* gold_labels) except -1
|
||||||
const int* gold_heads, const int* gold_labels) except *
|
|
||||||
cdef int transition(self, State *s, const Transition* t) except -1
|
cdef int transition(self, State *s, const Transition* t) except -1
|
||||||
|
|
|
@ -7,8 +7,6 @@ from ._state cimport head_in_stack, children_in_stack
|
||||||
|
|
||||||
from ..tokens cimport TokenC
|
from ..tokens cimport TokenC
|
||||||
|
|
||||||
DEF NON_MONOTONIC = True
|
|
||||||
|
|
||||||
|
|
||||||
cdef enum:
|
cdef enum:
|
||||||
SHIFT
|
SHIFT
|
||||||
|
@ -27,30 +25,22 @@ cdef inline bint _can_right(const State* s) nogil:
|
||||||
|
|
||||||
|
|
||||||
cdef inline bint _can_left(const State* s) nogil:
|
cdef inline bint _can_left(const State* s) nogil:
|
||||||
if NON_MONOTONIC:
|
return s.stack_len >= 1 and not has_head(get_s0(s))
|
||||||
return s.stack_len >= 1
|
|
||||||
else:
|
|
||||||
return s.stack_len >= 1 and not has_head(get_s0(s))
|
|
||||||
|
|
||||||
|
|
||||||
cdef inline bint _can_reduce(const State* s) nogil:
|
cdef inline bint _can_reduce(const State* s) nogil:
|
||||||
if NON_MONOTONIC:
|
return s.stack_len >= 2 and has_head(get_s0(s))
|
||||||
return s.stack_len >= 2
|
|
||||||
else:
|
|
||||||
return s.stack_len >= 2 and has_head(get_s0(s))
|
|
||||||
|
|
||||||
|
|
||||||
cdef int _shift_cost(const State* s, const int* gold) except -1:
|
cdef int _shift_cost(const State* s, int* gold) except -1:
|
||||||
assert not at_eol(s)
|
assert not at_eol(s)
|
||||||
cost = 0
|
cost = 0
|
||||||
cost += head_in_stack(s, s.i, gold)
|
cost += head_in_stack(s, s.i, gold)
|
||||||
cost += children_in_stack(s, s.i, gold)
|
cost += children_in_stack(s, s.i, gold)
|
||||||
if NON_MONOTONIC:
|
|
||||||
cost += gold[s.stack[0]] == s.i
|
|
||||||
return cost
|
return cost
|
||||||
|
|
||||||
|
|
||||||
cdef int _right_cost(const State* s, const int* gold) except -1:
|
cdef int _right_cost(const State* s, int* gold) except -1:
|
||||||
assert s.stack_len >= 1
|
assert s.stack_len >= 1
|
||||||
cost = 0
|
cost = 0
|
||||||
if gold[s.i] == s.stack[0]:
|
if gold[s.i] == s.stack[0]:
|
||||||
|
@ -58,12 +48,10 @@ cdef int _right_cost(const State* s, const int* gold) except -1:
|
||||||
cost += head_in_buffer(s, s.i, gold)
|
cost += head_in_buffer(s, s.i, gold)
|
||||||
cost += children_in_stack(s, s.i, gold)
|
cost += children_in_stack(s, s.i, gold)
|
||||||
cost += head_in_stack(s, s.i, gold)
|
cost += head_in_stack(s, s.i, gold)
|
||||||
if NON_MONOTONIC:
|
|
||||||
cost += gold[s.stack[0]] == s.i
|
|
||||||
return cost
|
return cost
|
||||||
|
|
||||||
|
|
||||||
cdef int _left_cost(const State* s, const int* gold) except -1:
|
cdef int _left_cost(const State* s, int* gold) except -1:
|
||||||
assert s.stack_len >= 1
|
assert s.stack_len >= 1
|
||||||
cost = 0
|
cost = 0
|
||||||
if gold[s.stack[0]] == s.i:
|
if gold[s.stack[0]] == s.i:
|
||||||
|
@ -71,17 +59,11 @@ cdef int _left_cost(const State* s, const int* gold) except -1:
|
||||||
|
|
||||||
cost += head_in_buffer(s, s.stack[0], gold)
|
cost += head_in_buffer(s, s.stack[0], gold)
|
||||||
cost += children_in_buffer(s, s.stack[0], gold)
|
cost += children_in_buffer(s, s.stack[0], gold)
|
||||||
if NON_MONOTONIC and s.stack_len >= 2:
|
|
||||||
cost += gold[s.stack[0]] == s.stack[-1]
|
|
||||||
return cost
|
return cost
|
||||||
|
|
||||||
|
|
||||||
cdef int _reduce_cost(const State* s, const int* gold) except -1:
|
cdef int _reduce_cost(const State* s, int* gold) except -1:
|
||||||
cdef int cost = 0
|
return children_in_buffer(s, s.stack[0], gold)
|
||||||
cost += children_in_buffer(s, s.stack[0], gold)
|
|
||||||
if NON_MONOTONIC:
|
|
||||||
cost += head_in_buffer(s, s.stack[0], gold)
|
|
||||||
return cost
|
|
||||||
|
|
||||||
|
|
||||||
cdef class TransitionSystem:
|
cdef class TransitionSystem:
|
||||||
|
@ -91,40 +73,38 @@ cdef class TransitionSystem:
|
||||||
right_labels.sort()
|
right_labels.sort()
|
||||||
if 'ROOT' in right_labels:
|
if 'ROOT' in right_labels:
|
||||||
right_labels.pop(right_labels.index('ROOT'))
|
right_labels.pop(right_labels.index('ROOT'))
|
||||||
|
if 'dep' in right_labels:
|
||||||
|
right_labels.pop(right_labels.index('dep'))
|
||||||
if 'ROOT' in left_labels:
|
if 'ROOT' in left_labels:
|
||||||
left_labels.pop(left_labels.index('ROOT'))
|
left_labels.pop(left_labels.index('ROOT'))
|
||||||
|
if 'dep' in left_labels:
|
||||||
|
left_labels.pop(left_labels.index('dep'))
|
||||||
self.n_moves = 2 + len(left_labels) + len(right_labels)
|
self.n_moves = 2 + len(left_labels) + len(right_labels)
|
||||||
moves = <Transition*>self.mem.alloc(self.n_moves, sizeof(Transition))
|
moves = <Transition*>self.mem.alloc(self.n_moves, sizeof(Transition))
|
||||||
cdef int i = 0
|
cdef int i = 0
|
||||||
moves[i].move = SHIFT
|
moves[i].move = SHIFT
|
||||||
moves[i].label = 0
|
moves[i].label = 0
|
||||||
moves[i].clas = i
|
|
||||||
i += 1
|
i += 1
|
||||||
moves[i].move = REDUCE
|
moves[i].move = REDUCE
|
||||||
moves[i].label = 0
|
moves[i].label = 0
|
||||||
moves[i].clas = i
|
|
||||||
i += 1
|
i += 1
|
||||||
self.label_ids = {'ROOT': 0}
|
self.label_ids = {'ROOT': 0, 'dep': -1}
|
||||||
cdef int label_id
|
cdef int label_id
|
||||||
for label_str in left_labels:
|
for label_str in left_labels:
|
||||||
label_id = self.label_ids.setdefault(label_str, len(self.label_ids))
|
label_id = self.label_ids.setdefault(label_str, len(self.label_ids))
|
||||||
moves[i].move = LEFT
|
moves[i].move = LEFT
|
||||||
moves[i].label = label_id
|
moves[i].label = label_id
|
||||||
moves[i].clas = i
|
|
||||||
i += 1
|
i += 1
|
||||||
for label_str in right_labels:
|
for label_str in right_labels:
|
||||||
label_id = self.label_ids.setdefault(label_str, len(self.label_ids))
|
label_id = self.label_ids.setdefault(label_str, len(self.label_ids))
|
||||||
moves[i].move = RIGHT
|
moves[i].move = RIGHT
|
||||||
moves[i].label = label_id
|
moves[i].label = label_id
|
||||||
moves[i].clas = i
|
|
||||||
i += 1
|
i += 1
|
||||||
self._moves = moves
|
self._moves = moves
|
||||||
|
|
||||||
cdef int transition(self, State *s, const Transition* t) except -1:
|
cdef int transition(self, State *s, const int clas) except -1:
|
||||||
|
cdef const Transition* t = &self._moves[clas]
|
||||||
if t.move == SHIFT:
|
if t.move == SHIFT:
|
||||||
# Set the dep label, in case we need it after we reduce
|
|
||||||
if NON_MONOTONIC:
|
|
||||||
get_s0(s).dep_tag = t.label
|
|
||||||
push_stack(s)
|
push_stack(s)
|
||||||
elif t.move == LEFT:
|
elif t.move == LEFT:
|
||||||
add_dep(s, s.i, s.stack[0], t.label)
|
add_dep(s, s.i, s.stack[0], t.label)
|
||||||
|
@ -133,12 +113,11 @@ cdef class TransitionSystem:
|
||||||
add_dep(s, s.stack[0], s.i, t.label)
|
add_dep(s, s.stack[0], s.i, t.label)
|
||||||
push_stack(s)
|
push_stack(s)
|
||||||
elif t.move == REDUCE:
|
elif t.move == REDUCE:
|
||||||
add_dep(s, s.stack[-1], s.stack[0], get_s0(s).dep_tag)
|
|
||||||
pop_stack(s)
|
pop_stack(s)
|
||||||
else:
|
else:
|
||||||
raise StandardError(t.move)
|
raise StandardError(t.move)
|
||||||
|
|
||||||
cdef Transition best_valid(self, const weight_t* scores, const State* s) except *:
|
cdef int best_valid(self, const weight_t* scores, const State* s) except -1:
|
||||||
cdef bint[N_MOVES] valid
|
cdef bint[N_MOVES] valid
|
||||||
valid[SHIFT] = _can_shift(s)
|
valid[SHIFT] = _can_shift(s)
|
||||||
valid[LEFT] = _can_left(s)
|
valid[LEFT] = _can_left(s)
|
||||||
|
@ -147,61 +126,59 @@ cdef class TransitionSystem:
|
||||||
|
|
||||||
cdef int best = -1
|
cdef int best = -1
|
||||||
cdef weight_t score = 0
|
cdef weight_t score = 0
|
||||||
cdef weight_t best_r_score = -9000
|
|
||||||
cdef int best_r_label = -1
|
|
||||||
cdef int i
|
cdef int i
|
||||||
for i in range(self.n_moves):
|
for i in range(self.n_moves):
|
||||||
if valid[self._moves[i].move] and (best == -1 or scores[i] > score):
|
if valid[self._moves[i].move] and (best == -1 or scores[i] > score):
|
||||||
best = i
|
best = i
|
||||||
score = scores[i]
|
score = scores[i]
|
||||||
if self._moves[i].move == RIGHT and scores[i] > best_r_score:
|
|
||||||
best_r_label = self._moves[i].label
|
|
||||||
assert best >= 0
|
assert best >= 0
|
||||||
cdef Transition t = self._moves[best]
|
return best
|
||||||
t.score = score
|
|
||||||
if t.move == SHIFT:
|
|
||||||
t.label = best_r_label
|
|
||||||
return t
|
|
||||||
|
|
||||||
cdef Transition best_gold(self, Transition* guess, const weight_t* scores,
|
cdef int best_gold(self, const weight_t* scores, const State* s,
|
||||||
const State* s,
|
int* gold_heads, int* gold_labels) except -1:
|
||||||
const int* gold_heads, const int* gold_labels) except *:
|
|
||||||
# If we can create a gold dependency, only one action can be correct
|
|
||||||
cdef int[N_MOVES] unl_costs
|
cdef int[N_MOVES] unl_costs
|
||||||
unl_costs[SHIFT] = _shift_cost(s, gold_heads) if _can_shift(s) else -1
|
unl_costs[SHIFT] = _shift_cost(s, gold_heads) if _can_shift(s) else -1
|
||||||
unl_costs[LEFT] = _left_cost(s, gold_heads) if _can_left(s) else -1
|
unl_costs[LEFT] = _left_cost(s, gold_heads) if _can_left(s) else -1
|
||||||
unl_costs[RIGHT] = _right_cost(s, gold_heads) if _can_right(s) else -1
|
unl_costs[RIGHT] = _right_cost(s, gold_heads) if _can_right(s) else -1
|
||||||
unl_costs[REDUCE] = _reduce_cost(s, gold_heads) if _can_reduce(s) else -1
|
unl_costs[REDUCE] = _reduce_cost(s, gold_heads) if _can_reduce(s) else -1
|
||||||
|
|
||||||
guess.cost = unl_costs[guess.move]
|
cdef int cost
|
||||||
cdef Transition t
|
cdef int move
|
||||||
cdef int target_label
|
cdef int label
|
||||||
cdef int i
|
|
||||||
if gold_heads[s.stack[0]] == s.i:
|
|
||||||
target_label = gold_labels[s.stack[0]]
|
|
||||||
if guess.move == LEFT:
|
|
||||||
guess.cost += guess.label != target_label
|
|
||||||
for i in range(self.n_moves):
|
|
||||||
t = self._moves[i]
|
|
||||||
if t.move == LEFT and t.label == target_label:
|
|
||||||
return t
|
|
||||||
elif gold_heads[s.i] == s.stack[0]:
|
|
||||||
target_label = gold_labels[s.i]
|
|
||||||
if guess.move == RIGHT:
|
|
||||||
guess.cost += guess.label != target_label
|
|
||||||
for i in range(self.n_moves):
|
|
||||||
t = self._moves[i]
|
|
||||||
if t.move == RIGHT and t.label == target_label:
|
|
||||||
return t
|
|
||||||
|
|
||||||
cdef int best = -1
|
cdef int best = -1
|
||||||
cdef weight_t score = -9000
|
cdef weight_t score = -9000
|
||||||
|
cdef int i
|
||||||
for i in range(self.n_moves):
|
for i in range(self.n_moves):
|
||||||
t = self._moves[i]
|
move = self._moves[i].move
|
||||||
if unl_costs[t.move] == 0 and (best == -1 or scores[i] > score):
|
label = self._moves[i].label
|
||||||
best = i
|
if unl_costs[move] == 0:
|
||||||
score = scores[i]
|
if move == SHIFT or move == REDUCE:
|
||||||
t = self._moves[best]
|
cost = 0
|
||||||
t.score = score
|
elif move == LEFT:
|
||||||
assert best >= 0
|
if gold_heads[s.stack[0]] == s.i and gold_labels[s.stack[0]] != -1:
|
||||||
return t
|
cost = label != gold_labels[s.stack[0]]
|
||||||
|
else:
|
||||||
|
cost = 0
|
||||||
|
elif move == RIGHT:
|
||||||
|
if gold_heads[s.i] == s.stack[0] and gold_labels[s.i] != -1:
|
||||||
|
cost = label != gold_labels[s.i]
|
||||||
|
else:
|
||||||
|
cost = 0
|
||||||
|
else:
|
||||||
|
raise StandardError("Unknown Move")
|
||||||
|
if cost == 0 and (best == -1 or scores[i] > score):
|
||||||
|
best = i
|
||||||
|
score = scores[i]
|
||||||
|
|
||||||
|
if best < 0:
|
||||||
|
print unl_costs[SHIFT], unl_costs[REDUCE], unl_costs[LEFT], unl_costs[RIGHT]
|
||||||
|
print s.stack_len
|
||||||
|
print has_head(get_s0(s))
|
||||||
|
print s.sent[s.stack[0]].head
|
||||||
|
print s.stack[0], s.i
|
||||||
|
print gold_heads[s.stack[0]], gold_heads[s.i]
|
||||||
|
print gold_labels[s.i]
|
||||||
|
print children_in_buffer(s, s.stack[0], gold_heads)
|
||||||
|
print head_in_buffer(s, s.stack[0], gold_heads)
|
||||||
|
raise StandardError
|
||||||
|
return best
|
||||||
|
|
|
@ -2,6 +2,8 @@
|
||||||
# cython: embedsignature=True
|
# cython: embedsignature=True
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from os import path
|
||||||
|
|
||||||
from cython.operator cimport dereference as deref
|
from cython.operator cimport dereference as deref
|
||||||
from cython.operator cimport preincrement as preinc
|
from cython.operator cimport preincrement as preinc
|
||||||
|
|
||||||
|
@ -28,6 +30,17 @@ cdef class Tokenizer:
|
||||||
self.vocab = Vocab(self.get_props)
|
self.vocab = Vocab(self.get_props)
|
||||||
self._load_special_tokenization(rules)
|
self._load_special_tokenization(rules)
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def from_dir(cls, Vocab vocab, object data_dir):
|
||||||
|
if not path.exists(data_dir):
|
||||||
|
raise IOError("Directory %s not found -- cannot load Tokenizer." % data_dir)
|
||||||
|
if not path.isdir(data_dir):
|
||||||
|
raise IOError("Path %s is a file, not a dir -- cannot load Tokenizer." % data_dir)
|
||||||
|
|
||||||
|
assert path.exists(data_dir) and path.isdir(data_dir)
|
||||||
|
rules, prefix_re, suffix_re, infix_re = util.read_lang_data(data_dir)
|
||||||
|
return cls(vocab, rules, prefix_re, suffix_re, infix_re)
|
||||||
|
|
||||||
cpdef Tokens tokens_from_list(self, list strings):
|
cpdef Tokens tokens_from_list(self, list strings):
|
||||||
cdef int length = sum([len(s) for s in strings])
|
cdef int length = sum([len(s) for s in strings])
|
||||||
cdef Tokens tokens = Tokens(self.vocab.strings, length)
|
cdef Tokens tokens = Tokens(self.vocab.strings, length)
|
||||||
|
|
|
@ -1,6 +1,26 @@
|
||||||
from libc.stdint cimport uint16_t, uint32_t, uint64_t, uintptr_t
|
from libc.stdint cimport uint16_t, uint32_t, uint64_t, uintptr_t
|
||||||
from libc.stdint cimport uint8_t
|
from libc.stdint cimport uint8_t
|
||||||
|
|
||||||
|
|
||||||
|
# Google universal tag set
|
||||||
|
cpdef enum univ_tag_t:
|
||||||
|
NO_TAG
|
||||||
|
ADJ
|
||||||
|
ADV
|
||||||
|
ADP
|
||||||
|
CONJ
|
||||||
|
DET
|
||||||
|
NOUN
|
||||||
|
NUM
|
||||||
|
PRON
|
||||||
|
PRT
|
||||||
|
VERB
|
||||||
|
X
|
||||||
|
PUNCT
|
||||||
|
EOL
|
||||||
|
N_UNIV_TAGS
|
||||||
|
|
||||||
|
|
||||||
ctypedef uint64_t hash_t
|
ctypedef uint64_t hash_t
|
||||||
ctypedef char* utf8_t
|
ctypedef char* utf8_t
|
||||||
ctypedef uint32_t attr_t
|
ctypedef uint32_t attr_t
|
||||||
|
@ -10,11 +30,3 @@ ctypedef uint16_t len_t
|
||||||
ctypedef uint16_t tag_t
|
ctypedef uint16_t tag_t
|
||||||
|
|
||||||
|
|
||||||
cdef struct Morphology:
|
|
||||||
uint8_t number
|
|
||||||
uint8_t tenspect # Tense/aspect/voice
|
|
||||||
uint8_t mood
|
|
||||||
uint8_t gender
|
|
||||||
uint8_t person
|
|
||||||
uint8_t case
|
|
||||||
uint8_t misc
|
|
||||||
|
|
|
@ -1,34 +0,0 @@
|
||||||
from preshed.maps cimport PreshMap
|
|
||||||
from cymem.cymem cimport Pool
|
|
||||||
from murmurhash.mrmr cimport hash64
|
|
||||||
|
|
||||||
from .typedefs cimport utf8_t, id_t, hash_t
|
|
||||||
|
|
||||||
|
|
||||||
# One interned string, stored as UTF-8 bytes.
cdef struct Utf8Str:
    id_t i          # the string's id (its slot in StringStore.strings)
    hash_t key      # 64-bit hash of the bytes
    utf8_t chars    # the bytes themselves; ``length`` gives their count
    int length      # number of bytes in ``chars``
|
|
||||||
|
|
||||||
|
|
||||||
# A hashed view onto a run of Py_UNICODE characters (filled in by
# slice_unicode, which points ``chars`` into an existing buffer).
cdef struct UniStr:
    Py_UNICODE* chars   # first character of the run
    size_t n            # number of characters in the run
    hash_t key          # hash of those n characters
|
|
||||||
|
|
||||||
|
|
||||||
cdef inline void slice_unicode(UniStr* s, Py_UNICODE* chars, int start, int end) nogil:
    # Make ``s`` describe chars[start:end] without copying, and hash that span.
    s.chars = &chars[start]
    s.n = end - start
    # The byte count is a size_t product; cast it explicitly to the int that
    # hash64 is given elsewhere in this code base instead of relying on an
    # implicit narrowing conversion.
    s.key = hash64(s.chars, <int>(s.n * sizeof(Py_UNICODE)), 0)
|
|
||||||
|
|
||||||
|
|
||||||
# C-level layout of the string interning table.
cdef class StringStore:
    cdef Pool mem               # allocator for ``strings`` and the copied bytes
    cdef PreshMap _map          # hash of the bytes -> slot in ``strings``
    cdef Utf8Str* strings       # array of interned strings
    cdef int size               # next slot to be filled in ``strings``
    cdef int _resize_at         # capacity of ``strings``

    # Return the entry for these bytes, creating it on first sight.
    cdef const Utf8Str* intern(self, char* chars, int length) except NULL
|
|
|
@ -1,80 +0,0 @@
|
||||||
from libc.string cimport memcpy
|
|
||||||
|
|
||||||
from murmurhash.mrmr cimport hash64
|
|
||||||
import codecs
|
|
||||||
|
|
||||||
SEPARATOR = '\n|-SEP-|\n'
|
|
||||||
|
|
||||||
|
|
||||||
cdef class StringStore:
    """Intern UTF-8 byte strings, mapping each to a stable integer id.

    Ids start at 1. Slot 0 of the backing array is never assigned, so an
    id of 0 can stand for "missing".
    """
    def __init__(self):
        self.mem = Pool()
        self._map = PreshMap()
        self._resize_at = 10000
        self.strings = <Utf8Str*>self.mem.alloc(self._resize_at, sizeof(Utf8Str))
        # Start at 1: slot 0 is the reserved "missing" entry.
        self.size = 1

    property size:
        def __get__(self):
            # Number of interned strings, not counting the reserved slot 0.
            return self.size-1

    def __getitem__(self, object string_or_id):
        """Look up in either direction.

        An integer id returns the stored byte string; IndexError is raised
        for ids below 1 or not yet assigned. A bytes or unicode string
        returns its id, interning the string first if it is new (unicode is
        encoded as UTF-8). Any other type raises TypeError.
        """
        cdef bytes byte_string
        cdef const Utf8Str* utf8str
        if isinstance(string_or_id, (int, long)):
            if not (1 <= string_or_id < self.size):
                raise IndexError(string_or_id)
            utf8str = &self.strings[<int>string_or_id]
            return utf8str.chars[:utf8str.length]
        if isinstance(string_or_id, bytes):
            utf8str = self.intern(<char*>string_or_id, len(string_or_id))
            return utf8str.i
        if isinstance(string_or_id, unicode):
            byte_string = string_or_id.encode('utf8')
            utf8str = self.intern(<char*>byte_string, len(byte_string))
            return utf8str.i
        raise TypeError(type(string_or_id))

    cdef const Utf8Str* intern(self, char* chars, int length) except NULL:
        # Return the entry for chars[:length], adding a new one if its hash
        # has not been seen. Slot 0 is deliberately wasted so that a map
        # value of 0 (NULL) can mean "not present" without offsetting ids.
        assert length != 0
        cdef hash_t key = hash64(chars, length * sizeof(char), 0)
        cdef void* value = self._map.get(key)
        cdef size_t i
        if value != NULL:
            # Already interned: the map value is the entry's index.
            i = <size_t>value
            return &self.strings[i]

        # Grow the entry array by doubling once it is full.
        if self.size == self._resize_at:
            self._resize_at *= 2
            self.strings = <Utf8Str*>self.mem.realloc(self.strings, self._resize_at * sizeof(Utf8Str))
        i = self.size
        self.strings[i].i = self.size
        self.strings[i].key = key
        # Keep a private copy of the bytes; the caller's buffer may go away.
        self.strings[i].chars = <char*>self.mem.alloc(length, sizeof(char))
        memcpy(self.strings[i].chars, chars, length)
        self.strings[i].length = length
        self._map.set(key, <void*>self.size)
        self.size += 1
        return &self.strings[i]

    def dump(self, loc):
        """Write every slot, including the empty slot 0, to the file `loc`.

        Entries are decoded as UTF-8 and joined with SEPARATOR.
        """
        cdef Utf8Str* entry
        cdef bytes raw
        decoded = []
        for idx in range(self.size):
            entry = &self.strings[idx]
            raw = entry.chars[:entry.length]
            decoded.append(raw.decode('utf8'))
        with codecs.open(loc, 'w', 'utf8') as file_:
            file_.write(SEPARATOR.join(decoded))

    def load(self, loc):
        """Intern the strings stored in the file `loc`, in file order.

        The first SEPARATOR-delimited field is skipped; it corresponds to
        the reserved slot 0 written by dump().
        """
        cdef unicode text
        cdef bytes encoded
        with codecs.open(loc, 'r', 'utf8') as file_:
            fields = file_.read().split(SEPARATOR)
        for text in fields[1:]:
            encoded = text.encode('utf8')
            self.intern(encoded, len(encoded))
|
|
|
@ -11,8 +11,7 @@ def utf8open(loc, mode='r'):
|
||||||
return codecs.open(loc, mode, 'utf8')
|
return codecs.open(loc, mode, 'utf8')
|
||||||
|
|
||||||
|
|
||||||
def read_lang_data(name):
|
def read_lang_data(data_dir):
|
||||||
data_dir = path.join(DATA_DIR, name)
|
|
||||||
with open(path.join(data_dir, 'specials.json')) as file_:
|
with open(path.join(data_dir, 'specials.json')) as file_:
|
||||||
tokenization = ujson.load(file_)
|
tokenization = ujson.load(file_)
|
||||||
prefix = read_prefix(data_dir)
|
prefix = read_prefix(data_dir)
|
||||||
|
|
|
@ -19,6 +19,17 @@ cdef class Vocab:
|
||||||
self.lexemes.push_back(&EMPTY_LEXEME)
|
self.lexemes.push_back(&EMPTY_LEXEME)
|
||||||
self.get_lex_props = get_props
|
self.get_lex_props = get_props
|
||||||
|
|
||||||
|
@classmethod
def from_dir(cls, object data_dir, object get_lex_props=None):
    """Build a Vocab from the files stored in a data directory.

    Loads the string table from ``<data_dir>/strings`` and then the
    lexemes from ``<data_dir>/lexemes``.

    data_dir: path of the directory holding the vocabulary files.
    get_lex_props: forwarded to the constructor as its first positional
        argument.

    Raises IOError if `data_dir` does not exist or is not a directory.
    """
    if not path.exists(data_dir):
        raise IOError("Directory %s not found -- cannot load Vocab." % data_dir)
    if not path.isdir(data_dir):
        raise IOError("Path %s is a file, not a dir -- cannot load Vocab." % data_dir)
    # Pass through the caller's `get_lex_props`. The previous code referred
    # to `get_props`, which is not a parameter or local of this method, so
    # the argument was never used.
    cdef Vocab self = cls(get_lex_props)
    self.strings.load(path.join(data_dir, 'strings'))
    self.load(path.join(data_dir, 'lexemes'))
    return self
|
||||||
|
|
||||||
def __len__(self):
|
def __len__(self):
|
||||||
return self.lexemes.size()
|
return self.lexemes.size()
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user