mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-11 17:56:30 +03:00
* Tmp
This commit is contained in:
parent
d11c1edf8c
commit
e1c1a4b868
135
spacy/en.pxd
135
spacy/en.pxd
|
@ -1,135 +0,0 @@
|
|||
from thinc.typedefs cimport atom_t
|
||||
|
||||
from .lang cimport Language
|
||||
from .tokens cimport Tokens
|
||||
from .tokens cimport TokenC
|
||||
|
||||
|
||||
cpdef enum en_person_t:
    # Values for the 'person' morphology property (see the VBZ entry of
    # POS_TAGS, which uses THIRD).  NO_PERSON is the first member, i.e. 0,
    # which is also the fallback used when a tag specifies no person.
    NO_PERSON
    FIRST
    SECOND
    THIRD
    NON_THIRD
|
||||
|
||||
|
||||
cpdef enum en_number_t:
    # Values for the 'number' morphology property (POS_TAGS uses PLURAL for
    # NNS / NNPS).  NO_NUMBER is 0, the "unset" value.
    NO_NUMBER
    SINGULAR
    PLURAL
    MASS
|
||||
|
||||
|
||||
cpdef enum en_gender_t:
    # Grammatical gender values.  NO_GENDER is 0, the "unset" value.
    # NOTE(review): no tag in POS_TAGS sets a gender -- presumably filled in
    # from per-word exceptions; confirm against the morphologizer.
    NO_GENDER
    MASCULINE
    FEMININE
    NEUTER
|
||||
|
||||
|
||||
cpdef enum en_case_t:
    # Values for the 'case' morphology property (POS_TAGS uses GENITIVE for
    # POS, PRP$ and WP$).  NO_CASE is 0, the "unset" value.
    NO_CASE
    NOMINATIVE
    GENITIVE
    ACCUSATIVE
    REFLEXIVE
    DEMONYM
|
||||
|
||||
|
||||
cpdef enum en_tenspect_t:
    # Values for the combined tense/aspect ('tenspect') morphology property.
    # POS_TAGS maps MD -> MODAL, VBD -> PAST, VBG -> ING, VBN -> PASSIVE and
    # VBP / VBZ -> PRESENT.  NO_TENSE is 0, the "unset" value.
    NO_TENSE
    BASE_VERB
    PRESENT
    PAST
    PASSIVE
    ING
    MODAL
|
||||
|
||||
|
||||
cpdef enum misc_t:
    # Values for the catch-all 'misc' morphology property.  POS_TAGS uses
    # COMPARATIVE / SUPERLATIVE for graded adjectives and adverbs, RELATIVE
    # for the W* tags and NAME for proper nouns.  NO_MISC is 0 ("unset").
    NO_MISC
    COMPARATIVE
    SUPERLATIVE
    RELATIVE
    NAME
|
||||
|
||||
|
||||
# Flags
cpdef enum FlagID:
    # Each member is a bit position inside a lexeme's packed flags value:
    # English.set_flags() ORs in ``predicate(string) << FLAG`` for the
    # orthographic and LIKE_* flags below.  The order therefore defines the
    # bit layout and must not be changed casually.
    IS_ALPHA
    IS_ASCII
    IS_DIGIT
    IS_LOWER
    IS_PUNCT
    IS_SPACE
    IS_TITLE
    IS_UPPER

    LIKE_URL
    LIKE_NUMBER

    # NOTE(review): the OFT_* and IN_* flags are not set by
    # English.set_flags(); presumably they come from corpus statistics and
    # gazetteers loaded elsewhere -- confirm.
    OFT_LOWER
    OFT_TITLE
    OFT_UPPER

    IN_MALES
    IN_FEMALES
    IN_SURNAMES
    IN_PLACES
    IN_GAMES
    IN_CELEBS
    IN_NAMES
|
||||
|
||||
|
||||
cpdef enum:
    # Indices into the POS tagger's context array.  The array covers a
    # five-token window (P2, P1, W, N1, N2) with eight fields per token.
    # The per-token field order (sic, cluster, shape, prefix, suffix, pos,
    # lemma, pos_type) must match the order in which _fill_from_token()
    # writes its eight consecutive slots, starting at the token's *_sic index.
    P2_sic
    P2_cluster
    P2_shape
    P2_prefix
    P2_suffix
    P2_pos
    P2_lemma
    P2_pos_type

    P1_sic
    P1_cluster
    P1_shape
    P1_prefix
    P1_suffix
    P1_pos
    P1_lemma
    P1_pos_type

    W_sic
    W_cluster
    W_shape
    W_prefix
    W_suffix
    W_pos
    W_lemma
    W_pos_type

    N1_sic
    N1_cluster
    N1_shape
    N1_prefix
    N1_suffix
    N1_pos
    N1_lemma
    N1_pos_type

    N2_sic
    N2_cluster
    N2_shape
    N2_prefix
    N2_suffix
    N2_pos
    N2_lemma
    N2_pos_type

    # Sentinel: total number of context slots (size of the context array).
    N_CONTEXT_FIELDS
|
||||
|
||||
|
||||
cdef class English(Language):
    # C-level method declarations; the implementations live in en.pyx.
    # Both return an int and use -1 as the error sentinel.
    cdef int is_base_np_end(self, const TokenC* token) except -1
    cdef int is_outside_base_np(self, const TokenC* token) except -1
|
213
spacy/en.pyx
213
spacy/en.pyx
|
@ -1,213 +0,0 @@
|
|||
# cython: profile=True
|
||||
# cython: embedsignature=True
|
||||
'''Tokenize English text, using a scheme that differs from the Penn Treebank 3
|
||||
scheme in several important respects:
|
||||
|
||||
* Whitespace is added as tokens, except for single spaces. e.g.,
|
||||
|
||||
>>> [w.string for w in EN.tokenize(u'\\nHello \\tThere')]
|
||||
[u'\\n', u'Hello', u' ', u'\\t', u'There']
|
||||
|
||||
* Contractions are normalized, e.g.
|
||||
|
||||
>>> [w.string for w in EN.tokenize(u"isn't ain't won't he's")]
|
||||
[u'is', u'not', u'are', u'not', u'will', u'not', u'he', u"__s"]
|
||||
|
||||
* Hyphenated words are split, with the hyphen preserved, e.g.:
|
||||
|
||||
>>> [w.string for w in EN.tokenize(u'New York-based')]
|
||||
[u'New', u'York', u'-', u'based']
|
||||
|
||||
Other improvements:
|
||||
|
||||
* Email addresses, URLs, European-formatted dates and other numeric entities not
|
||||
found in the PTB are tokenized correctly
|
||||
* Heuristic handling of word-final periods (PTB expects sentence boundary detection
|
||||
as a pre-process before tokenization.)
|
||||
|
||||
Take care to ensure your training and run-time data is tokenized according to the
|
||||
same scheme. Tokenization problems are a major cause of poor performance for
|
||||
NLP tools. If you're using a pre-trained model, the :py:mod:`spacy.ptb3` module
|
||||
provides a fully Penn Treebank 3-compliant tokenizer.
|
||||
'''
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from murmurhash.mrmr cimport hash64
|
||||
|
||||
cimport lang
|
||||
from .typedefs cimport hash_t, id_t, flags_t
|
||||
import orth
|
||||
from .morphology cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT, VERB
|
||||
from .morphology cimport X, PUNCT, EOL
|
||||
|
||||
from .tokens cimport Morphology
|
||||
|
||||
|
||||
DEF USE_POS_CACHE = True
|
||||
|
||||
|
||||
# Maps each fine-grained tag string to a pair:
#   (coarse universal tag, {morphology property name: enum value}).
# Property names ('misc', 'tenspect', 'number', 'case', 'person') are the
# keys the morphologizer looks up; absent properties default to 0.
# NOTE(review): 'PRP' and 'PRP$' map to NOUN although PRON is imported and
# used for 'WP' / 'WP$' -- confirm whether that is intentional.
POS_TAGS = {
    'NULL': (NO_TAG, {}),
    'EOL': (EOL, {}),
    'CC': (CONJ, {}),
    'CD': (NUM, {}),
    'DT': (DET, {}),
    'EX': (DET, {}),
    'FW': (X, {}),
    'IN': (ADP, {}),
    'JJ': (ADJ, {}),
    'JJR': (ADJ, {'misc': COMPARATIVE}),
    'JJS': (ADJ, {'misc': SUPERLATIVE}),
    'LS': (X, {}),
    'MD': (VERB, {'tenspect': MODAL}),
    'NN': (NOUN, {}),
    'NNS': (NOUN, {'number': PLURAL}),
    'NNP': (NOUN, {'misc': NAME}),
    'NNPS': (NOUN, {'misc': NAME, 'number': PLURAL}),
    'PDT': (DET, {}),
    'POS': (PRT, {'case': GENITIVE}),
    'PRP': (NOUN, {}),
    'PRP$': (NOUN, {'case': GENITIVE}),
    'RB': (ADV, {}),
    'RBR': (ADV, {'misc': COMPARATIVE}),
    'RBS': (ADV, {'misc': SUPERLATIVE}),
    'RP': (PRT, {}),
    'SYM': (X, {}),
    'TO': (PRT, {}),
    'UH': (X, {}),
    'VB': (VERB, {}),
    'VBD': (VERB, {'tenspect': PAST}),
    'VBG': (VERB, {'tenspect': ING}),
    'VBN': (VERB, {'tenspect': PASSIVE}),
    'VBP': (VERB, {'tenspect': PRESENT}),
    'VBZ': (VERB, {'tenspect': PRESENT, 'person': THIRD}),
    'WDT': (DET, {'misc': RELATIVE}),
    'WP': (PRON, {'misc': RELATIVE}),
    'WP$': (PRON, {'misc': RELATIVE, 'case': GENITIVE}),
    'WRB': (ADV, {'misc': RELATIVE}),
    '!': (PUNCT, {}),
    '#': (PUNCT, {}),
    '$': (PUNCT, {}),
    "''": (PUNCT, {}),
    "(": (PUNCT, {}),
    ")": (PUNCT, {}),
    "-LRB-": (PUNCT, {}),
    "-RRB-": (PUNCT, {}),
    ".": (PUNCT, {}),
    ",": (PUNCT, {}),
    "``": (PUNCT, {}),
    ":": (PUNCT, {}),
    "?": (PUNCT, {}),
}
|
||||
|
||||
|
||||
# Feature templates for the POS tagger.  Each template is a tuple of indices
# into the context array (see the *_sic ... *_pos_type enum); the values at
# those indices are combined into one feature.
POS_TEMPLATES = (
    (W_sic,),
    (P1_lemma, P1_pos),
    (P2_lemma, P2_pos),
    (N1_sic,),
    (N2_sic,),

    (W_suffix,),
    (W_prefix,),

    (P1_pos,),
    (P2_pos,),
    (P1_pos, P2_pos),
    (P1_pos, W_sic),
    (P1_suffix,),
    (N1_suffix,),

    (W_shape,),
    (W_cluster,),
    (N1_cluster,),
    (N2_cluster,),
    (P1_cluster,),
    (P2_cluster,),

    (W_pos_type,),
    (N1_pos_type,),
    # NOTE(review): exact duplicate of the template above -- possibly meant
    # to be (P1_pos_type,) or (N2_pos_type,).  Left as-is because changing
    # the template list changes the feature set of any trained model.
    (N1_pos_type,),
    (P1_pos, W_pos_type, N1_pos_type),
)
|
||||
|
||||
|
||||
cdef class English(Language):
    """English tokenizer, tightly coupled to lexicon.

    Attributes:
        name (unicode): The two letter code used by Wikipedia for the language.
        lexicon (Lexicon): The lexicon. Exposes the lookup method.
    """
    def get_props(self, unicode string):
        """Return the computed lexical properties of ``string``.

        The result is a dict holding the packed boolean flags under
        ``'flags'`` and the word shape under ``'dense'``.
        """
        return {'flags': self.set_flags(string), 'dense': orth.word_shape(string)}

    def set_flags(self, unicode string):
        """Compute the packed orthographic flags for ``string``.

        Each ``orth`` predicate result is shifted to the bit position given
        by the matching ``FlagID`` member and ORed into the result.
        """
        cdef flags_t flags = 0
        flags |= orth.is_alpha(string) << IS_ALPHA
        flags |= orth.is_ascii(string) << IS_ASCII
        flags |= orth.is_digit(string) << IS_DIGIT
        flags |= orth.is_lower(string) << IS_LOWER
        flags |= orth.is_punct(string) << IS_PUNCT
        flags |= orth.is_space(string) << IS_SPACE
        flags |= orth.is_title(string) << IS_TITLE
        flags |= orth.is_upper(string) << IS_UPPER

        flags |= orth.like_url(string) << LIKE_URL
        flags |= orth.like_number(string) << LIKE_NUMBER
        return flags

    def set_pos(self, Tokens tokens):
        """Tag every token in ``tokens`` in place.

        For each position the context window is filled, the tagger's
        prediction is stored in ``pos``, and the morphologizer then sets the
        lemma / morphology for that token.  Tagging is left-to-right, so the
        features of later tokens can see earlier predictions.
        """
        cdef int i
        cdef atom_t[N_CONTEXT_FIELDS] context
        cdef TokenC* t = tokens.data
        assert self.morphologizer is not None
        for i in range(tokens.length):
            fill_pos_context(context, i, t)
            t[i].pos = self.pos_tagger.predict(context)
            self.morphologizer.set_morph(i, t)

    def train_pos(self, Tokens tokens, golds):
        """Run one training pass of the tagger over ``tokens``.

        ``golds[i]`` is the gold tag for token ``i``; it is handed to the
        tagger's ``predict`` together with the context.  Returns the number
        of tokens whose predicted tag equals the gold tag.
        """
        cdef int i
        cdef atom_t[N_CONTEXT_FIELDS] context
        c = 0
        cdef TokenC* t = tokens.data
        for i in range(tokens.length):
            fill_pos_context(context, i, t)
            t[i].pos = self.pos_tagger.predict(context, [golds[i]])
            self.morphologizer.set_morph(i, t)
            c += t[i].pos == golds[i]
        return c

    cdef int is_base_np_end(self, const TokenC* token) except -1:
        # Not implemented yet: the empty body makes this return 0.
        pass

    cdef int is_outside_base_np(self, const TokenC* token) except -1:
        # Not implemented yet: the empty body makes this return 0.
        pass
|
||||
|
||||
|
||||
|
||||
cdef int fill_pos_context(atom_t* context, const int i, const TokenC* tokens) except -1:
    # Fill the five-token window around position i: each call writes the
    # eight fields of one token starting at that token's *_sic slot.
    # NOTE(review): reads tokens[i-2] .. tokens[i+2] without bounds checks,
    # so the token array is assumed to be padded by at least two entries on
    # each side -- confirm against the Tokens allocation.
    _fill_from_token(&context[P2_sic], &tokens[i-2])
    _fill_from_token(&context[P1_sic], &tokens[i-1])
    _fill_from_token(&context[W_sic], &tokens[i])
    _fill_from_token(&context[N1_sic], &tokens[i+1])
    _fill_from_token(&context[N2_sic], &tokens[i+2])
|
||||
|
||||
|
||||
cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
    # Copy one token's features into eight consecutive context slots.
    # The slot order mirrors the per-token field order of the context enum
    # (sic, cluster, shape, prefix, suffix, pos, lemma, pos_type).
    context[0] = t.lex.sic
    context[1] = t.lex.cluster
    context[2] = t.lex.shape
    context[3] = t.lex.prefix
    context[4] = t.lex.suffix
    context[5] = t.pos
    context[6] = t.lemma
    context[7] = t.lex.pos_type
|
||||
|
||||
|
||||
EN = English('en')
|
|
@ -1,44 +0,0 @@
|
|||
from libcpp.vector cimport vector
|
||||
from libcpp.pair cimport pair
|
||||
|
||||
from preshed.counter cimport count_t
|
||||
from preshed.maps cimport PreshMap
|
||||
from preshed.counter cimport PreshCounter
|
||||
from cymem.cymem cimport Pool
|
||||
|
||||
from .lang cimport Lexicon
|
||||
from .tokens cimport Tokens, TokenC
|
||||
from .typedefs cimport id_t
|
||||
from .lexeme cimport attr_id_t
|
||||
from .typedefs cimport attr_t
|
||||
from .typedefs cimport hash_t
|
||||
|
||||
from murmurhash.mrmr cimport hash64
|
||||
|
||||
|
||||
ctypedef vector[pair[id_t, count_t]] count_vector_t
|
||||
|
||||
|
||||
cdef class Index:
    # Attribute whose values are counted for each document.
    cdef attr_id_t attr_id
    # Largest attribute value seen so far (readable from Python).
    cdef readonly attr_t max_value
    # One (term, count) vector per call to count().
    cdef vector[count_vector_t] counts

    cpdef int count(self, Tokens tokens) except -1
|
||||
|
||||
|
||||
cdef class DecisionMemory:
    cdef int n_classes
    cdef Pool mem
    # Frequency of each context key.
    cdef PreshCounter _counts
    # Frequency of each hashed (context key, class) pair.
    cdef PreshCounter _class_counts
    # context key -> (class + 1), stored as a pointer-sized value.
    cdef PreshMap memos
    cdef list class_names

    cdef int inc(self, hash_t context_key, hash_t clas, count_t inc) except -1
    cdef int find_best_class(self, count_t* counts, hash_t context_key) except -1

    cdef inline int get(self, hash_t context_key) nogil:
        # Classes are stored offset by one, so the stored value minus one is
        # the class; a missing key (presumably a NULL / 0 lookup result)
        # comes back as -1.
        return <int><size_t>self.memos.get(context_key) - 1
|
||||
|
||||
|
120
spacy/index.pyx
120
spacy/index.pyx
|
@ -1,120 +0,0 @@
|
|||
"""Create a term-document matrix"""
|
||||
cimport cython
|
||||
from libc.stdint cimport int64_t
|
||||
from libc.string cimport memmove
|
||||
|
||||
from cymem.cymem cimport Address
|
||||
|
||||
from .lexeme cimport Lexeme, get_attr
|
||||
from .tokens cimport TokenC
|
||||
from .typedefs cimport hash_t
|
||||
|
||||
from preshed.maps cimport MapStruct, Cell, map_get, map_set, map_init
|
||||
from murmurhash.mrmr cimport hash64
|
||||
|
||||
|
||||
cdef class Index:
    """Accumulate per-document counts of one lexeme attribute."""
    def __init__(self, attr_id_t attr_id):
        self.attr_id = attr_id
        self.max_value = 0

    cpdef int count(self, Tokens tokens) except -1:
        """Count the attribute values of ``tokens`` and store them as one row.

        The attribute selected by ``self.attr_id`` is read from every token's
        lexeme, tallied, and the resulting (value, count) pairs are appended
        to ``self.counts`` as a new document vector.  ``self.max_value`` is
        raised to the largest value seen.
        """
        cdef PreshCounter counts = PreshCounter(2 ** 8)
        cdef attr_id_t attr_id = self.attr_id
        cdef attr_t term
        cdef int i
        for i in range(tokens.length):
            term = get_attr(tokens.data[i].lex, attr_id)
            counts.inc(term, 1)
            if term > self.max_value:
                self.max_value = term
        cdef count_t count
        cdef count_vector_t doc_counts
        for term, count in counts:
            doc_counts.push_back(pair[id_t, count_t](term, count))
        self.counts.push_back(doc_counts)
|
||||
|
||||
|
||||
cdef class DecisionMemory:
    """Count how often each class is chosen in each (hashed) context, and
    memoise the decisions that are frequent and consistent enough."""
    def __init__(self, class_names):
        self.class_names = class_names
        self.n_classes = len(class_names)
        self.mem = Pool()
        self._counts = PreshCounter()
        self._class_counts = PreshCounter()
        self.memos = PreshMap()

    def load(self, loc, thresh=50):
        """Load memoised decisions from the file at ``loc``.

        Each line holds three whitespace-separated integers: frequency,
        context key and class.  Entries with a frequency below ``thresh`` are
        skipped; ``thresh == 0`` disables the filter.
        """
        cdef:
            count_t freq
            hash_t key
            int clas
        # Context manager so the handle is closed even if a line is malformed.
        with open(loc) as file_:
            for line in file_:
                freq, key, clas = [int(p) for p in line.split()]
                if thresh == 0 or freq >= thresh:
                    # Stored offset by one so a failed lookup (0) maps to -1
                    # in get().
                    self.memos.set(key, <void*>(clas+1))

    def __getitem__(self, ids):
        """Return ``{class name: count}`` for the context ``ids``.

        ``ids`` must support ``ids[0]`` and ``ids[1]``; the two ids are
        hashed together into the context key.
        """
        cdef id_t[2] context
        cdef hash_t[2] class_context
        cdef hash_t context_key
        cdef id_t i
        # Read the ids from the argument (previously the uninitialised
        # buffer was copied onto itself, so the key was garbage).
        context[0] = ids[0]
        context[1] = ids[1]
        context_key = hash64(context, 2 * sizeof(id_t), 0)
        class_context[0] = context_key
        counts = {}
        for i, clas in enumerate(self.class_names):
            class_context[1] = <hash_t>i
            key = hash64(class_context, sizeof(hash_t) * 2, 0)
            count = self._class_counts[key]
            counts[clas] = count
        return counts

    @cython.cdivision(True)
    def iter_contexts(self, float min_acc=0.99, count_t min_freq=10):
        """Yield ``(best class count, context key, best class)`` for every
        context seen at least ``min_freq`` times whose most frequent class
        accounts for at least ``min_acc`` of its occurrences.

        The yielded triple has the same field order as the lines read by
        ``load``.
        """
        cdef Address counts_addr = Address(self.n_classes, sizeof(count_t))
        cdef count_t* counts = <count_t*>counts_addr.ptr
        cdef MapStruct* context_counts = self._counts.c_map
        cdef hash_t context_key
        cdef count_t context_freq
        cdef int best_class
        cdef float acc

        cdef int i
        for i in range(context_counts.length):
            context_key = context_counts.cells[i].key
            context_freq = <count_t>context_counts.cells[i].value
            # A zero key is skipped (presumably an empty cell).
            if context_key != 0 and context_freq >= min_freq:
                best_class = self.find_best_class(counts, context_key)
                # Cast first: both operands are C integers, so a plain '/'
                # would truncate the ratio to 0 or 1.
                acc = <float>counts[best_class] / context_freq
                if acc >= min_acc:
                    yield counts[best_class], context_key, best_class

    cdef int inc(self, hash_t context_key, hash_t clas, count_t inc) except -1:
        # Add `inc` to both the context total and the (context, class) total.
        cdef hash_t context_and_class_key
        cdef hash_t[2] context_and_class
        context_and_class[0] = context_key
        context_and_class[1] = clas
        context_and_class_key = hash64(context_and_class, 2 * sizeof(hash_t), 0)
        self._counts.inc(context_key, inc)
        self._class_counts.inc(context_and_class_key, inc)

    cdef int find_best_class(self, count_t* counts, hash_t context_key) except -1:
        # Write the per-class counts for `context_key` into `counts` (which
        # must have room for n_classes entries) and return the index of the
        # most frequent class.  Ties go to the highest class index because of
        # the `>=` comparison.
        cdef hash_t[2] unhashed_key
        unhashed_key[0] = context_key

        cdef hash_t key
        cdef int clas
        cdef count_t count
        # Initialised so the result is defined even when n_classes == 0.
        cdef int best = 0
        cdef count_t mode = 0
        for clas in range(self.n_classes):
            unhashed_key[1] = <hash_t>clas
            key = hash64(unhashed_key, sizeof(hash_t) * 2, 0)
            count = self._class_counts[key]
            counts[clas] = count
            if count >= mode:
                mode = count
                best = clas
        return best
|
|
@ -1,90 +0,0 @@
|
|||
from os import path
|
||||
|
||||
|
||||
# Suffix-rewrite rules used by lemmatize().  Each rule is (old_suffix,
# new_suffix): a word ending in old_suffix yields the candidate formed by
# replacing that suffix with new_suffix.  Every matching rule is tried.
NOUN_RULES = (
    ('s', ''),
    ('ses', 's'),
    ('ves', 'f'),
    ('xes', 'x'),
    ('zes', 'z'),
    ('ches', 'ch'),
    ('shes', 'sh'),
    ('men', 'man'),
    ('ies', 'y')
)


VERB_RULES = (
    ("s", ""),
    ("ies", "y"),
    ("es", "e"),
    ("es", ""),
    ("ed", "e"),
    ("ed", ""),
    ("ing", "e"),
    ("ing", "")
)


ADJ_RULES = (
    ("er", ""),
    ("est", ""),
    ("er", "e"),
    ("est", "e")
)
|
||||
|
||||
|
||||
class Lemmatizer(object):
    """Rule-based lemmatizer backed by WordNet-style dictionary files."""

    def __init__(self, wn_dict_dir):
        """Load ``index.<pos>`` and ``<pos>.exc`` from ``wn_dict_dir`` for
        the adj, adv, noun and verb parts of speech."""
        self.index = {}
        self.exc = {}
        for pos in ['adj', 'adv', 'noun', 'verb']:
            self.index[pos] = read_index(path.join(wn_dict_dir, 'index.%s' % pos))
            self.exc[pos] = read_exc(path.join(wn_dict_dir, '%s.exc' % pos))

    def noun(self, string):
        """Return the set of candidate noun lemmas for ``string``."""
        return lemmatize(string, self.index['noun'], self.exc['noun'], NOUN_RULES)

    def verb(self, string):
        """Return the set of candidate verb lemmas for ``string``."""
        return lemmatize(string, self.index['verb'], self.exc['verb'], VERB_RULES)

    def adj(self, string):
        """Return the set of candidate adjective lemmas for ``string``."""
        return lemmatize(string, self.index['adj'], self.exc['adj'], ADJ_RULES)
|
||||
|
||||
|
||||
def lemmatize(string, index, exceptions, rules):
    """Return the set of candidate lemmas for ``string``.

    Args:
        string: The word form; it is lower-cased before lookup.
        index: Container of known lemmas (membership is tested with ``in``).
        exceptions: Mapping from irregular form to a sequence of lemmas.
        rules: Iterable of ``(old_suffix, new_suffix)`` pairs.

    Returns:
        A set holding the word itself if it is in ``index``, any listed
        exceptions, and every rule rewrite that lands in ``index``.  If
        nothing matches, the set holds just the lower-cased word.
    """
    string = string.lower()
    forms = []
    if string in index:
        forms.append(string)
    forms.extend(exceptions.get(string, []))
    for old, new in rules:
        if string.endswith(old):
            # Explicit length arithmetic keeps an empty `old` suffix safe
            # (string[:-0] would be the empty string).
            form = string[:len(string) - len(old)] + new
            if form in index:
                forms.append(form)
    if not forms:
        forms.append(string)
    return set(forms)
|
||||
|
||||
|
||||
def read_index(loc):
    """Read the set of single-word entries from the index file at ``loc``.

    Lines starting with a space are skipped, as are blank lines.  The first
    whitespace-separated field of every other line is the word; entries
    containing an underscore are ignored.
    """
    index = set()
    # Context manager so the file handle is always closed.
    with open(loc) as file_:
        for line in file_:
            if line.startswith(' '):
                continue
            pieces = line.split()
            if not pieces:
                # Blank line: nothing to index (previously an IndexError).
                continue
            word = pieces[0]
            if word.count('_') == 0:
                index.add(word)
    return index
|
||||
|
||||
|
||||
def read_exc(loc):
    """Read the exception table from the file at ``loc``.

    Lines starting with a space are skipped, as are blank lines.  On every
    other line the first whitespace-separated field is the irregular form
    and the remaining fields are its lemmas.

    Returns:
        dict mapping each irregular form to a tuple of lemmas.
    """
    exceptions = {}
    # Context manager so the file handle is always closed.
    with open(loc) as file_:
        for line in file_:
            if line.startswith(' '):
                continue
            pieces = line.split()
            if not pieces:
                # Blank line: no entry (previously an IndexError).
                continue
            exceptions[pieces[0]] = tuple(pieces[1:])
    return exceptions
|
|
@ -36,11 +36,11 @@ cdef struct _Cached:
|
|||
cdef class Morphologizer:
|
||||
"""Given a POS tag and a Lexeme, find its lemma and morphological analysis.
|
||||
"""
|
||||
def __init__(self, StringStore strings, object lemmatizer, **kwargs):
|
||||
def __init__(self, StringStore strings, object lemmatizer,
|
||||
irregulars=None, tag_map=None, tag_names=None):
|
||||
self.mem = Pool()
|
||||
self.strings = strings
|
||||
tag_map = kwargs['tag_map']
|
||||
self.tag_names = kwargs['tag_names']
|
||||
self.tag_names = tag_names
|
||||
self.lemmatizer = lemmatizer
|
||||
self._cache = PreshMapArray(len(self.tag_names))
|
||||
self.tags = <PosTag*>self.mem.alloc(len(self.tag_names), sizeof(PosTag))
|
||||
|
@ -55,9 +55,16 @@ cdef class Morphologizer:
|
|||
self.tags[i].morph.person = props.get('person', 0)
|
||||
self.tags[i].morph.case = props.get('case', 0)
|
||||
self.tags[i].morph.misc = props.get('misc', 0)
|
||||
#if path.exists(path.join(data_dir, 'morphs.json')):
|
||||
# with open(path.join(data_dir, 'morphs.json')) as file_:
|
||||
# self.load_exceptions(json.load(file_))
|
||||
if irregulars is not None:
|
||||
self.load_exceptions(irregulars)
|
||||
|
||||
@classmethod
|
||||
def from_dir(cls, StringStore strings, object lemmatizer, data_dir):
|
||||
tag_map = None
|
||||
irregulars = None
|
||||
tag_names = None
|
||||
return cls(strings, lemmatizer, tag_map=tag_map, irregulars=irregulars,
|
||||
tag_names=tag_names)
|
||||
|
||||
cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1:
|
||||
if self.lemmatizer is None:
|
||||
|
@ -86,7 +93,6 @@ cdef class Morphologizer:
|
|||
cached.lemma = self.lemmatize(tag.pos, tokens[i].lex)
|
||||
cached.morph = tag.morph
|
||||
self._cache.set(tag.id, tokens[i].lex.sic, <void*>cached)
|
||||
|
||||
tokens[i].lemma = cached.lemma
|
||||
tokens[i].morph = cached.morph
|
||||
|
||||
|
|
|
@ -1,169 +0,0 @@
|
|||
from spacy.context cimport FIELD_IDS, Token
|
||||
|
||||
|
||||
# Module-level aliases for the nine token slots of FIELD_IDS, so the feature
# templates can be written as e.g. ``N0.sic``.
# NOTE(review): presumably P* are the preceding tokens, N0 the current token
# and N1..N4 the following tokens -- confirm against spacy.context.
cdef Token P4 = FIELD_IDS.P4
cdef Token P3 = FIELD_IDS.P3
cdef Token P2 = FIELD_IDS.P2
cdef Token P1 = FIELD_IDS.P1
cdef Token N0 = FIELD_IDS.N0
cdef Token N1 = FIELD_IDS.N1
cdef Token N2 = FIELD_IDS.N2
cdef Token N3 = FIELD_IDS.N3
cdef Token N4 = FIELD_IDS.N4
|
||||
|
||||
"""
|
||||
TEMPLATES = (
|
||||
(N0.sic,),
|
||||
(N0.cluster,),
|
||||
|
||||
(P1.pos,),
|
||||
(P1.sic,),
|
||||
|
||||
(N1.norm,),
|
||||
(N1.pos,),
|
||||
|
||||
(P1.ner,),
|
||||
(P2.ner,),
|
||||
|
||||
(N0.cluster,),
|
||||
(P1.cluster,),
|
||||
(N1.cluster,),
|
||||
|
||||
(N0.is_alpha,),
|
||||
(N0.is_digit,),
|
||||
(N0.is_title,),
|
||||
(N0.is_upper,),
|
||||
|
||||
(N0.is_title, N0.oft_title),
|
||||
(N0.is_upper, N0.oft_upper),
|
||||
|
||||
(P1.cluster, N0.norm),
|
||||
(N0.norm, N1.cluster),
|
||||
|
||||
(P1.ner, N0.pos),
|
||||
(P2.ner, P1.ner, N0.pos),
|
||||
|
||||
(P2.pos, P1.pos, N0.sic),
|
||||
(N0.sic, N1.pos, N2.pos)
|
||||
)
|
||||
"""
|
||||
|
||||
# Feature-template groups.  Each template is a tuple of field ids; TEMPLATES
# at the bottom is the concatenation of all groups.
LOCAL = (
    (N0.sic,),
    (P1.sic,),
    (N1.sic,),
    (P2.sic,),
    (N2.sic,),
    (P3.sic,),
    (N3.sic,),
    (P4.sic,),
    (N4.sic,),

    (P1.sic, N0.sic,),
    (N0.sic, N1.sic),

    (N0.prefix,),
    (N0.suffix,),

    (P1.shape,),
    (N0.shape,),
    (N1.shape,),
    (P1.shape, N0.shape,),
    # NOTE(review): same two fields as the previous template in swapped
    # order -- possibly meant to be (N0.shape, N1.shape).  Left unchanged
    # because it affects the trained feature set.
    (N0.shape, P1.shape,),
    (P1.shape, N0.shape, N1.shape),
    (N2.shape,),
    (P2.shape,),
    (P3.shape,),
    (N3.shape,),
    (P4.shape,),
    (N4.shape,),

    (P2.norm, P1.norm, N0.norm),
    (P1.norm, N0.norm, N1.norm),
    (N0.norm, N1.norm, N2.norm)
)

BOOLS = (
    (N0.is_title,),
)


HISTORY = (
    (P1.ner,),
    (P1.ner, N0.sic,),
    (P2.ner,),
    (P2.ner, P1.ner),
    (P2.ner, P1.ner, N0.sic),
    (P2.pos, P1.ner, N0.pos),
    (P2.ner, P1.pos, N0.pos),
    (P3.ner,),
    (P4.ner,),
)

POS = (
    (P4.pos,),
    (P3.pos,),
    (P2.pos,),
    (P1.pos,),
    (N0.pos,),
    (N1.pos,),
    (N2.pos,),
    (N3.pos,),
    (N4.pos,),

    (P1.pos, N0.pos),
    (N0.pos, N1.pos),
    (P2.pos, P1.pos, N0.pos),
    (P1.pos, N0.pos, N1.pos),
    (N0.pos, N1.pos, N2.pos)
)

CLUSTERS = (
    (P4.cluster,),
    (P3.cluster,),
    (P2.cluster,),
    (P1.cluster,),
    (N0.cluster,),
    (N1.cluster,),
    (N2.cluster,),
    (N3.cluster,),
    (N4.cluster,),

    (P1.cluster, N0.cluster),
    (N0.cluster, N1.cluster),
)


CLUSTER_POS = (
    (P1.cluster, N0.pos),
    # NOTE(review): same two fields as the previous template in swapped
    # order -- possibly meant to be (P1.pos, N0.cluster).  Left unchanged.
    (N0.pos, P1.cluster),
    (N0.cluster, N1.pos),
    (N0.pos, N1.cluster)
)


GAZ = (
    (N0.in_males,),
    (N0.in_females,),
    (N0.in_surnames,),
    (N0.in_places,),
    (N0.in_games,),
    (N0.in_celebs,),
    (N0.in_names,),
    (P1.in_males,),
    (P1.in_females,),
    (P1.in_surnames,),
    (P1.in_places,),
    (P1.in_games,),
    (P1.in_celebs,),
    (P1.in_names,),
    (N1.in_males,),
    (N1.in_females,),
    (N1.in_surnames,),
    (N1.in_places,),
    (N1.in_games,),
    (N1.in_celebs,),
    (N1.in_names,),
)

TEMPLATES = LOCAL + HISTORY + CLUSTERS + POS + CLUSTER_POS + GAZ + BOOLS
|
|
@ -1,15 +0,0 @@
|
|||
from cymem.cymem cimport Pool
|
||||
from .structs cimport State, Entity, Move
|
||||
|
||||
# Prototypes for the NER parse-state helpers.  Functions declared
# `except -1` / `except NULL` use that return value as their error sentinel.
cdef int begin_entity(State* s, label) except -1

cdef int end_entity(State* s) except -1

cdef State* init_state(Pool mem, int sent_length) except NULL
cdef int copy_state(Pool mem, State* dest, State* source) except -1

cdef bint entity_is_open(State *s) except -1

cdef int entity_is_sunk(State *s, Move* golds) except -1

cdef int is_done(State* s) except -1
|
|
@ -1,54 +0,0 @@
|
|||
from libc.string cimport memcpy
|
||||
|
||||
|
||||
cdef int begin_entity(State* s, label) except -1:
    # Open a new entity at the current token: advance to the next entity
    # slot and record a one-token span [s.i, s.i + 1) tagged with `label`.
    # The slot index is incremented *before* writing.
    s.j += 1
    s.ents[s.j].start = s.i
    s.ents[s.j].tag = label
    s.ents[s.j].end = s.i + 1
|
||||
|
||||
|
||||
cdef int end_entity(State* s) except -1:
    # Extend the current entity so that it ends just after the current token.
    s.ents[s.j].end = s.i + 1
|
||||
|
||||
|
||||
cdef State* init_state(Pool mem, int sent_length) except NULL:
    # Allocate a State from `mem`, with entity and tag arrays sized for
    # `sent_length` tokens, and return it.
    cdef State* s = <State*>mem.alloc(1, sizeof(State))
    s.ents = <Entity*>mem.alloc(sent_length, sizeof(Entity))
    s.tags = <int*>mem.alloc(sent_length, sizeof(int))
    s.length = sent_length
    # The return was missing: the function fell off the end and yielded
    # NULL, which `except NULL` treats as the error sentinel.
    return s
|
||||
|
||||
|
||||
cdef bint entity_is_open(State *s) except -1:
    # True when the current entity slot has a non-zero start index.
    # NOTE(review): an entity that begins at token 0 also has start == 0 and
    # would be reported as not open, and nothing visible here resets `start`
    # when an entity is closed -- confirm the intended convention.
    return s.ents[s.j].start != 0
|
||||
|
||||
|
||||
cdef int entity_is_sunk(State *s, Move* golds) except -1:
    # Report whether the currently open entity can no longer match the gold
    # annotation.  With no open entity the answer is False; the open-entity
    # case is not implemented yet and raises.
    if not entity_is_open(s):
        return False
    # NotImplementedError instead of StandardError: StandardError does not
    # exist on Python 3, and on Python 2 NotImplementedError is still a
    # StandardError subclass, so existing handlers keep working.
    raise NotImplementedError
    # TODO: intended logic, kept for reference:
    #cdef Entity* ent = &s.ents[s.j]
    #cdef Move* gold = &golds[ent.start]
    #if gold.action != BEGIN and gold.action != UNIT:
    #    return True
    #elif gold.label != ent.label:
    #    return True
    #else:
    #    return False
|
||||
|
||||
|
||||
cdef int copy_state(Pool mem, State* dest, State* source) except -1:
    '''Copy state source into state dest.

    dest's entity and tag arrays are grown through mem when source is longer
    than dest; the arrays and the scalar fields (length, i, j, curr) are then
    copied over.
    '''
    if source.length > dest.length:
        dest.ents = <Entity*>mem.realloc(dest.ents, source.length * sizeof(Entity))
        dest.tags = <int*>mem.realloc(dest.tags, source.length * sizeof(int))
    memcpy(dest.ents, source.ents, source.length * sizeof(Entity))
    memcpy(dest.tags, source.tags, source.length * sizeof(int))
    dest.length = source.length
    dest.i = source.i
    dest.j = source.j
    dest.curr = source.curr
|
||||
|
||||
|
||||
cdef int is_done(State* s) except -1:
    # Finished once every token has been consumed and no entity is open.
    return s.i >= s.length and not entity_is_open(s)
|
|
@ -1,8 +0,0 @@
|
|||
from cymem.cymem cimport Pool
|
||||
|
||||
cdef class NERAnnotation:
    # Owns the three per-token arrays below.
    cdef Pool mem
    # Per-token entity start / end / label; -1 for tokens outside entities.
    cdef int* starts
    cdef int* ends
    cdef int* labels
    # The (start, end, label) triples the annotation was built from.
    cdef readonly list entities
|
|
@ -1,94 +0,0 @@
|
|||
from libc.string cimport memset
|
||||
|
||||
|
||||
cdef class NERAnnotation:
    """Gold-standard entity annotation for one sentence."""
    def __init__(self, entities, length, entity_types):
        """Build per-token lookup arrays from ``entities``.

        Args:
            entities: List of ``(start, end, label)`` integer triples, with
                ``end`` exclusive.
            length: Number of tokens in the sentence.
            entity_types: Accepted for interface symmetry with
                ``from_bilous``; not used here.
        """
        self.mem = Pool()
        self.starts = <int*>self.mem.alloc(length, sizeof(int))
        self.ends = <int*>self.mem.alloc(length, sizeof(int))
        self.labels = <int*>self.mem.alloc(length, sizeof(int))
        self.entities = entities
        # Setting every byte to 0xFF makes each int -1 ("not in an entity").
        memset(self.starts, -1, sizeof(int) * length)
        memset(self.ends, -1, sizeof(int) * length)
        memset(self.labels, -1, sizeof(int) * length)

        cdef int start, end, label
        for start, end, label in entities:
            for i in range(start, end):
                self.starts[i] = start
                self.ends[i] = end
                self.labels[i] = label

    @classmethod
    def from_bilous(cls, tag_strs, entity_types):
        """Build an annotation from BILOU tag strings such as ``'B-PER'``.

        ``'O'`` and ``'-'`` tags are skipped.  Labels are converted to their
        index in ``entity_types``; a label not seen before is appended to
        ``entity_types`` (the list is mutated in place).

        Raises:
            AssertionError: If the B / L / U moves are not properly nested.
        """
        entities = []
        start = None
        for i, tag_str in enumerate(tag_strs):
            if tag_str == 'O' or tag_str == '-':
                continue
            # maxsplit=1 so labels that themselves contain '-' stay intact.
            move, label_str = tag_str.split('-', 1)
            try:
                label = entity_types.index(label_str)
            except ValueError:
                # list.index raises for a missing value (it never returns
                # -1), so register the new label *string* here.
                label = len(entity_types)
                entity_types.append(label_str)
            if move == 'U':
                assert start is None
                entities.append((i, i+1, label))
            elif move == 'B':
                assert start is None
                start = i
            elif move == 'L':
                assert start is not None
                entities.append((start, i+1, label))
                start = None
        return cls(entities, len(tag_strs), entity_types)
|
||||
|
||||
|
||||
|
||||
def read_iob(file_, entity_types, create_tokens):
    """Read IOB-tagged sentences from the open file ``file_``.

    Sentences are separated by blank lines; every token line must hold four
    whitespace-separated fields (word, POS, chunk, NER tag).  Blocks starting
    with ``-DOCSTART-`` are skipped.

    Args:
        file_: Object with a ``read()`` method returning the whole text.
        entity_types: List of label strings, passed on to
            ``NERAnnotation.from_bilous`` for each sentence.
        create_tokens: Callable turning a list of word strings into tokens.

    Returns:
        List of ``(tokens, NERAnnotation)`` pairs, one per sentence.
    """
    sent_strs = file_.read().strip().split('\n\n')
    sents = []
    for sent_str in sent_strs:
        if sent_str.startswith('-DOCSTART-'):
            continue
        words = []
        iob = []
        for token_str in sent_str.split('\n'):
            word, pos, chunk, ner = token_str.split()
            words.append(word)
            iob.append(ner)
        bilou = iob_to_bilou(iob)
        tokens = create_tokens(words)
        sents.append((tokens, NERAnnotation.from_bilous(bilou, entity_types)))
    return sents
|
||||
|
||||
|
||||
def iob_to_bilou(tags):
|
||||
out = []
|
||||
curr_label = None
|
||||
tags = list(tags)
|
||||
while tags:
|
||||
out.extend(_consume_os(tags))
|
||||
out.extend(_consume_ent(tags))
|
||||
return out
|
||||
|
||||
def _consume_os(tags):
|
||||
while tags and tags[0] == 'O':
|
||||
yield tags.pop(0)
|
||||
|
||||
def _consume_ent(tags):
|
||||
if not tags:
|
||||
return []
|
||||
target = tags.pop(0).replace('B', 'I')
|
||||
length = 1
|
||||
while tags and tags[0] == target:
|
||||
length += 1
|
||||
tags.pop(0)
|
||||
label = target[2:]
|
||||
if length == 1:
|
||||
return ['U-' + label]
|
||||
else:
|
||||
start = 'B-' + label
|
||||
end = 'L-' + label
|
||||
middle = ['I-%s' % label for _ in range(1, length - 1)]
|
||||
return [start] + middle + [end]
|
|
@ -1,27 +0,0 @@
|
|||
from cymem.cymem cimport Pool
|
||||
|
||||
from thinc.typedefs cimport class_t
|
||||
from thinc.typedefs cimport weight_t
|
||||
|
||||
from .structs cimport State, Move
|
||||
|
||||
|
||||
cpdef enum ActionType:
    # Transition actions of the NER tagger.  ACTION_NAMES (in the .pyx) maps
    # them to '?', 'B', 'I', 'L', 'U' and 'O' respectively.
    MISSING
    BEGIN
    IN
    LAST
    UNIT
    OUT
    # Sentinel: number of actions, used to size per-action tables.
    N_ACTIONS
|
||||
|
||||
|
||||
# Prototypes for the transition-system helpers.  Note the differing error
# sentinels: the set_accept_* functions use 0, best_accepted uses NULL and
# the others use -1.
cdef int set_accept_if_oracle(Move* moves, Move* golds, int n, State* s) except 0

cdef int set_accept_if_valid(Move* moves, int n, State* s) except 0

cdef Move* best_accepted(Move* moves, weight_t* scores, int n) except NULL

cdef int transition(State *s, Move* m) except -1

cdef int fill_moves(Move* moves, list tag_names) except -1
|
|
@ -1,207 +0,0 @@
|
|||
from __future__ import unicode_literals
|
||||
|
||||
from ._state cimport begin_entity
|
||||
from ._state cimport end_entity
|
||||
from ._state cimport entity_is_open
|
||||
from ._state cimport entity_is_sunk
|
||||
|
||||
|
||||
ACTION_NAMES = ['' for _ in range(N_ACTIONS)]
|
||||
ACTION_NAMES[<int>MISSING] = '?'
|
||||
ACTION_NAMES[<int>BEGIN] = 'B'
|
||||
ACTION_NAMES[<int>IN] = 'I'
|
||||
ACTION_NAMES[<int>LAST] = 'L'
|
||||
ACTION_NAMES[<int>UNIT] = 'U'
|
||||
ACTION_NAMES[<int>OUT] = 'O'
|
||||
|
||||
|
||||
cdef bint can_begin(State* s, int label):
|
||||
return not entity_is_open(s)
|
||||
|
||||
|
||||
cdef bint can_in(State* s, int label):
|
||||
return entity_is_open(s) and s.curr.label == label
|
||||
|
||||
|
||||
cdef bint can_last(State* s, int label):
|
||||
return entity_is_open(s) and s.curr.label == label
|
||||
|
||||
|
||||
cdef bint can_unit(State* s, int label):
|
||||
return not entity_is_open(s)
|
||||
|
||||
|
||||
cdef bint can_out(State* s, int label):
|
||||
return not entity_is_open(s)
|
||||
|
||||
|
||||
cdef bint is_oracle(ActionType act, int tag, ActionType g_act, int g_tag,
|
||||
ActionType next_act, bint is_sunk):
|
||||
if g_act == MISSING:
|
||||
return True
|
||||
if act == BEGIN:
|
||||
if g_act == BEGIN:
|
||||
# B, Gold B --> Label match
|
||||
return tag == g_tag
|
||||
else:
|
||||
# B, Gold I --> False (P)
|
||||
# B, Gold L --> False (P)
|
||||
# B, Gold O --> False (P)
|
||||
# B, Gold U --> False (P)
|
||||
return False
|
||||
elif act == IN:
|
||||
if g_act == BEGIN:
|
||||
# I, Gold B --> True (P of bad open entity sunk, R of this entity sunk)
|
||||
return True
|
||||
elif g_act == IN:
|
||||
# I, Gold I --> True (label forced by prev, if mismatch, P and R both sunk)
|
||||
return True
|
||||
elif g_act == LAST:
|
||||
# I, Gold L --> True iff this entity sunk and next tag == O
|
||||
return is_sunk and (next_act == OUT or next_act == MISSING)
|
||||
elif g_act == OUT:
|
||||
# I, Gold O --> True iff next tag == O
|
||||
return next_act == OUT or next_act == MISSING
|
||||
elif g_act == UNIT:
|
||||
# I, Gold U --> True iff next tag == O
|
||||
return next_act == OUT
|
||||
elif act == LAST:
|
||||
if g_act == BEGIN:
|
||||
# L, Gold B --> True
|
||||
return True
|
||||
elif g_act == IN:
|
||||
# L, Gold I --> True iff this entity sunk
|
||||
return is_sunk
|
||||
elif g_act == LAST:
|
||||
# L, Gold L --> True
|
||||
return True
|
||||
elif g_act == OUT:
|
||||
# L, Gold O --> True
|
||||
return True
|
||||
elif g_act == UNIT:
|
||||
# L, Gold U --> True
|
||||
return True
|
||||
elif act == OUT:
|
||||
if g_act == BEGIN:
|
||||
# O, Gold B --> False
|
||||
return False
|
||||
elif g_act == IN:
|
||||
# O, Gold I --> True
|
||||
return True
|
||||
elif g_act == LAST:
|
||||
# O, Gold L --> True
|
||||
return True
|
||||
elif g_act == OUT:
|
||||
# O, Gold O --> True
|
||||
return True
|
||||
elif g_act == UNIT:
|
||||
# O, Gold U --> False
|
||||
return False
|
||||
elif act == UNIT:
|
||||
if g_act == UNIT:
|
||||
# U, Gold U --> True iff tag match
|
||||
return tag == g_tag
|
||||
else:
|
||||
# U, Gold B --> False
|
||||
# U, Gold I --> False
|
||||
# U, Gold L --> False
|
||||
# U, Gold O --> False
|
||||
return False
|
||||
|
||||
|
||||
cdef int set_accept_if_valid(Move* moves, int n_classes, State* s) except 0:
|
||||
cdef int n_accept = 0
|
||||
cdef Move* m
|
||||
moves[0].accept = False
|
||||
for i in range(1, n_classes):
|
||||
m = &moves[i]
|
||||
if m.action == BEGIN:
|
||||
m.accept = can_begin(s, m.label)
|
||||
elif m.action == IN:
|
||||
m.accept = can_in(s, m.label)
|
||||
elif m.action == LAST:
|
||||
m.accept = can_last(s, m.label)
|
||||
elif m.action == UNIT:
|
||||
m.accept = can_unit(s, m.label)
|
||||
elif m.action == OUT:
|
||||
m.accept = can_out(s, m.label)
|
||||
n_accept += m.accept
|
||||
assert n_accept != 0
|
||||
return n_accept
|
||||
|
||||
|
||||
cdef int set_accept_if_oracle(Move* moves, Move* golds, int n_classes, State* s) except 0:
    """Flag the moves that are both valid in state ``s`` and oracle-approved.

    Validity is computed first with set_accept_if_valid(); each move that is
    still accepted is then re-judged by is_oracle() against the gold move for
    the current token ``golds[s.i]``. Index 0 of ``moves`` is not re-judged.

    Returns the number of moves left accepted; asserts that it is non-zero.
    """
    cdef Move* g = &golds[s.i]
    # Peek at the following gold move only when one exists; on the last token
    # there is nothing after it, so treat the continuation as OUT.
    # NOTE(review): assumes ``golds`` holds s.length entries -- confirm
    # against the caller that allocates it.
    cdef ActionType next_act = <ActionType>golds[s.i + 1].action if s.i + 1 < s.length else OUT
    cdef bint is_sunk = entity_is_sunk(s, golds)
    cdef Move* m
    cdef int i
    cdef int n_accept = 0
    set_accept_if_valid(moves, n_classes, s)
    for i in range(1, n_classes):
        m = &moves[i]
        if not m.accept:
            continue
        m.accept = is_oracle(<ActionType>m.action, m.label, <ActionType>g.action,
                             g.label, next_act, is_sunk)
        n_accept += m.accept
    assert n_accept != 0
    return n_accept
|
||||
|
||||
|
||||
cdef Move* best_accepted(Move* moves, weight_t* scores, int n) except NULL:
    """Return a pointer to the highest-scoring move whose ``accept`` flag is set.

    Only indices 1..n-1 of ``moves`` are considered, and the score of
    ``moves[i]`` is read from ``scores[i-1]``. On ties the earliest accepted
    move wins.

    Raises:
        ValueError: if no move in that range is accepted. (ValueError is a
            StandardError subclass on Python 2, so handlers written for the
            old exception still match; StandardError itself no longer exists
            on Python 3.)
    """
    cdef int i
    cdef int best = -1
    cdef weight_t score = 0
    for i in range(1, n):
        # The first accepted move seeds the running best; later ones must
        # beat it strictly.
        if moves[i].accept and (best == -1 or scores[i-1] > score):
            best = i
            score = scores[i-1]
    if best == -1:
        raise ValueError("No accepted move among %d candidates" % n)
    return &moves[best]
|
||||
|
||||
|
||||
cdef int transition(State *s, Move* move) except -1:
|
||||
if move.action == BEGIN:
|
||||
begin_entity(s, move.label)
|
||||
elif move.action == IN:
|
||||
pass
|
||||
elif move.action == LAST:
|
||||
end_entity(s)
|
||||
elif move.action == UNIT:
|
||||
begin_entity(s, move.label)
|
||||
end_entity(s)
|
||||
elif move.action == OUT:
|
||||
pass
|
||||
s.tags[s.i] = move.clas
|
||||
s.i += 1
|
||||
|
||||
|
||||
def get_n_moves(n_tags):
    """Return the number of move classes for ``n_tags`` entity labels.

    The count is four classes per label plus one extra class.
    (Presumably B/I/L/U per label and a single shared class -- confirm
    against fill_moves.)
    """
    return 4 * n_tags + 1
|
||||
|
||||
|
||||
cdef int fill_moves(Move* moves, list tag_names) except -1:
    """Fill ``moves[i]`` from ``tag_names[i]`` for every tag name.

    A tag name is either 'ACTION-LABEL' (ACTION being one of the letters in
    ACTION_NAMES), the bare tag 'O', or one of the placeholders 'NULL' /
    'EOL', which map to the '?' action. Labels are numbered in order of first
    appearance, with the placeholder label '-' fixed at 0. ``moves[i].clas``
    is set to ``i``.

    Raises:
        ValueError: for a tag name in none of the forms above, or whose
            action letter is not in ACTION_NAMES.
    """
    cdef Move* m
    cdef int i
    label_names = {'-': 0}
    for i, tag_name in enumerate(tag_names):
        m = &moves[i]
        if '-' in tag_name:
            # Split on the first hyphen only, so that labels which themselves
            # contain a hyphen do not break the two-way unpacking.
            action_str, label = tag_name.split('-', 1)
        elif tag_name == 'O':
            action_str = 'O'
            label = '-'
        elif tag_name == 'NULL' or tag_name == 'EOL':
            action_str = '?'
            label = '-'
        else:
            # ValueError rather than StandardError: the latter is Python-2
            # only, and ValueError is one of its subclasses there.
            raise ValueError(tag_name)
        m.action = ACTION_NAMES.index(action_str)
        m.label = label_names.setdefault(label, len(label_names))
        m.clas = i
    return 0
|
|
@ -1,155 +0,0 @@
|
|||
from thinc.typedefs cimport atom_t
|
||||
from ..typedefs cimport hash_t
|
||||
from ..tokens cimport Tokens
|
||||
from ..lexeme cimport Lexeme
|
||||
from .structs cimport State
|
||||
|
||||
|
||||
cpdef enum:
|
||||
T_sic
|
||||
T_cluster
|
||||
T_norm
|
||||
T_shape
|
||||
T_asciied
|
||||
T_prefix
|
||||
T_suffix
|
||||
T_length
|
||||
T_postype
|
||||
T_nertype
|
||||
T_sensetype
|
||||
T_is_alpha
|
||||
T_is_ascii
|
||||
T_is_digit
|
||||
T_is_lower
|
||||
T_is_punct
|
||||
T_is_space
|
||||
T_is_title
|
||||
T_is_upper
|
||||
T_like_url
|
||||
T_like_number
|
||||
T_oft_lower
|
||||
T_oft_title
|
||||
T_oft_upper
|
||||
T_in_males
|
||||
T_in_females
|
||||
T_in_surnames
|
||||
T_in_places
|
||||
T_in_celebs
|
||||
T_in_names
|
||||
T_pos
|
||||
T_sense
|
||||
T_ner
|
||||
|
||||
|
||||
cpdef enum:
|
||||
P2_sic
|
||||
P2_cluster
|
||||
P2_norm
|
||||
P2_shape
|
||||
P2_prefix
|
||||
P2_suffix
|
||||
P2_length
|
||||
P2_postype
|
||||
P2_is_alpha
|
||||
P2_is_digit
|
||||
P2_is_lower
|
||||
P2_is_punct
|
||||
P2_is_title
|
||||
P2_is_upper
|
||||
P2_like_number
|
||||
P2_pos
|
||||
|
||||
P1_sic
|
||||
P1_cluster
|
||||
P1_norm
|
||||
P1_shape
|
||||
P1_prefix
|
||||
P1_suffix
|
||||
P1_length
|
||||
P1_postype
|
||||
P1_is_alpha
|
||||
P1_is_digit
|
||||
P1_is_lower
|
||||
P1_is_punct
|
||||
P1_is_title
|
||||
P1_is_upper
|
||||
P1_like_number
|
||||
P1_pos
|
||||
|
||||
W_sic
|
||||
W_cluster
|
||||
W_norm
|
||||
W_shape
|
||||
W_prefix
|
||||
W_suffix
|
||||
W_length
|
||||
W_postype
|
||||
W_is_alpha
|
||||
W_is_digit
|
||||
W_is_lower
|
||||
W_is_punct
|
||||
W_is_space
|
||||
W_is_title
|
||||
W_is_upper
|
||||
W_like_number
|
||||
W_pos
|
||||
|
||||
N1_sic
|
||||
N1_cluster
|
||||
N1_norm
|
||||
N1_shape
|
||||
N1_prefix
|
||||
N1_suffix
|
||||
N1_length
|
||||
N1_postype
|
||||
N1_is_alpha
|
||||
N1_is_ascii
|
||||
N1_is_digit
|
||||
N1_is_lower
|
||||
N1_is_punct
|
||||
N1_is_space
|
||||
N1_is_title
|
||||
N1_is_upper
|
||||
N1_like_number
|
||||
N1_pos
|
||||
|
||||
N2_sic
|
||||
N2_cluster
|
||||
N2_norm
|
||||
N2_shape
|
||||
N2_asciied
|
||||
N2_prefix
|
||||
N2_suffix
|
||||
N2_length
|
||||
N2_postype
|
||||
N2_is_alpha
|
||||
N2_is_digit
|
||||
N2_is_lower
|
||||
N2_is_punct
|
||||
N2_is_space
|
||||
N2_is_title
|
||||
N2_is_upper
|
||||
N2_like_number
|
||||
N2_pos
|
||||
N2_sense
|
||||
|
||||
E_label
|
||||
|
||||
E0_sic
|
||||
E0_cluster
|
||||
E0_pos
|
||||
|
||||
E1_sic
|
||||
E1_cluster
|
||||
E1_pos
|
||||
|
||||
E_last_sic
|
||||
E_last_cluster
|
||||
E_last_pos
|
||||
|
||||
N_FIELDS
|
||||
|
||||
|
||||
cdef int fill_context(atom_t* context, State* s, Tokens tokens) except -1
|
||||
|
||||
|
|
@ -1,77 +0,0 @@
|
|||
from libc.string cimport memset
|
||||
|
||||
from murmurhash.mrmr cimport hash64
|
||||
from ._state cimport entity_is_open
|
||||
from ..lexeme cimport *
|
||||
|
||||
|
||||
cdef int _fill_token(atom_t* c, Lexeme* lex, atom_t pos):
|
||||
c[T_sic] = lex.sic
|
||||
c[T_cluster] = lex.cluster
|
||||
c[T_norm] = lex.norm if (lex.prob != 0 and lex.prob >= -10) else lex.shape
|
||||
c[T_shape] = lex.shape
|
||||
c[T_asciied] = lex.asciied
|
||||
c[T_prefix] = lex.prefix
|
||||
c[T_suffix] = lex.suffix
|
||||
c[T_length] = lex.length
|
||||
|
||||
c[T_postype] = lex.postype
|
||||
c[T_nertype] = 0
|
||||
c[T_sensetype] = 0
|
||||
|
||||
c[T_is_alpha] = lex.flags & (1 << IS_ALPHA)
|
||||
c[T_is_digit] = lex.flags & (1 << IS_DIGIT)
|
||||
c[T_is_lower] = lex.flags & (1 << IS_LOWER)
|
||||
c[T_is_punct] = lex.flags & (1 << IS_PUNCT)
|
||||
c[T_is_space] = lex.flags & (1 << IS_SPACE)
|
||||
c[T_is_title] = lex.flags & (1 << IS_TITLE)
|
||||
c[T_is_upper] = lex.flags & (1 << IS_UPPER)
|
||||
c[T_like_url] = lex.flags & (1 << LIKE_URL)
|
||||
c[T_like_number] = lex.flags & (1 << LIKE_NUMBER)
|
||||
c[T_oft_lower] = lex.flags & (1 << OFT_LOWER)
|
||||
c[T_oft_title] = lex.flags & (1 << OFT_TITLE)
|
||||
c[T_oft_upper] = lex.flags & (1 << OFT_UPPER)
|
||||
|
||||
c[T_in_males] = lex.flags & (1 << IN_MALES)
|
||||
c[T_in_females] = lex.flags & (1 << IN_FEMALES)
|
||||
c[T_in_surnames] = lex.flags & (1 << IN_SURNAMES)
|
||||
c[T_in_places] = lex.flags & (1 << IN_PLACES)
|
||||
c[T_in_celebs] = lex.flags & (1 << IN_CELEBS)
|
||||
c[T_in_names] = lex.flags & (1 << IN_NAMES)
|
||||
|
||||
c[T_pos] = pos
|
||||
c[T_sense] = 0
|
||||
|
||||
|
||||
cdef int _fill_outer_token(atom_t* c, Lexeme* lex, atom_t pos):
|
||||
c[0] = lex.sic
|
||||
c[1] = lex.cluster
|
||||
c[2] = lex.shape
|
||||
c[3] = pos
|
||||
|
||||
|
||||
cdef int fill_context(atom_t* context, State* s, Tokens tokens) except -1:
    """Fill the ``context`` feature array for the token at ``s.i``.

    All N_FIELDS slots are zeroed first. The five-token window around
    ``s.i`` (two before, the token itself, two after) is then written via
    _fill_token(), followed by the entity fields, which are filled only
    while an entity is open. Returns 1.
    """
    cdef int i = s.i
    memset(context, 0, N_FIELDS * sizeof(atom_t))
    # NOTE(review): the window reads tokens.lex / tokens.pos at i-2 .. i+2
    # without bounds checks -- presumably the arrays are padded; confirm.
    # NOTE(review): _fill_token() writes at T_* offsets from each window
    # base -- confirm that the T_* layout fits the per-window field groups.
    _fill_token(&context[P2_sic], tokens.lex[i-2], tokens.pos[i-2])
    _fill_token(&context[P1_sic], tokens.lex[i-1], tokens.pos[i-1])
    _fill_token(&context[W_sic], tokens.lex[i], tokens.pos[i])
    _fill_token(&context[N1_sic], tokens.lex[i+1], tokens.pos[i+1])
    _fill_token(&context[N2_sic], tokens.lex[i+2], tokens.pos[i+2])

    if entity_is_open(s):
        context[E_label] = s.curr.label
        # First token of the open entity.
        context[E0_sic] = tokens.lex[s.curr.start].sic
        context[E0_cluster] = tokens.lex[s.curr.start].cluster
        context[E0_pos] = tokens.pos[s.curr.start]
        # Token immediately before the current one.
        context[E_last_sic] = tokens.lex[s.i-1].sic
        context[E_last_cluster] = tokens.lex[s.i-1].cluster
        context[E_last_pos] = tokens.pos[s.i-1]
        # Second token of the entity, once the entity spans at least two.
        if (s.curr.start + 1) < s.i:
            context[E1_sic] = tokens.lex[s.curr.start+1].sic
            context[E1_cluster] = tokens.lex[s.curr.start+1].cluster
            context[E1_pos] = tokens.pos[s.curr.start+1]
    return 1
|
|
@ -1,107 +0,0 @@
|
|||
from .context import *
|
||||
|
||||
|
||||
# Feature templates over the five-token window around the current word:
# P2/P1 are the two previous tokens, W the current one, N1/N2 the next two.
LOCAL = (
    (W_sic,),
    (P1_sic,),
    (N1_sic,),
    (P2_sic,),
    (N2_sic,),

    (P1_sic, W_sic,),
    (W_sic, N1_sic),

    (W_prefix,),
    (W_suffix,),

    (P1_shape,),
    (W_shape,),
    (N1_shape,),
    (P1_shape, W_shape,),
    # Was (W_shape, P1_shape,), which repeats the pair above in reverse order.
    # The following-token bigram mirrors the *_sic bigrams listed earlier.
    (W_shape, N1_shape,),
    (P1_shape, W_shape, N1_shape),
    (N2_shape,),
    (P2_shape,),

    (P2_norm, P1_norm, W_norm),
    (P1_norm, W_norm, N1_norm),
    (W_norm, N1_norm, N2_norm)
)
|
||||
|
||||
POS = (
|
||||
(P2_pos,),
|
||||
(P1_pos,),
|
||||
(W_pos,),
|
||||
(N1_pos,),
|
||||
(N2_pos,),
|
||||
|
||||
(P1_pos, W_pos),
|
||||
(W_pos, N1_pos),
|
||||
(P2_pos, P1_pos, W_pos),
|
||||
(P1_pos, W_pos, N1_pos),
|
||||
(W_pos, N1_pos, N2_pos)
|
||||
)
|
||||
|
||||
CLUSTERS = (
|
||||
(P2_cluster,),
|
||||
(P1_cluster,),
|
||||
(W_cluster,),
|
||||
(N1_cluster,),
|
||||
(N2_cluster,),
|
||||
|
||||
(P1_cluster, W_cluster),
|
||||
(W_cluster, N1_cluster),
|
||||
)
|
||||
|
||||
|
||||
CLUSTER_POS = (
|
||||
(P1_cluster, W_pos),
|
||||
(W_pos, P1_cluster),
|
||||
(W_cluster, N1_pos),
|
||||
(W_pos, N1_cluster)
|
||||
)
|
||||
|
||||
|
||||
STATE = (
|
||||
(E0_sic,),
|
||||
(E0_cluster,),
|
||||
(E0_pos,),
|
||||
(E_last_sic,),
|
||||
(E_last_cluster,),
|
||||
(E_last_pos,),
|
||||
|
||||
(E0_sic, W_sic),
|
||||
(E0_cluster, W_cluster),
|
||||
(E0_pos, W_pos),
|
||||
(E_last_sic, W_sic),
|
||||
(E_last_pos, W_pos),
|
||||
|
||||
(E0_pos, E_last_pos, W_pos),
|
||||
(E0_cluster, E_last_cluster, W_cluster),
|
||||
|
||||
(E0_sic, E_last_sic),
|
||||
(E0_pos, E_last_pos),
|
||||
(E0_cluster, E_last_cluster),
|
||||
(E0_pos, E_last_cluster),
|
||||
(E0_cluster, E_last_pos),
|
||||
|
||||
(E1_sic,),
|
||||
(E1_cluster,),
|
||||
(E1_pos,),
|
||||
|
||||
(E0_sic, E1_sic),
|
||||
(E0_sic, E1_pos,),
|
||||
(E0_pos, E1_sic,),
|
||||
(E0_pos, E1_pos),
|
||||
|
||||
(E_label,),
|
||||
(E_label, W_sic),
|
||||
(E_label, W_pos),
|
||||
(E_label, W_cluster),
|
||||
(E_label, W_shape),
|
||||
(E_label, E_last_sic),
|
||||
(E_label, E0_pos, E_last_pos),
|
||||
)
|
||||
|
||||
|
||||
TEMPLATES = LOCAL + CLUSTERS + POS + CLUSTER_POS + STATE
|
|
@ -1,29 +0,0 @@
|
|||
from cymem.cymem cimport Pool
|
||||
from thinc.features cimport Extractor
|
||||
from thinc.learner cimport LinearModel
|
||||
from thinc.typedefs cimport *
|
||||
|
||||
from ..tokens cimport Tokens
|
||||
from ..typedefs cimport *
|
||||
|
||||
from .structs cimport Move
|
||||
from .annot cimport NERAnnotation
|
||||
|
||||
|
||||
cdef class NERParser:
|
||||
cdef Pool mem
|
||||
cdef Extractor extractor
|
||||
cdef LinearModel model
|
||||
cdef readonly list tag_names
|
||||
cdef readonly list entity_types
|
||||
cdef readonly int n_classes
|
||||
|
||||
cdef Move* _moves
|
||||
cdef atom_t* _context
|
||||
cdef feat_t* _feats
|
||||
cdef weight_t* _values
|
||||
cdef weight_t* _scores
|
||||
|
||||
|
||||
cpdef list train(self, Tokens tokens, NERAnnotation annot)
|
||||
cpdef list set_tags(self, Tokens tokens)
|
|
@ -1,81 +0,0 @@
|
|||
cimport cython
|
||||
import random
|
||||
import os
|
||||
from os import path
|
||||
import shutil
|
||||
import json
|
||||
|
||||
from thinc.features cimport ConjFeat
|
||||
|
||||
from ..context cimport fill_context
|
||||
from ..context cimport N_FIELDS
|
||||
from .moves cimport Move
|
||||
from .moves cimport fill_moves, transition, best_accepted
|
||||
from .moves cimport set_accept_if_valid, set_accept_if_oracle
|
||||
from .moves import get_n_moves
|
||||
from ._state cimport State
|
||||
from ._state cimport init_state
|
||||
|
||||
|
||||
cdef class NERParser:
|
||||
def __init__(self, model_dir):
|
||||
self.mem = Pool()
|
||||
cfg = json.load(open(path.join(model_dir, 'config.json')))
|
||||
templates = cfg['templates']
|
||||
self.entity_types = cfg['entity_types']
|
||||
self.extractor = Extractor(templates, [ConjFeat] * len(templates))
|
||||
self.n_classes = get_n_moves(len(self.entity_types))
|
||||
self._moves = <Move*>self.mem.alloc(self.n_classes, sizeof(Move))
|
||||
fill_moves(self._moves, len(self.entity_types))
|
||||
self.model = LinearModel(len(self.tag_names))
|
||||
if path.exists(path.join(model_dir, 'model')):
|
||||
self.model.load(path.join(model_dir, 'model'))
|
||||
|
||||
self._context = <atom_t*>self.mem.alloc(N_FIELDS, sizeof(atom_t))
|
||||
self._feats = <feat_t*>self.mem.alloc(self.extractor.n+1, sizeof(feat_t))
|
||||
self._values = <weight_t*>self.mem.alloc(self.extractor.n+1, sizeof(weight_t))
|
||||
self._scores = <weight_t*>self.mem.alloc(self.model.nr_class, sizeof(weight_t))
|
||||
|
||||
cpdef int train(self, Tokens tokens, gold_classes):
|
||||
cdef Pool mem = Pool()
|
||||
cdef State* s = init_state(mem, tokens.length)
|
||||
cdef Move* golds = <Move*>mem.alloc(len(gold_classes), sizeof(Move))
|
||||
for i, clas in enumerate(gold_classes):
|
||||
golds[i] = self.moves[clas - 1]
|
||||
assert golds[i].id == clas
|
||||
cdef Move* guess
|
||||
while s.i < tokens.length:
|
||||
fill_context(self._context, s.i, tokens)
|
||||
self.extractor.extract(self._feats, self._values, self._context, NULL)
|
||||
self.model.score(self._scores, self._feats, self._values)
|
||||
|
||||
set_accept_if_valid(self._moves, self.n_classes, s)
|
||||
guess = best_accepted(self._moves, self._scores, self.n_classes)
|
||||
|
||||
set_accept_if_oracle(self._moves, golds, self.n_classes, s) # TODO
|
||||
gold = best_accepted(self._moves, self._scores, self.n_classes)
|
||||
|
||||
if guess.clas == gold.clas:
|
||||
self.model.update({})
|
||||
return 0
|
||||
|
||||
counts = {guess.clas: {}, gold.clas: {}}
|
||||
self.extractor.count(counts[gold.clas], self._feats, 1)
|
||||
self.extractor.count(counts[guess.clas], self._feats, -1)
|
||||
self.model.update(counts)
|
||||
|
||||
transition(s, guess)
|
||||
tokens.ner[s.i-1] = s.tags[s.i-1]
|
||||
|
||||
cpdef int set_tags(self, Tokens tokens) except -1:
|
||||
cdef Pool mem = Pool()
|
||||
cdef State* s = init_state(mem, tokens.length)
|
||||
cdef Move* move
|
||||
while s.i < tokens.length:
|
||||
fill_context(self._context, s.i, tokens)
|
||||
self.extractor.extract(self._feats, self._values, self._context, NULL)
|
||||
self.model.score(self._scores, self._feats, self._values)
|
||||
set_accept_if_valid(self._moves, self.n_classes, s)
|
||||
move = best_accepted(self._moves, self._scores, self.n_classes)
|
||||
transition(s, move)
|
||||
tokens.ner[s.i-1] = s.tags[s.i-1]
|
|
@ -1,26 +0,0 @@
|
|||
from cymem.cymem cimport Pool
|
||||
|
||||
from thinc.typedefs cimport class_t
|
||||
from thinc.typedefs cimport weight_t
|
||||
|
||||
from .structs cimport State, Move
|
||||
|
||||
|
||||
cpdef enum ActionType:
|
||||
MISSING
|
||||
SHIFT
|
||||
REDUCE
|
||||
OUT
|
||||
N_ACTIONS
|
||||
|
||||
|
||||
cdef int set_accept_if_oracle(Move* moves, int n, State* s,
|
||||
int* g_starts, int* g_ends, int* g_labels) except 0
|
||||
|
||||
cdef int set_accept_if_valid(Move* moves, int n, State* s) except 0
|
||||
|
||||
cdef Move* best_accepted(Move* moves, weight_t* scores, int n) except NULL
|
||||
|
||||
cdef int transition(State *s, Move* m) except -1
|
||||
|
||||
cdef int fill_moves(Move* moves, int n, list entity_types) except -1
|
|
@ -1,161 +0,0 @@
|
|||
from __future__ import unicode_literals
|
||||
from cymem.cymem cimport Pool
|
||||
|
||||
from thinc.typedefs cimport class_t
|
||||
from thinc.typedefs cimport weight_t
|
||||
|
||||
from ._state cimport begin_entity
|
||||
from ._state cimport end_entity
|
||||
from ._state cimport entity_is_open
|
||||
|
||||
|
||||
ACTION_NAMES = ['' for _ in range(N_ACTIONS)]
|
||||
ACTION_NAMES[<int>MISSING] = '?'
|
||||
ACTION_NAMES[<int>SHIFT] = 'S'
|
||||
ACTION_NAMES[<int>REDUCE] = 'R'
|
||||
ACTION_NAMES[<int>OUT] = 'O'
|
||||
|
||||
|
||||
cdef int set_accept_if_oracle(Move* moves, int n, State* s,
|
||||
int* g_starts, int* g_ends, int* g_labels) except 0:
|
||||
# If curr entity: (O invalid)
|
||||
# if cost is not sunk (start matches, end is i-1 or greater
|
||||
# - If i-1 == gold.end --> R=True, S=False
|
||||
# - Shift if end >= i --> S=True, R=False
|
||||
# else
|
||||
# - If i == gold.start --> R=True, S=False
|
||||
# - Else --> R=True, S=True
|
||||
# Else (R invalid):
|
||||
# if start == gold.start: S=True, O=False
|
||||
# else: O=True, S=False
|
||||
if entity_is_open(s):
|
||||
g_start = g_starts[s.curr.start]
|
||||
g_end = g_ends[s.curr.start]
|
||||
accept_o = False
|
||||
if g_start == s.curr.start and g_end == s.i:
|
||||
accept_r = True
|
||||
accept_s = False
|
||||
elif g_start == s.curr.start and g_end > s.i:
|
||||
accept_s = True
|
||||
s_label = s.curr.label
|
||||
accept_r = False
|
||||
elif g_starts[s.i] == s.i:
|
||||
accept_r = True
|
||||
accept_s = False
|
||||
else:
|
||||
accept_r = True
|
||||
accept_s = True
|
||||
s_label = s.curr.label
|
||||
else:
|
||||
accept_r = False
|
||||
if g_starts[s.i] == s.i:
|
||||
accept_s = True
|
||||
s_label = g_labels[s.i]
|
||||
accept_o = False
|
||||
else:
|
||||
accept_o = True
|
||||
accept_s = False
|
||||
n_accept = 0
|
||||
moves[0].accept = False
|
||||
for i in range(1, n):
|
||||
m = &moves[i]
|
||||
if m.action == SHIFT:
|
||||
m.accept = accept_s and m.label == s_label
|
||||
elif m.action == REDUCE:
|
||||
m.accept = accept_r
|
||||
elif m.action == OUT:
|
||||
m.accept = accept_o
|
||||
n_accept += m.accept
|
||||
assert n_accept != 0
|
||||
return n_accept
|
||||
|
||||
|
||||
cdef int set_accept_if_valid(Move* moves, int n, State* s) except 0:
    """Set ``accept`` on each move according to what state ``s`` permits.

    * SHIFT: needs tokens left (``s.i < s.length``) and, while an entity is
      open, a label equal to the open entity's label.
    * REDUCE: needs an open entity.
    * OUT: needs tokens left and no open entity.

    ``moves[0]`` is always rejected. Moves with any other action keep their
    current flag.

    Returns the number of accepted moves in indices 1..n-1.

    Raises:
        ValueError: if that number is zero. The function is declared
            ``except 0``, so a plain ``return 0`` would be reported as an
            error with no exception set.
    """
    cdef int i
    cdef bint open_ent = entity_is_open(s)
    cdef bint at_end = s.i >= s.length
    cdef int n_accept = 0
    moves[0].accept = False
    for i in range(1, n):
        if moves[i].action == SHIFT:
            moves[i].accept = not at_end and not (open_ent and moves[i].label != s.curr.label)
        elif moves[i].action == REDUCE:
            moves[i].accept = open_ent
        elif moves[i].action == OUT:
            moves[i].accept = not at_end and not open_ent
        n_accept += moves[i].accept
    if n_accept == 0:
        raise ValueError("No valid move at token %d of %d" % (s.i, s.length))
    return n_accept
|
||||
|
||||
|
||||
cdef Move* best_accepted(Move* moves, weight_t* scores, int n) except NULL:
    """Return a pointer to the highest-scoring move whose ``accept`` flag is set.

    Only indices 1..n-1 of ``moves`` are considered, and the score of
    ``moves[i]`` is read from ``scores[i-1]``. On ties the earliest accepted
    move wins.

    Raises:
        ValueError: if no move in that range is accepted. (ValueError is a
            StandardError subclass on Python 2, so handlers written for the
            old exception still match; StandardError itself no longer exists
            on Python 3.)
    """
    cdef int i
    cdef int best = -1
    cdef weight_t score = 0
    for i in range(1, n):
        # The first accepted move seeds the running best; later ones must
        # beat it strictly.
        if moves[i].accept and (best == -1 or scores[i-1] > score):
            best = i
            score = scores[i-1]
    if best == -1:
        raise ValueError("No accepted move among %d candidates" % n)
    return &moves[best]
|
||||
|
||||
|
||||
cdef int transition(State *s, Move* move) except -1:
    """Apply ``move`` to the parse state ``s``.

    * OUT: record the move's class for token ``s.i`` and advance.
    * SHIFT: record the class, open an entity at ``s.i`` with the move's
      label if none is open, and advance.
    * REDUCE: close the open entity with ``end = s.i``, append it to
      ``s.ents``, and reset ``s.curr``. No token is consumed.

    Raises:
        ValueError: for any other action.
    """
    if move.action == OUT:
        s.tags[s.i] = move.clas
        s.i += 1
    elif move.action == SHIFT:
        s.tags[s.i] = move.clas
        if not entity_is_open(s):
            s.curr.start = s.i
            s.curr.label = move.label
        s.i += 1
    elif move.action == REDUCE:
        # REDUCE does not consume a token, so it must not write s.tags[s.i].
        # That slot belongs to the next token, whose SHIFT/OUT overwrites it
        # anyway. When the entity is closed after the last token,
        # s.i == s.length and the write would land past the tokens.
        s.curr.end = s.i
        s.ents[s.j] = s.curr
        s.j += 1
        s.curr.start = 0
        s.curr.label = -1
        s.curr.end = 0
    else:
        raise ValueError(move.action)
    return 0
|
||||
|
||||
|
||||
def get_n_moves(n_tags):
    """Return the number of move classes for ``n_tags`` entity types.

    The count is one class per entity type plus three fixed classes.
    """
    return n_tags + 3
|
||||
|
||||
|
||||
cdef int fill_moves(Move* moves, int n, list entity_types) except -1:
    """Populate the move table for the shift/reduce transition system.

    Layout written into ``moves``:
        index 0                      -- MISSING (reserved class 0)
        1 .. len(entity_types)       -- one SHIFT per entity type
        next index                   -- OUT
        next index                   -- REDUCE

    Each move's ``clas`` is its index. SHIFT labels are numbered from 1 in
    order of first appearance; every other move has label 0.

    NOTE(review): ``n`` is not consulted. The function always writes
    len(entity_types) + 3 entries, so the caller must size ``moves``
    accordingly.
    """
    cdef int i = 0
    label_names = {'-': 0}

    # Reserve class 0.
    moves[i].clas = i
    moves[i].action = MISSING
    moves[i].label = 0
    i += 1

    for entity_type in entity_types:
        moves[i].clas = i
        moves[i].action = SHIFT
        moves[i].label = label_names.setdefault(entity_type, len(label_names))
        i += 1

    moves[i].clas = i
    moves[i].action = OUT
    moves[i].label = 0
    i += 1

    moves[i].clas = i
    moves[i].action = REDUCE
    moves[i].label = 0
    return 0
|
||||
|
||||
|
||||
cdef bint is_final(State* s):
|
||||
return s.i == s.length and not entity_is_open(s)
|
|
@ -1,16 +0,0 @@
|
|||
from cymem.cymem cimport Pool
|
||||
|
||||
from .structs cimport Move, State
|
||||
|
||||
|
||||
cdef class PyState:
|
||||
cdef Pool mem
|
||||
cdef readonly list tag_names
|
||||
cdef readonly int n_classes
|
||||
cdef readonly dict moves_by_name
|
||||
|
||||
cdef Move* _moves
|
||||
cdef Move* _golds
|
||||
cdef State* _s
|
||||
|
||||
cdef Move* _get_move(self, unicode move_name) except NULL
|
|
@ -1,60 +0,0 @@
|
|||
from __future__ import unicode_literals
|
||||
|
||||
from ._state cimport init_state
|
||||
from ._state cimport entity_is_open
|
||||
from .bilou_moves cimport fill_moves
|
||||
from .bilou_moves cimport transition
|
||||
from .bilou_moves cimport set_accept_if_valid, set_accept_if_oracle
|
||||
from .bilou_moves import get_n_moves
|
||||
from .bilou_moves import ACTION_NAMES
|
||||
|
||||
|
||||
cdef class PyState:
|
||||
def __init__(self, tag_names, n_tokens):
|
||||
self.mem = Pool()
|
||||
self.tag_names = tag_names
|
||||
self.n_classes = len(tag_names)
|
||||
assert self.n_classes != 0
|
||||
self._moves = <Move*>self.mem.alloc(self.n_classes, sizeof(Move))
|
||||
fill_moves(self._moves, tag_names)
|
||||
self._s = init_state(self.mem, n_tokens)
|
||||
self._golds = <Move*>self.mem.alloc(n_tokens, sizeof(Move))
|
||||
|
||||
cdef Move* _get_move(self, unicode move_name) except NULL:
|
||||
return &self._moves[self.tag_names.index(move_name)]
|
||||
|
||||
def set_golds(self, list gold_names):
|
||||
cdef Move* m
|
||||
for i, name in enumerate(gold_names):
|
||||
m = self._get_move(name)
|
||||
self._golds[i] = m[0]
|
||||
|
||||
def transition(self, unicode move_name):
|
||||
cdef Move* m = self._get_move(move_name)
|
||||
transition(self._s, m)
|
||||
|
||||
def is_valid(self, unicode move_name):
|
||||
cdef Move* m = self._get_move(move_name)
|
||||
set_accept_if_valid(self._moves, self.n_classes, self._s)
|
||||
return m.accept
|
||||
|
||||
def is_gold(self, unicode move_name):
|
||||
cdef Move* m = self._get_move(move_name)
|
||||
set_accept_if_oracle(self._moves, self._golds, self.n_classes, self._s)
|
||||
return m.accept
|
||||
|
||||
property ent:
|
||||
def __get__(self):
|
||||
return self._s.curr
|
||||
|
||||
property n_ents:
|
||||
def __get__(self):
|
||||
return self._s.j
|
||||
|
||||
property i:
|
||||
def __get__(self):
|
||||
return self._s.i
|
||||
|
||||
property open_entity:
|
||||
def __get__(self):
|
||||
return entity_is_open(self._s)
|
|
@ -1,23 +0,0 @@
|
|||
from thinc.typedefs cimport class_t
|
||||
|
||||
|
||||
cdef struct Entity:
|
||||
int start
|
||||
int end
|
||||
int label
|
||||
|
||||
|
||||
cdef struct State:
|
||||
Entity curr
|
||||
Entity* ents
|
||||
int* tags
|
||||
int i
|
||||
int j
|
||||
int length
|
||||
|
||||
|
||||
cdef struct Move:
|
||||
class_t clas
|
||||
int action
|
||||
int label
|
||||
bint accept
|
|
@ -1,41 +0,0 @@
|
|||
from spacy.context cimport FIELD_IDS, Token
|
||||
|
||||
|
||||
cpdef Token P2 = FIELD_IDS.P2
|
||||
cpdef Token P1 = FIELD_IDS.P1
|
||||
cpdef Token N0 = FIELD_IDS.N0
|
||||
cpdef Token N1 = FIELD_IDS.N1
|
||||
cpdef Token N2 = FIELD_IDS.N2
|
||||
|
||||
|
||||
TEMPLATES = (
|
||||
(N0.sic,),
|
||||
(N0.norm,),
|
||||
(N0.suffix,),
|
||||
(N0.prefix,),
|
||||
(P1.pos,),
|
||||
(P2.pos,),
|
||||
(P1.pos, P2.pos),
|
||||
(P1.pos, N0.norm),
|
||||
(P1.norm,),
|
||||
(P1.suffix,),
|
||||
(P2.norm,),
|
||||
(N1.norm,),
|
||||
(N1.suffix,),
|
||||
(N2.norm,),
|
||||
|
||||
(N0.shape,),
|
||||
(N0.cluster,),
|
||||
(N1.cluster,),
|
||||
(N2.cluster,),
|
||||
(P1.cluster,),
|
||||
(P2.cluster,),
|
||||
(N0.oft_upper,),
|
||||
(N0.oft_title,),
|
||||
|
||||
(N0.postype,),
|
||||
|
||||
(P1.like_url,),
|
||||
(N1.like_number,),
|
||||
(N1.like_url,),
|
||||
)
|
|
@ -1,153 +0,0 @@
|
|||
from __future__ import unicode_literals
|
||||
from . import util
|
||||
from . import tokens
|
||||
from .en import EN
|
||||
|
||||
|
||||
def read_gold(file_, tag_list, col):
    """Read gold-standard tags and align them with EN's own tokenization.

    The input is split into paragraphs on blank lines. In each paragraph the
    first line is the raw text, the second line is skipped, and every
    remaining line is whitespace-separated with an integer in column 0 and
    the tag in column ``col``. The integer is matched against ``token.idx``
    of the tokens produced by ``EN.tokenize(raw)`` (presumably a character
    offset -- confirm against the data format).

    Tokens with no gold line at the same position receive the tag 'NULL'.
    Tags are converted to integer ids with _encode_pos(), which may append
    previously unseen tags to ``tag_list``.

    Args:
        file_: Open text file object.
        tag_list: List of known tag names; extended in place.
        col: Index of the column that holds the tag.

    Returns:
        A list of (tokens, tag_ids) pairs, one per non-empty paragraph.
    """
    tag_ids = dict((tag, i) for i, tag in enumerate(tag_list))
    golds = []
    for para in file_.read().strip().split('\n\n'):
        para = para.strip()
        if not para:
            continue
        lines = para.split('\n')
        raw = lines.pop(0)
        lines.pop(0)  # The second line is not used.
        toks = EN.tokenize(raw)

        gold = []
        for line in lines:
            pieces = line.split()
            gold.append((int(pieces[0]), pieces[col]))

        tags = []
        g = 0
        for token in toks:
            # Skip gold entries that start before this token: they can no
            # longer line up with anything. The previous code compared
            # token.idx with the whole gold tuple instead of its offset, and
            # its catch-up branch dropped a gold entry without emitting a
            # tag for the token.
            while g < len(gold) and gold[g][0] < token.idx:
                g += 1
            if g < len(gold) and gold[g][0] == token.idx:
                tags.append(gold[g][1])
                g += 1
            else:
                tags.append('NULL')
        assert len(tags) == len(toks)
        tags = [_encode_pos(t, tag_ids, tag_list) for t in tags]
        golds.append((toks, tags))
    return golds
|
||||
|
||||
def _encode_pos(tag, tag_ids, tag_list):
|
||||
if tag == '-':
|
||||
return 0
|
||||
if tag not in tag_ids:
|
||||
tag_ids[tag] = len(tag_list)
|
||||
tag_list.append(tag)
|
||||
return tag_ids[tag]
|
||||
|
||||
|
||||
# Tag -> coarse universal tag, parsed once at import time instead of on every
# call. The table has repeated keys ('!', '#', '$', ','); as with any dict
# built from pairs, the LAST occurrence wins, so the order below is
# significant.
_PTB_TO_UNIV = dict(tuple(line.split()) for line in """
NULL NULL
HYPH .
ADD X
NFP .
AFX X
XX X
BES VERB
HVS VERB
GW X
! .
# .
$ .
'' .
( .
) .
, .
-LRB- .
-RRB- .
. .
: .
? .
CC CONJ
CD NUM
CD|RB X
DT DET
EX DET
FW X
IN ADP
IN|RP ADP
JJ ADJ
JJR ADJ
JJRJR ADJ
JJS ADJ
JJ|RB ADJ
JJ|VBG ADJ
LS X
MD VERB
NN NOUN
NNP NOUN
NNPS NOUN
NNS NOUN
NN|NNS NOUN
NN|SYM NOUN
NN|VBG NOUN
NP NOUN
PDT DET
POS PRT
PRP PRON
PRP$ PRON
PRP|VBP PRON
PRT PRT
RB ADV
RBR ADV
RBS ADV
RB|RP ADV
RB|VBG ADV
RN X
RP PRT
SYM X
TO PRT
UH X
VB VERB
VBD VERB
VBD|VBN VERB
VBG VERB
VBG|NN VERB
VBN VERB
VBP VERB
VBP|TO VERB
VBZ VERB
VP VERB
WDT DET
WH X
WP PRON
WP$ PRON
WRB ADV
! PRT
# X
$ NUM
& CONJ
, .
@ X
A ADJ
D DET
E X
G X
L PRT
M PRT
N NOUN
O PRON
P ADP
R ADV
S NOUN
T PRT
U X
V VERB
X PRT
Y PRT
Z NOUN
^ NOUN
~ X
`` .
EOL EOL""".strip().split('\n'))


def ptb_to_univ(tag):
    """Map a fine-grained tag to its coarse universal tag.

    Args:
        tag: Tag string; must be one of the keys of the table above.

    Returns:
        The universal tag string paired with ``tag`` in the table.

    Raises:
        KeyError: if ``tag`` is not in the table.
    """
    return _PTB_TO_UNIV[tag]
|
||||
|
|
@ -8,7 +8,7 @@ from .structs cimport Utf8Str, UniStr
|
|||
cdef inline void slice_unicode(UniStr* s, Py_UNICODE* chars, int start, int end) nogil:
|
||||
s.chars = &chars[start]
|
||||
s.n = end - start
|
||||
s.key = hash64(s.chars, s.n * sizeof(Py_UNICODE), 0)
|
||||
s.key = hash64(s.chars, <int>(s.n * sizeof(Py_UNICODE)), 0)
|
||||
|
||||
|
||||
cdef class StringStore:
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
from libc.stdint cimport uint8_t, uint32_t
|
||||
|
||||
from .typedefs cimport flags_t, attr_t, id_t, hash_t
|
||||
from .typedefs cimport flags_t, attr_t, id_t, hash_t, univ_tag_t
|
||||
|
||||
|
||||
cdef struct Lexeme:
|
||||
|
@ -34,7 +34,7 @@ cdef struct Morphology:
|
|||
cdef struct PosTag:
|
||||
Morphology morph
|
||||
int id
|
||||
int pos
|
||||
univ_tag_t pos
|
||||
|
||||
|
||||
cdef struct TokenC:
|
||||
|
|
|
@ -2,7 +2,7 @@ from libc.stdint cimport uint32_t
|
|||
|
||||
from cymem.cymem cimport Pool
|
||||
|
||||
from ..tokens cimport TokenC
|
||||
from ..structs cimport TokenC
|
||||
|
||||
|
||||
cdef struct State:
|
||||
|
@ -20,7 +20,8 @@ cdef int pop_stack(State *s) except -1
|
|||
cdef int push_stack(State *s) except -1
|
||||
|
||||
|
||||
cdef bint has_head(const TokenC* t) nogil
|
||||
cdef inline bint has_head(const TokenC* t) nogil:
|
||||
return t.head != 0
|
||||
|
||||
|
||||
cdef inline int get_idx(const State* s, const TokenC* t) nogil:
|
||||
|
@ -70,29 +71,14 @@ cdef inline bint is_final(const State *s) nogil:
|
|||
return at_eol(s) # The stack will be attached to root anyway
|
||||
|
||||
|
||||
cdef int children_in_buffer(const State *s, const int head, const int* gold) except -1
|
||||
cdef int head_in_buffer(const State *s, const int child, const int* gold) except -1
|
||||
cdef int children_in_stack(const State *s, const int head, const int* gold) except -1
|
||||
cdef int head_in_stack(const State *s, const int child, const int* gold) except -1
|
||||
cdef int children_in_buffer(const State *s, const int head, int* gold) except -1
|
||||
cdef int head_in_buffer(const State *s, const int child, int* gold) except -1
|
||||
cdef int children_in_stack(const State *s, const int head, int* gold) except -1
|
||||
cdef int head_in_stack(const State *s, const int child, int* gold) except -1
|
||||
|
||||
cdef State* init_state(Pool mem, TokenC* sent, const int sent_length) except NULL
|
||||
|
||||
|
||||
cdef int count_left_kids(const TokenC* head) nogil
|
||||
|
||||
|
||||
cdef int count_right_kids(const TokenC* head) nogil
|
||||
|
||||
|
||||
# From https://en.wikipedia.org/wiki/Hamming_weight
|
||||
cdef inline uint32_t _popcount(uint32_t x) nogil:
|
||||
"""Find number of non-zero bits."""
|
||||
cdef int count = 0
|
||||
while x != 0:
|
||||
x &= x - 1
|
||||
count += 1
|
||||
return count
|
||||
|
||||
|
||||
cdef inline uint32_t _nth_significant_bit(uint32_t bits, int n) nogil:
|
||||
cdef int i
|
||||
|
|
|
@ -3,31 +3,23 @@ from libc.string cimport memmove
|
|||
from cymem.cymem cimport Pool
|
||||
|
||||
from ..lexeme cimport EMPTY_LEXEME
|
||||
from ..tokens cimport TokenC
|
||||
|
||||
|
||||
DEF PADDING = 5
|
||||
DEF NON_MONOTONIC = True
|
||||
|
||||
|
||||
cdef int add_dep(State *s, int head, int child, int label) except -1:
|
||||
cdef int dist = head - child
|
||||
s.sent[child].head = dist
|
||||
s.sent[child].head = head - child
|
||||
s.sent[child].dep_tag = label
|
||||
# Keep a bit-vector tracking child dependencies. If a word has a child at
|
||||
# offset i from it, set that bit (tracking left and right separately)
|
||||
if child > head:
|
||||
s.sent[head].r_kids |= 1 << (-dist)
|
||||
s.sent[head].r_kids |= 1 << (-s.sent[child].head)
|
||||
else:
|
||||
s.sent[head].l_kids |= 1 << dist
|
||||
s.sent[head].l_kids |= 1 << s.sent[child].head
|
||||
|
||||
|
||||
cdef int pop_stack(State *s) except -1:
|
||||
assert s.stack_len >= 1
|
||||
s.stack_len -= 1
|
||||
s.stack -= 1
|
||||
if s.stack_len == 0 and not at_eol(s):
|
||||
push_stack(s)
|
||||
|
||||
|
||||
cdef int push_stack(State *s) except -1:
|
||||
|
@ -36,14 +28,9 @@ cdef int push_stack(State *s) except -1:
|
|||
s.stack[0] = s.i
|
||||
s.stack_len += 1
|
||||
s.i += 1
|
||||
if at_eol(s):
|
||||
while s.stack_len != 0:
|
||||
if not has_head(get_s0(s)):
|
||||
get_s0(s).dep_tag = 0
|
||||
pop_stack(s)
|
||||
|
||||
|
||||
cdef int children_in_buffer(const State *s, int head, const int* gold) except -1:
|
||||
cdef int children_in_buffer(const State *s, int head, int* gold) except -1:
|
||||
# Golds holds an array of head offsets --- the head of word i is i - golds[i]
|
||||
# Iterate over the tokens of the queue, and check whether their gold head is
|
||||
# our target
|
||||
|
@ -55,21 +42,20 @@ cdef int children_in_buffer(const State *s, int head, const int* gold) except -1
|
|||
return n
|
||||
|
||||
|
||||
cdef int head_in_buffer(const State *s, const int child, const int* gold) except -1:
|
||||
cdef int head_in_buffer(const State *s, const int child, int* gold) except -1:
|
||||
return gold[child] >= s.i
|
||||
|
||||
|
||||
cdef int children_in_stack(const State *s, const int head, const int* gold) except -1:
|
||||
cdef int children_in_stack(const State *s, const int head, int* gold) except -1:
|
||||
cdef int i
|
||||
cdef int n = 0
|
||||
for i in range(s.stack_len):
|
||||
if gold[s.stack[-i]] == head:
|
||||
if NON_MONOTONIC or not has_head(get_s0(s)):
|
||||
n += 1
|
||||
return n
|
||||
|
||||
|
||||
cdef int head_in_stack(const State *s, const int child, const int* gold) except -1:
|
||||
cdef int head_in_stack(const State *s, const int child, int* gold) except -1:
|
||||
cdef int i
|
||||
for i in range(s.stack_len):
|
||||
if gold[child] == s.stack[-i]:
|
||||
|
@ -86,7 +72,7 @@ cdef const TokenC* get_left(const State* s, const TokenC* head, const int idx) n
|
|||
if child >= s.sent:
|
||||
return child
|
||||
else:
|
||||
return NULL
|
||||
return s.sent - 1
|
||||
|
||||
|
||||
cdef const TokenC* get_right(const State* s, const TokenC* head, const int idx) nogil:
|
||||
|
@ -98,20 +84,10 @@ cdef const TokenC* get_right(const State* s, const TokenC* head, const int idx)
|
|||
if child < (s.sent + s.sent_len):
|
||||
return child
|
||||
else:
|
||||
return NULL
|
||||
return s.sent - 1
|
||||
|
||||
|
||||
cdef bint has_head(const TokenC* t) nogil:
|
||||
return t.head != 0
|
||||
|
||||
|
||||
cdef int count_left_kids(const TokenC* head) nogil:
|
||||
return _popcount(head.l_kids)
|
||||
|
||||
|
||||
cdef int count_right_kids(const TokenC* head) nogil:
|
||||
return _popcount(head.r_kids)
|
||||
|
||||
DEF PADDING = 5
|
||||
|
||||
|
||||
cdef State* init_state(Pool mem, TokenC* sent, const int sent_length) except NULL:
|
||||
|
@ -126,5 +102,4 @@ cdef State* init_state(Pool mem, TokenC* sent, const int sent_length) except NUL
|
|||
s.stack_len = 0
|
||||
s.i = 0
|
||||
s.sent_len = sent_length
|
||||
push_stack(s)
|
||||
return s
|
||||
|
|
|
@ -7,11 +7,8 @@ from ._state cimport State
|
|||
|
||||
|
||||
cdef struct Transition:
|
||||
int clas
|
||||
int move
|
||||
int label
|
||||
int cost
|
||||
weight_t score
|
||||
|
||||
|
||||
cdef class TransitionSystem:
|
||||
|
@ -21,8 +18,7 @@ cdef class TransitionSystem:
|
|||
|
||||
cdef const Transition* _moves
|
||||
|
||||
cdef Transition best_valid(self, const weight_t* scores, const State* s) except *
|
||||
cdef Transition best_gold(self, Transition* guess, const weight_t* scores,
|
||||
const State* s,
|
||||
const int* gold_heads, const int* gold_labels) except *
|
||||
cdef Transition best_valid(self, const weight_t* scores, const State* s) except -1
|
||||
cdef Transition best_gold(self, const weight_t* scores, const State* s,
|
||||
int* gold_heads, int* gold_labels) except -1
|
||||
cdef int transition(self, State *s, const Transition* t) except -1
|
||||
|
|
|
@ -7,8 +7,6 @@ from ._state cimport head_in_stack, children_in_stack
|
|||
|
||||
from ..tokens cimport TokenC
|
||||
|
||||
DEF NON_MONOTONIC = True
|
||||
|
||||
|
||||
cdef enum:
|
||||
SHIFT
|
||||
|
@ -27,30 +25,22 @@ cdef inline bint _can_right(const State* s) nogil:
|
|||
|
||||
|
||||
cdef inline bint _can_left(const State* s) nogil:
|
||||
if NON_MONOTONIC:
|
||||
return s.stack_len >= 1
|
||||
else:
|
||||
return s.stack_len >= 1 and not has_head(get_s0(s))
|
||||
|
||||
|
||||
cdef inline bint _can_reduce(const State* s) nogil:
|
||||
if NON_MONOTONIC:
|
||||
return s.stack_len >= 2
|
||||
else:
|
||||
return s.stack_len >= 2 and has_head(get_s0(s))
|
||||
|
||||
|
||||
cdef int _shift_cost(const State* s, const int* gold) except -1:
|
||||
cdef int _shift_cost(const State* s, int* gold) except -1:
|
||||
assert not at_eol(s)
|
||||
cost = 0
|
||||
cost += head_in_stack(s, s.i, gold)
|
||||
cost += children_in_stack(s, s.i, gold)
|
||||
if NON_MONOTONIC:
|
||||
cost += gold[s.stack[0]] == s.i
|
||||
return cost
|
||||
|
||||
|
||||
cdef int _right_cost(const State* s, const int* gold) except -1:
|
||||
cdef int _right_cost(const State* s, int* gold) except -1:
|
||||
assert s.stack_len >= 1
|
||||
cost = 0
|
||||
if gold[s.i] == s.stack[0]:
|
||||
|
@ -58,12 +48,10 @@ cdef int _right_cost(const State* s, const int* gold) except -1:
|
|||
cost += head_in_buffer(s, s.i, gold)
|
||||
cost += children_in_stack(s, s.i, gold)
|
||||
cost += head_in_stack(s, s.i, gold)
|
||||
if NON_MONOTONIC:
|
||||
cost += gold[s.stack[0]] == s.i
|
||||
return cost
|
||||
|
||||
|
||||
cdef int _left_cost(const State* s, const int* gold) except -1:
|
||||
cdef int _left_cost(const State* s, int* gold) except -1:
|
||||
assert s.stack_len >= 1
|
||||
cost = 0
|
||||
if gold[s.stack[0]] == s.i:
|
||||
|
@ -71,17 +59,11 @@ cdef int _left_cost(const State* s, const int* gold) except -1:
|
|||
|
||||
cost += head_in_buffer(s, s.stack[0], gold)
|
||||
cost += children_in_buffer(s, s.stack[0], gold)
|
||||
if NON_MONOTONIC and s.stack_len >= 2:
|
||||
cost += gold[s.stack[0]] == s.stack[-1]
|
||||
return cost
|
||||
|
||||
|
||||
cdef int _reduce_cost(const State* s, const int* gold) except -1:
|
||||
cdef int cost = 0
|
||||
cost += children_in_buffer(s, s.stack[0], gold)
|
||||
if NON_MONOTONIC:
|
||||
cost += head_in_buffer(s, s.stack[0], gold)
|
||||
return cost
|
||||
cdef int _reduce_cost(const State* s, int* gold) except -1:
|
||||
return children_in_buffer(s, s.stack[0], gold)
|
||||
|
||||
|
||||
cdef class TransitionSystem:
|
||||
|
@ -91,40 +73,38 @@ cdef class TransitionSystem:
|
|||
right_labels.sort()
|
||||
if 'ROOT' in right_labels:
|
||||
right_labels.pop(right_labels.index('ROOT'))
|
||||
if 'dep' in right_labels:
|
||||
right_labels.pop(right_labels.index('dep'))
|
||||
if 'ROOT' in left_labels:
|
||||
left_labels.pop(left_labels.index('ROOT'))
|
||||
if 'dep' in left_labels:
|
||||
left_labels.pop(left_labels.index('dep'))
|
||||
self.n_moves = 2 + len(left_labels) + len(right_labels)
|
||||
moves = <Transition*>self.mem.alloc(self.n_moves, sizeof(Transition))
|
||||
cdef int i = 0
|
||||
moves[i].move = SHIFT
|
||||
moves[i].label = 0
|
||||
moves[i].clas = i
|
||||
i += 1
|
||||
moves[i].move = REDUCE
|
||||
moves[i].label = 0
|
||||
moves[i].clas = i
|
||||
i += 1
|
||||
self.label_ids = {'ROOT': 0}
|
||||
self.label_ids = {'ROOT': 0, 'dep': -1}
|
||||
cdef int label_id
|
||||
for label_str in left_labels:
|
||||
label_id = self.label_ids.setdefault(label_str, len(self.label_ids))
|
||||
moves[i].move = LEFT
|
||||
moves[i].label = label_id
|
||||
moves[i].clas = i
|
||||
i += 1
|
||||
for label_str in right_labels:
|
||||
label_id = self.label_ids.setdefault(label_str, len(self.label_ids))
|
||||
moves[i].move = RIGHT
|
||||
moves[i].label = label_id
|
||||
moves[i].clas = i
|
||||
i += 1
|
||||
self._moves = moves
|
||||
|
||||
cdef int transition(self, State *s, const Transition* t) except -1:
|
||||
cdef int transition(self, State *s, const int clas) except -1:
|
||||
cdef const Transition* t = &self._moves[clas]
|
||||
if t.move == SHIFT:
|
||||
# Set the dep label, in case we need it after we reduce
|
||||
if NON_MONOTONIC:
|
||||
get_s0(s).dep_tag = t.label
|
||||
push_stack(s)
|
||||
elif t.move == LEFT:
|
||||
add_dep(s, s.i, s.stack[0], t.label)
|
||||
|
@ -133,12 +113,11 @@ cdef class TransitionSystem:
|
|||
add_dep(s, s.stack[0], s.i, t.label)
|
||||
push_stack(s)
|
||||
elif t.move == REDUCE:
|
||||
add_dep(s, s.stack[-1], s.stack[0], get_s0(s).dep_tag)
|
||||
pop_stack(s)
|
||||
else:
|
||||
raise StandardError(t.move)
|
||||
|
||||
cdef Transition best_valid(self, const weight_t* scores, const State* s) except *:
|
||||
cdef int best_valid(self, const weight_t* scores, const State* s) except -1:
|
||||
cdef bint[N_MOVES] valid
|
||||
valid[SHIFT] = _can_shift(s)
|
||||
valid[LEFT] = _can_left(s)
|
||||
|
@ -147,61 +126,59 @@ cdef class TransitionSystem:
|
|||
|
||||
cdef int best = -1
|
||||
cdef weight_t score = 0
|
||||
cdef weight_t best_r_score = -9000
|
||||
cdef int best_r_label = -1
|
||||
cdef int i
|
||||
for i in range(self.n_moves):
|
||||
if valid[self._moves[i].move] and (best == -1 or scores[i] > score):
|
||||
best = i
|
||||
score = scores[i]
|
||||
if self._moves[i].move == RIGHT and scores[i] > best_r_score:
|
||||
best_r_label = self._moves[i].label
|
||||
assert best >= 0
|
||||
cdef Transition t = self._moves[best]
|
||||
t.score = score
|
||||
if t.move == SHIFT:
|
||||
t.label = best_r_label
|
||||
return t
|
||||
return best
|
||||
|
||||
cdef Transition best_gold(self, Transition* guess, const weight_t* scores,
|
||||
const State* s,
|
||||
const int* gold_heads, const int* gold_labels) except *:
|
||||
# If we can create a gold dependency, only one action can be correct
|
||||
cdef int best_gold(self, const weight_t* scores, const State* s,
|
||||
int* gold_heads, int* gold_labels) except -1:
|
||||
cdef int[N_MOVES] unl_costs
|
||||
unl_costs[SHIFT] = _shift_cost(s, gold_heads) if _can_shift(s) else -1
|
||||
unl_costs[LEFT] = _left_cost(s, gold_heads) if _can_left(s) else -1
|
||||
unl_costs[RIGHT] = _right_cost(s, gold_heads) if _can_right(s) else -1
|
||||
unl_costs[REDUCE] = _reduce_cost(s, gold_heads) if _can_reduce(s) else -1
|
||||
|
||||
guess.cost = unl_costs[guess.move]
|
||||
cdef Transition t
|
||||
cdef int target_label
|
||||
cdef int i
|
||||
if gold_heads[s.stack[0]] == s.i:
|
||||
target_label = gold_labels[s.stack[0]]
|
||||
if guess.move == LEFT:
|
||||
guess.cost += guess.label != target_label
|
||||
for i in range(self.n_moves):
|
||||
t = self._moves[i]
|
||||
if t.move == LEFT and t.label == target_label:
|
||||
return t
|
||||
elif gold_heads[s.i] == s.stack[0]:
|
||||
target_label = gold_labels[s.i]
|
||||
if guess.move == RIGHT:
|
||||
guess.cost += guess.label != target_label
|
||||
for i in range(self.n_moves):
|
||||
t = self._moves[i]
|
||||
if t.move == RIGHT and t.label == target_label:
|
||||
return t
|
||||
|
||||
cdef int cost
|
||||
cdef int move
|
||||
cdef int label
|
||||
cdef int best = -1
|
||||
cdef weight_t score = -9000
|
||||
cdef int i
|
||||
for i in range(self.n_moves):
|
||||
t = self._moves[i]
|
||||
if unl_costs[t.move] == 0 and (best == -1 or scores[i] > score):
|
||||
move = self._moves[i].move
|
||||
label = self._moves[i].label
|
||||
if unl_costs[move] == 0:
|
||||
if move == SHIFT or move == REDUCE:
|
||||
cost = 0
|
||||
elif move == LEFT:
|
||||
if gold_heads[s.stack[0]] == s.i and gold_labels[s.stack[0]] != -1:
|
||||
cost = label != gold_labels[s.stack[0]]
|
||||
else:
|
||||
cost = 0
|
||||
elif move == RIGHT:
|
||||
if gold_heads[s.i] == s.stack[0] and gold_labels[s.i] != -1:
|
||||
cost = label != gold_labels[s.i]
|
||||
else:
|
||||
cost = 0
|
||||
else:
|
||||
raise StandardError("Unknown Move")
|
||||
if cost == 0 and (best == -1 or scores[i] > score):
|
||||
best = i
|
||||
score = scores[i]
|
||||
t = self._moves[best]
|
||||
t.score = score
|
||||
assert best >= 0
|
||||
return t
|
||||
|
||||
if best < 0:
|
||||
print unl_costs[SHIFT], unl_costs[REDUCE], unl_costs[LEFT], unl_costs[RIGHT]
|
||||
print s.stack_len
|
||||
print has_head(get_s0(s))
|
||||
print s.sent[s.stack[0]].head
|
||||
print s.stack[0], s.i
|
||||
print gold_heads[s.stack[0]], gold_heads[s.i]
|
||||
print gold_labels[s.i]
|
||||
print children_in_buffer(s, s.stack[0], gold_heads)
|
||||
print head_in_buffer(s, s.stack[0], gold_heads)
|
||||
raise StandardError
|
||||
return best
|
||||
|
|
|
@ -2,6 +2,8 @@
|
|||
# cython: embedsignature=True
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from os import path
|
||||
|
||||
from cython.operator cimport dereference as deref
|
||||
from cython.operator cimport preincrement as preinc
|
||||
|
||||
|
@ -28,6 +30,17 @@ cdef class Tokenizer:
|
|||
self.vocab = Vocab(self.get_props)
|
||||
self._load_special_tokenization(rules)
|
||||
|
||||
@classmethod
|
||||
def from_dir(cls, Vocab vocab, object data_dir):
|
||||
if not path.exists(data_dir):
|
||||
raise IOError("Directory %s not found -- cannot load Tokenizer." % data_dir)
|
||||
if not path.isdir(data_dir):
|
||||
raise IOError("Path %s is a file, not a dir -- cannot load Tokenizer." % data_dir)
|
||||
|
||||
assert path.exists(data_dir) and path.isdir(data_dir)
|
||||
rules, prefix_re, suffix_re, infix_re = util.read_lang_data(data_dir)
|
||||
return cls(vocab, rules, prefix_re, suffix_re, infix_re)
|
||||
|
||||
cpdef Tokens tokens_from_list(self, list strings):
|
||||
cdef int length = sum([len(s) for s in strings])
|
||||
cdef Tokens tokens = Tokens(self.vocab.strings, length)
|
||||
|
|
|
@ -1,6 +1,26 @@
|
|||
from libc.stdint cimport uint16_t, uint32_t, uint64_t, uintptr_t
|
||||
from libc.stdint cimport uint8_t
|
||||
|
||||
|
||||
# Google universal tag set
|
||||
cpdef enum univ_tag_t:
|
||||
NO_TAG
|
||||
ADJ
|
||||
ADV
|
||||
ADP
|
||||
CONJ
|
||||
DET
|
||||
NOUN
|
||||
NUM
|
||||
PRON
|
||||
PRT
|
||||
VERB
|
||||
X
|
||||
PUNCT
|
||||
EOL
|
||||
N_UNIV_TAGS
|
||||
|
||||
|
||||
ctypedef uint64_t hash_t
|
||||
ctypedef char* utf8_t
|
||||
ctypedef uint32_t attr_t
|
||||
|
@ -10,11 +30,3 @@ ctypedef uint16_t len_t
|
|||
ctypedef uint16_t tag_t
|
||||
|
||||
|
||||
cdef struct Morphology:
|
||||
uint8_t number
|
||||
uint8_t tenspect # Tense/aspect/voice
|
||||
uint8_t mood
|
||||
uint8_t gender
|
||||
uint8_t person
|
||||
uint8_t case
|
||||
uint8_t misc
|
||||
|
|
|
@ -1,34 +0,0 @@
|
|||
from preshed.maps cimport PreshMap
|
||||
from cymem.cymem cimport Pool
|
||||
from murmurhash.mrmr cimport hash64
|
||||
|
||||
from .typedefs cimport utf8_t, id_t, hash_t
|
||||
|
||||
|
||||
cdef struct Utf8Str:
|
||||
id_t i
|
||||
hash_t key
|
||||
utf8_t chars
|
||||
int length
|
||||
|
||||
|
||||
cdef struct UniStr:
|
||||
Py_UNICODE* chars
|
||||
size_t n
|
||||
hash_t key
|
||||
|
||||
|
||||
cdef inline void slice_unicode(UniStr* s, Py_UNICODE* chars, int start, int end) nogil:
|
||||
s.chars = &chars[start]
|
||||
s.n = end - start
|
||||
s.key = hash64(s.chars, s.n * sizeof(Py_UNICODE), 0)
|
||||
|
||||
|
||||
cdef class StringStore:
|
||||
cdef Pool mem
|
||||
cdef PreshMap _map
|
||||
cdef Utf8Str* strings
|
||||
cdef int size
|
||||
cdef int _resize_at
|
||||
|
||||
cdef const Utf8Str* intern(self, char* chars, int length) except NULL
|
|
@ -1,80 +0,0 @@
|
|||
from libc.string cimport memcpy
|
||||
|
||||
from murmurhash.mrmr cimport hash64
|
||||
import codecs
|
||||
|
||||
SEPARATOR = '\n|-SEP-|\n'
|
||||
|
||||
|
||||
cdef class StringStore:
|
||||
def __init__(self):
|
||||
self.mem = Pool()
|
||||
self._map = PreshMap()
|
||||
self._resize_at = 10000
|
||||
self.strings = <Utf8Str*>self.mem.alloc(self._resize_at, sizeof(Utf8Str))
|
||||
self.size = 1
|
||||
|
||||
property size:
|
||||
def __get__(self):
|
||||
return self.size-1
|
||||
|
||||
def __getitem__(self, object string_or_id):
|
||||
cdef bytes byte_string
|
||||
cdef const Utf8Str* utf8str
|
||||
if isinstance(string_or_id, int) or isinstance(string_or_id, long):
|
||||
if string_or_id < 1 or string_or_id >= self.size:
|
||||
raise IndexError(string_or_id)
|
||||
utf8str = &self.strings[<int>string_or_id]
|
||||
return utf8str.chars[:utf8str.length]
|
||||
elif isinstance(string_or_id, bytes):
|
||||
utf8str = self.intern(<char*>string_or_id, len(string_or_id))
|
||||
return utf8str.i
|
||||
elif isinstance(string_or_id, unicode):
|
||||
byte_string = string_or_id.encode('utf8')
|
||||
utf8str = self.intern(<char*>byte_string, len(byte_string))
|
||||
return utf8str.i
|
||||
else:
|
||||
raise TypeError(type(string_or_id))
|
||||
|
||||
cdef const Utf8Str* intern(self, char* chars, int length) except NULL:
|
||||
# 0 means missing, but we don't bother offsetting the index. We waste
|
||||
# slot 0 to simplify the code, because it doesn't matter.
|
||||
assert length != 0
|
||||
cdef hash_t key = hash64(chars, length * sizeof(char), 0)
|
||||
cdef void* value = self._map.get(key)
|
||||
cdef size_t i
|
||||
if value == NULL:
|
||||
if self.size == self._resize_at:
|
||||
self._resize_at *= 2
|
||||
self.strings = <Utf8Str*>self.mem.realloc(self.strings, self._resize_at * sizeof(Utf8Str))
|
||||
i = self.size
|
||||
self.strings[i].i = self.size
|
||||
self.strings[i].key = key
|
||||
self.strings[i].chars = <char*>self.mem.alloc(length, sizeof(char))
|
||||
memcpy(self.strings[i].chars, chars, length)
|
||||
self.strings[i].length = length
|
||||
self._map.set(key, <void*>self.size)
|
||||
self.size += 1
|
||||
else:
|
||||
i = <size_t>value
|
||||
return &self.strings[i]
|
||||
|
||||
def dump(self, loc):
|
||||
strings = []
|
||||
cdef Utf8Str* string
|
||||
cdef bytes py_string
|
||||
for i in range(self.size):
|
||||
string = &self.strings[i]
|
||||
py_string = string.chars[:string.length]
|
||||
strings.append(py_string.decode('utf8'))
|
||||
with codecs.open(loc, 'w', 'utf8') as file_:
|
||||
file_.write(SEPARATOR.join(strings))
|
||||
|
||||
def load(self, loc):
|
||||
with codecs.open(loc, 'r', 'utf8') as file_:
|
||||
strings = file_.read().split(SEPARATOR)
|
||||
cdef unicode string
|
||||
cdef bytes byte_string
|
||||
for string in strings[1:]:
|
||||
byte_string = string.encode('utf8')
|
||||
self.intern(byte_string, len(byte_string))
|
|
@ -11,8 +11,7 @@ def utf8open(loc, mode='r'):
|
|||
return codecs.open(loc, mode, 'utf8')
|
||||
|
||||
|
||||
def read_lang_data(name):
|
||||
data_dir = path.join(DATA_DIR, name)
|
||||
def read_lang_data(data_dir):
|
||||
with open(path.join(data_dir, 'specials.json')) as file_:
|
||||
tokenization = ujson.load(file_)
|
||||
prefix = read_prefix(data_dir)
|
||||
|
|
|
@ -19,6 +19,17 @@ cdef class Vocab:
|
|||
self.lexemes.push_back(&EMPTY_LEXEME)
|
||||
self.get_lex_props = get_props
|
||||
|
||||
@classmethod
|
||||
def from_dir(cls, object data_dir, object get_lex_props=None):
|
||||
if not path.exists(data_dir):
|
||||
raise IOError("Directory %s not found -- cannot load Vocab." % data_dir)
|
||||
if not path.isdir(data_dir):
|
||||
raise IOError("Path %s is a file, not a dir -- cannot load Vocab." % data_dir)
|
||||
cdef Vocab self = cls(get_props)
|
||||
self.strings.load(path.join(data_dir, 'strings'))
|
||||
self.load(path.join(data_dir, 'lexemes'))
|
||||
return self
|
||||
|
||||
def __len__(self):
|
||||
return self.lexemes.size()
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user