This commit is contained in:
Matthew Honnibal 2014-12-21 05:36:29 +11:00
parent d11c1edf8c
commit e1c1a4b868
42 changed files with 138 additions and 2382 deletions

View File

@ -1,135 +0,0 @@
from thinc.typedefs cimport atom_t
from .lang cimport Language
from .tokens cimport Tokens
from .tokens cimport TokenC
# Morphological feature values for English. The POS_TAGS table in the
# implementation file attaches members of these enums to Penn Treebank tags,
# e.g. 'VBZ' -> {'tenspect': PRESENT, 'person': THIRD}. Enum members are
# numbered from 0 in declaration order, so the leading NO_* member of each
# enum has the value 0.

# Grammatical person.
cpdef enum en_person_t:
NO_PERSON
FIRST
SECOND
THIRD
NON_THIRD
# Grammatical number.
cpdef enum en_number_t:
NO_NUMBER
SINGULAR
PLURAL
MASS
# Grammatical gender.
cpdef enum en_gender_t:
NO_GENDER
MASCULINE
FEMININE
NEUTER
# Grammatical case (POS_TAGS uses GENITIVE for 'POS', 'PRP$' and 'WP$').
cpdef enum en_case_t:
NO_CASE
NOMINATIVE
GENITIVE
ACCUSATIVE
REFLEXIVE
DEMONYM
# Combined tense/aspect value, stored under the 'tenspect' key in POS_TAGS.
cpdef enum en_tenspect_t:
NO_TENSE
BASE_VERB
PRESENT
PAST
PASSIVE
ING
MODAL
# Miscellaneous properties, stored under the 'misc' key in POS_TAGS.
cpdef enum misc_t:
NO_MISC
COMPARATIVE
SUPERLATIVE
RELATIVE
NAME
# Flags
# Each member is a bit position inside a lexeme's flags field.
# English.set_flags computes IS_ALPHA .. LIKE_NUMBER as
# `orth.<predicate>(string) << FLAG`; the OFT_* and IN_* bits are not set by
# that method. Consumers test a bit with `lex.flags & (1 << FLAG)`.
cpdef enum FlagID:
IS_ALPHA
IS_ASCII
IS_DIGIT
IS_LOWER
IS_PUNCT
IS_SPACE
IS_TITLE
IS_UPPER
LIKE_URL
LIKE_NUMBER
OFT_LOWER
OFT_TITLE
OFT_UPPER
IN_MALES
IN_FEMALES
IN_SURNAMES
IN_PLACES
IN_GAMES
IN_CELEBS
IN_NAMES
cpdef enum:
P2_sic
P2_cluster
P2_shape
P2_prefix
P2_suffix
P2_pos
P2_lemma
P2_pos_type
P1_sic
P1_cluster
P1_shape
P1_prefix
P1_suffix
P1_pos
P1_lemma
P1_pos_type
W_sic
W_cluster
W_shape
W_prefix
W_suffix
W_pos
W_lemma
W_pos_type
N1_sic
N1_cluster
N1_shape
N1_prefix
N1_suffix
N1_pos
N1_lemma
N1_pos_type
N2_sic
N2_cluster
N2_shape
N2_prefix
N2_suffix
N2_pos
N2_lemma
N2_pos_type
N_CONTEXT_FIELDS
# C-level declaration of the English language class. It extends Language with
# two base-noun-phrase predicates; both are C methods whose error return value
# is -1. In the implementation that accompanies this declaration both bodies
# are still `pass`.
cdef class English(Language):
cdef int is_base_np_end(self, const TokenC* token) except -1
cdef int is_outside_base_np(self, const TokenC* token) except -1

View File

@ -1,213 +0,0 @@
# cython: profile=True
# cython: embedsignature=True
'''Tokenize English text, using a scheme that differs from the Penn Treebank 3
scheme in several important respects:
* Whitespace is added as tokens, except for single spaces. e.g.,
>>> [w.string for w in EN.tokenize(u'\\nHello \\tThere')]
[u'\\n', u'Hello', u' ', u'\\t', u'There']
* Contractions are normalized, e.g.
>>> [w.string for w in EN.tokenize(u"isn't ain't won't he's")]
[u'is', u'not', u'are', u'not', u'will', u'not', u'he', u"__s"]
* Hyphenated words are split, with the hyphen preserved, e.g.:
>>> [w.string for w in EN.tokenize(u'New York-based')]
[u'New', u'York', u'-', u'based']
Other improvements:
* Email addresses, URLs, European-formatted dates and other numeric entities not
found in the PTB are tokenized correctly
* Heuristic handling of word-final periods (PTB expects sentence boundary detection
as a pre-process before tokenization.)
Take care to ensure your training and run-time data is tokenized according to the
same scheme. Tokenization problems are a major cause of poor performance for
NLP tools. If you're using a pre-trained model, the :py:mod:`spacy.ptb3` module
provides a fully Penn Treebank 3-compliant tokenizer.
'''
from __future__ import unicode_literals
from murmurhash.mrmr cimport hash64
cimport lang
from .typedefs cimport hash_t, id_t, flags_t
import orth
from .morphology cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT, VERB
from .morphology cimport X, PUNCT, EOL
from .tokens cimport Morphology
DEF USE_POS_CACHE = True
# Maps each fine-grained tag string (Penn Treebank tags, plus the 'NULL' and
# 'EOL' placeholders and punctuation tags) to a pair:
#   (coarse tag constant, dict of morphological properties).
# The property dict uses the keys 'misc', 'tenspect', 'number', 'case' and
# 'person'; an empty dict means no property is specified for the tag.
# NOTE(review): 'PRP' and 'PRP$' are mapped to NOUN although PRON is imported
# and used for 'WP'/'WP$' -- confirm whether that is intended.
POS_TAGS = {
'NULL': (NO_TAG, {}),
'EOL': (EOL, {}),
'CC': (CONJ, {}),
'CD': (NUM, {}),
'DT': (DET, {}),
'EX': (DET, {}),
'FW': (X, {}),
'IN': (ADP, {}),
'JJ': (ADJ, {}),
'JJR': (ADJ, {'misc': COMPARATIVE}),
'JJS': (ADJ, {'misc': SUPERLATIVE}),
'LS': (X, {}),
'MD': (VERB, {'tenspect': MODAL}),
'NN': (NOUN, {}),
'NNS': (NOUN, {'number': PLURAL}),
'NNP': (NOUN, {'misc': NAME}),
'NNPS': (NOUN, {'misc': NAME, 'number': PLURAL}),
'PDT': (DET, {}),
'POS': (PRT, {'case': GENITIVE}),
'PRP': (NOUN, {}),
'PRP$': (NOUN, {'case': GENITIVE}),
'RB': (ADV, {}),
'RBR': (ADV, {'misc': COMPARATIVE}),
'RBS': (ADV, {'misc': SUPERLATIVE}),
'RP': (PRT, {}),
'SYM': (X, {}),
'TO': (PRT, {}),
'UH': (X, {}),
'VB': (VERB, {}),
'VBD': (VERB, {'tenspect': PAST}),
'VBG': (VERB, {'tenspect': ING}),
'VBN': (VERB, {'tenspect': PASSIVE}),
'VBP': (VERB, {'tenspect': PRESENT}),
'VBZ': (VERB, {'tenspect': PRESENT, 'person': THIRD}),
'WDT': (DET, {'misc': RELATIVE}),
'WP': (PRON, {'misc': RELATIVE}),
'WP$': (PRON, {'misc': RELATIVE, 'case': GENITIVE}),
'WRB': (ADV, {'misc': RELATIVE}),
# Punctuation and bracket tags carry no morphological properties.
'!': (PUNCT, {}),
'#': (PUNCT, {}),
'$': (PUNCT, {}),
"''": (PUNCT, {}),
"(": (PUNCT, {}),
")": (PUNCT, {}),
"-LRB-": (PUNCT, {}),
"-RRB-": (PUNCT, {}),
".": (PUNCT, {}),
",": (PUNCT, {}),
"``": (PUNCT, {}),
":": (PUNCT, {}),
"?": (PUNCT, {}),
}
# Feature templates for the POS tagger. Each entry is a tuple of context-field
# indices (the P2_/P1_/W_/N1_/N2_ constants) that are combined into one
# feature; the field values themselves are written by fill_pos_context.
# NOTE(review): `(N1_pos_type,)` appears twice in a row below. One of the two
# was presumably meant to be `(P1_pos_type,)` -- confirm before changing, as
# altering the template list changes the feature set of any trained model.
POS_TEMPLATES = (
(W_sic,),
(P1_lemma, P1_pos),
(P2_lemma, P2_pos),
(N1_sic,),
(N2_sic,),
(W_suffix,),
(W_prefix,),
(P1_pos,),
(P2_pos,),
(P1_pos, P2_pos),
(P1_pos, W_sic),
(P1_suffix,),
(N1_suffix,),
(W_shape,),
(W_cluster,),
(N1_cluster,),
(N2_cluster,),
(P1_cluster,),
(P2_cluster,),
(W_pos_type,),
(N1_pos_type,),
(N1_pos_type,),
(P1_pos, W_pos_type, N1_pos_type),
)
cdef class English(Language):
    """English tokenizer, tightly coupled to lexicon.

    Attributes:
        name (unicode): The two letter code used by Wikipedia for the language.
        lexicon (Lexicon): The lexicon. Exposes the lookup method.
    """
    def get_props(self, unicode string):
        """Return the lexical properties computed for ``string``.

        The result is a dict with two keys: 'flags' (the bit field built by
        ``set_flags``) and 'dense' (the value of ``orth.word_shape``).
        """
        return {'flags': self.set_flags(string), 'dense': orth.word_shape(string)}

    def set_flags(self, unicode string):
        """Build the orthographic flag bit field for ``string``.

        Each ``orth`` predicate result is shifted to the bit position named
        by the matching FlagID member and OR-ed into the result.
        """
        cdef flags_t flags = 0
        flags |= orth.is_alpha(string) << IS_ALPHA
        flags |= orth.is_ascii(string) << IS_ASCII
        flags |= orth.is_digit(string) << IS_DIGIT
        flags |= orth.is_lower(string) << IS_LOWER
        flags |= orth.is_punct(string) << IS_PUNCT
        flags |= orth.is_space(string) << IS_SPACE
        flags |= orth.is_title(string) << IS_TITLE
        flags |= orth.is_upper(string) << IS_UPPER
        flags |= orth.like_url(string) << LIKE_URL
        flags |= orth.like_number(string) << LIKE_NUMBER
        return flags

    def set_pos(self, Tokens tokens):
        """Tag every token in ``tokens`` in place.

        For each position the context is filled, the tagger's prediction is
        stored in ``pos``, and the morphologizer then sets the token's
        morphology. Requires ``self.morphologizer`` to be set.
        """
        # The unused locals left over from a removed tag cache (bigram,
        # cache_key, cached, tagdict) have been dropped.
        cdef int i
        cdef atom_t[N_CONTEXT_FIELDS] context
        cdef TokenC* t = tokens.data
        assert self.morphologizer is not None
        for i in range(tokens.length):
            fill_pos_context(context, i, t)
            t[i].pos = self.pos_tagger.predict(context)
            self.morphologizer.set_morph(i, t)

    def train_pos(self, Tokens tokens, golds):
        """Run the tagger over ``tokens`` with gold tags and count the hits.

        ``golds[i]`` is the gold tag for token ``i``; it is handed to
        ``pos_tagger.predict`` as a one-element list (presumably so the
        tagger can update itself -- TODO confirm against the tagger).

        Returns the number of tokens whose predicted tag equals the gold tag.
        """
        cdef int i
        cdef atom_t[N_CONTEXT_FIELDS] context
        c = 0
        cdef TokenC* t = tokens.data
        for i in range(tokens.length):
            fill_pos_context(context, i, t)
            t[i].pos = self.pos_tagger.predict(context, [golds[i]])
            self.morphologizer.set_morph(i, t)
            c += t[i].pos == golds[i]
        return c

    cdef int is_base_np_end(self, const TokenC* token) except -1:
        # Not implemented yet.
        pass

    cdef int is_outside_base_np(self, const TokenC* token) except -1:
        # Not implemented yet.
        pass
cdef int fill_pos_context(atom_t* context, const int i, const TokenC* tokens) except -1:
    """Fill ``context`` with the fields of the five tokens around position ``i``.

    The window covers tokens i-2 .. i+2; each token's fields are written
    starting at the matching ``*_sic`` slot (P2, P1, W, N1, N2).
    """
    # NOTE(review): positions i-2 .. i+2 are read without a bounds check, so
    # this assumes the token array is padded on both sides -- TODO confirm.
    cdef const TokenC* centre = &tokens[i]
    _fill_from_token(&context[P2_sic], centre - 2)
    _fill_from_token(&context[P1_sic], centre - 1)
    _fill_from_token(&context[W_sic], centre)
    _fill_from_token(&context[N1_sic], centre + 1)
    _fill_from_token(&context[N2_sic], centre + 2)
# Copy the eight per-token feature values of `t` into context[0..7].
# The slot order (sic, cluster, shape, prefix, suffix, pos, lemma, pos_type)
# mirrors the order of the *_sic .. *_pos_type members in the context-field
# enum, which is why callers pass `&context[<X>_sic]` as the base pointer.
cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
context[0] = t.lex.sic
context[1] = t.lex.cluster
context[2] = t.lex.shape
context[3] = t.lex.prefix
context[4] = t.lex.suffix
context[5] = t.pos
context[6] = t.lemma
context[7] = t.lex.pos_type
EN = English('en')

View File

@ -1,44 +0,0 @@
from libcpp.vector cimport vector
from libcpp.pair cimport pair
from preshed.counter cimport count_t
from preshed.maps cimport PreshMap
from preshed.counter cimport PreshCounter
from cymem.cymem cimport Pool
from .lang cimport Lexicon
from .tokens cimport Tokens, TokenC
from .typedefs cimport id_t
from .lexeme cimport attr_id_t
from .typedefs cimport attr_t
from .typedefs cimport hash_t
from murmurhash.mrmr cimport hash64
ctypedef vector[pair[id_t, count_t]] count_vector_t
# C-level declaration of Index, which accumulates per-document counts of one
# lexeme attribute.
cdef class Index:
# Which lexeme attribute is counted.
cdef attr_id_t attr_id
# Largest attribute value seen by count() so far.
cdef readonly attr_t max_value
# One vector of (value, count) pairs per call to count().
cdef vector[count_vector_t] counts
cpdef int count(self, Tokens tokens) except -1
# C-level declaration of DecisionMemory: frequency counts of contexts and of
# (context, class) pairs, plus a map of memoised context -> class decisions.
cdef class DecisionMemory:
cdef int n_classes
cdef Pool mem
# _counts is keyed by context key; _class_counts by a hash of (context, class).
cdef PreshCounter _counts
cdef PreshCounter _class_counts
cdef PreshMap memos
cdef list class_names
cdef int inc(self, hash_t context_key, hash_t clas, count_t inc) except -1
cdef int find_best_class(self, count_t* counts, hash_t context_key) except -1
# Return the memoised class for a context key. load() stores `clas + 1`, so
# subtracting 1 recovers the class; a lookup that yields 0/NULL (presumably
# what the map returns for a missing key -- TODO confirm) comes back as -1.
cdef inline int get(self, hash_t context_key) nogil:
return <int><size_t>self.memos.get(context_key) - 1

View File

@ -1,120 +0,0 @@
"""Create a term-document matrix"""
cimport cython
from libc.stdint cimport int64_t
from libc.string cimport memmove
from cymem.cymem cimport Address
from .lexeme cimport Lexeme, get_attr
from .tokens cimport TokenC
from .typedefs cimport hash_t
from preshed.maps cimport MapStruct, Cell, map_get, map_set, map_init
from murmurhash.mrmr cimport hash64
cdef class Index:
    """Accumulate, per document, how often each value of one attribute occurs."""
    def __init__(self, attr_id_t attr_id):
        """Create an index over the lexeme attribute ``attr_id``."""
        self.max_value = 0
        self.attr_id = attr_id

    cpdef int count(self, Tokens tokens) except -1:
        """Count the attribute values in ``tokens`` and append them as a new row.

        Every token's attribute value is tallied; ``max_value`` is raised to
        the largest value seen. The tallies are then appended to
        ``self.counts`` as one vector of (value, count) pairs.
        """
        cdef attr_id_t which = self.attr_id
        cdef PreshCounter tally = PreshCounter(2 ** 8)
        cdef attr_t value
        cdef count_t freq
        cdef count_vector_t row
        cdef int pos
        for pos in range(tokens.length):
            value = get_attr(tokens.data[pos].lex, which)
            tally.inc(value, 1)
            if self.max_value < value:
                self.max_value = value
        for value, freq in tally:
            row.push_back(pair[id_t, count_t](value, freq))
        self.counts.push_back(row)
cdef class DecisionMemory:
    """Count how often each class follows a context and memoise sure decisions.

    Two counters are kept: ``_counts`` (keyed by context key) and
    ``_class_counts`` (keyed by a hash of the (context key, class) pair).
    ``memos`` maps a context key to ``class + 1`` for decisions loaded from
    disk.
    """
    def __init__(self, class_names):
        """Create an empty memory for the classes listed in ``class_names``."""
        self.class_names = class_names
        self.n_classes = len(class_names)
        self.mem = Pool()
        self._counts = PreshCounter()
        self._class_counts = PreshCounter()
        self.memos = PreshMap()

    def load(self, loc, thresh=50):
        """Load memoised decisions from the text file at ``loc``.

        Each line holds three whitespace-separated integers:
        ``freq key clas``. A line is kept when ``thresh`` is 0 or its
        frequency is at least ``thresh``.
        """
        cdef:
            count_t freq
            hash_t key
            int clas
        # Use a context manager so the file handle is closed deterministically.
        with open(loc) as file_:
            for line in file_:
                freq, key, clas = [int(p) for p in line.split()]
                if thresh == 0 or freq >= thresh:
                    # Stored as clas + 1 so that 0 can mean "no entry".
                    self.memos.set(key, <void*>(clas+1))

    def __getitem__(self, ids):
        """Return ``{class_name: count}`` for the two-id context ``ids``."""
        cdef id_t[2] context
        # Fixed: the ids were previously never copied in (the array was
        # assigned to itself), so an uninitialised context was hashed.
        context[0] = ids[0]
        context[1] = ids[1]
        cdef hash_t context_key = hash64(context, 2 * sizeof(id_t), 0)
        cdef hash_t[2] class_context
        class_context[0] = context_key
        counts = {}
        cdef id_t i
        # Fixed: the attribute is `class_names`, not `clas_names`.
        for i, clas in enumerate(self.class_names):
            class_context[1] = <hash_t>i
            key = hash64(class_context, sizeof(hash_t) * 2, 0)
            count = self._class_counts[key]
            counts[clas] = count
        return counts

    @cython.cdivision(True)
    def iter_contexts(self, float min_acc=0.99, count_t min_freq=10):
        """Yield ``(best_count, context_key, best_class)`` for reliable contexts.

        A context qualifies when it was seen at least ``min_freq`` times and
        its most frequent class accounts for at least ``min_acc`` of those
        occurrences.
        """
        cdef Address counts_addr = Address(self.n_classes, sizeof(count_t))
        cdef count_t* counts = <count_t*>counts_addr.ptr
        cdef MapStruct* context_counts = self._counts.c_map
        cdef hash_t context_key
        cdef count_t context_freq
        cdef int best_class
        cdef float acc
        cdef int i
        for i in range(context_counts.length):
            context_key = context_counts.cells[i].key
            context_freq = <count_t>context_counts.cells[i].value
            # A zero key is skipped; it appears to mark an unused cell.
            if context_key != 0 and context_freq >= min_freq:
                best_class = self.find_best_class(counts, context_key)
                # Fixed: cast before dividing. Both operands are integer
                # counts, so the quotient was truncated to 0 or 1 and the
                # min_acc threshold only matched perfectly pure contexts.
                acc = <float>counts[best_class] / context_freq
                if acc >= min_acc:
                    yield counts[best_class], context_key, best_class

    cdef int inc(self, hash_t context_key, hash_t clas, count_t inc) except -1:
        """Add ``inc`` to the context count and to the (context, class) count."""
        cdef hash_t context_and_class_key
        cdef hash_t[2] context_and_class
        context_and_class[0] = context_key
        context_and_class[1] = clas
        context_and_class_key = hash64(context_and_class, 2 * sizeof(hash_t), 0)
        self._counts.inc(context_key, inc)
        self._class_counts.inc(context_and_class_key, inc)

    cdef int find_best_class(self, count_t* counts, hash_t context_key) except -1:
        """Fill ``counts`` with the per-class counts and return the modal class.

        ``counts`` must have room for ``n_classes`` entries. On ties the
        highest class index wins (the comparison is ``>=``).
        """
        cdef hash_t[2] unhashed_key
        unhashed_key[0] = context_key
        cdef hash_t key
        cdef int clas
        # Initialised so the result is defined even when n_classes is 0.
        cdef int best = 0
        cdef count_t count
        # Same width as the counts it is compared against.
        cdef count_t mode = 0
        for clas in range(self.n_classes):
            unhashed_key[1] = <hash_t>clas
            key = hash64(unhashed_key, sizeof(hash_t) * 2, 0)
            count = self._class_counts[key]
            counts[clas] = count
            if count >= mode:
                mode = count
                best = clas
        return best

View File

@ -1,90 +0,0 @@
from os import path
# Suffix-rewriting rules used by lemmatize(). Each rule is a pair
# (old_suffix, replacement): when a word ends with old_suffix, the suffix is
# cut off and the replacement appended to form a candidate lemma. All rules
# are tried, so several candidates can result from one word.

# Rules for nouns.
NOUN_RULES = (
('s', ''),
('ses', 's'),
('ves', 'f'),
('xes', 'x'),
('zes', 'z'),
('ches', 'ch'),
('shes', 'sh'),
('men', 'man'),
('ies', 'y')
)
# Rules for verbs. The same suffix can appear with different replacements
# (e.g. "es" -> "e" and "es" -> ""); each produces its own candidate.
VERB_RULES = (
("s", ""),
("ies", "y"),
("es", "e"),
("es", ""),
("ed", "e"),
("ed", ""),
("ing", "e"),
("ing", "")
)
# Rules for adjectives (comparative and superlative endings).
ADJ_RULES = (
("er", ""),
("est", ""),
("er", "e"),
("est", "e")
)
class Lemmatizer(object):
    """Rule-based lemmatizer backed by index and exception files.

    For each of 'adj', 'adv', 'noun' and 'verb' the constructor reads
    ``index.<pos>`` and ``<pos>.exc`` from ``wn_dict_dir`` into the
    ``index`` and ``exc`` dicts, keyed by that part-of-speech name.
    """

    def __init__(self, wn_dict_dir):
        self.index = {}
        self.exc = {}
        for pos in ('adj', 'adv', 'noun', 'verb'):
            index_loc = path.join(wn_dict_dir, 'index.%s' % pos)
            self.index[pos] = read_index(index_loc)
            exc_loc = path.join(wn_dict_dir, '%s.exc' % pos)
            self.exc[pos] = read_exc(exc_loc)

    def noun(self, string):
        """Return the set of candidate lemmas of ``string`` as a noun."""
        known, irregular = self.index['noun'], self.exc['noun']
        return lemmatize(string, known, irregular, NOUN_RULES)

    def verb(self, string):
        """Return the set of candidate lemmas of ``string`` as a verb."""
        known, irregular = self.index['verb'], self.exc['verb']
        return lemmatize(string, known, irregular, VERB_RULES)

    def adj(self, string):
        """Return the set of candidate lemmas of ``string`` as an adjective."""
        known, irregular = self.index['adj'], self.exc['adj']
        return lemmatize(string, known, irregular, ADJ_RULES)
def lemmatize(string, index, exceptions, rules):
    """Return the set of candidate lemmas for ``string``.

    The word is lower-cased first. Candidates are: the word itself when it
    is in ``index``; every form listed for it in ``exceptions``; and every
    form produced by a matching (suffix, replacement) pair in ``rules`` that
    is found in ``index``. When nothing qualifies, the lower-cased word is
    returned as the only candidate.
    """
    word = string.lower()
    candidates = {word} if word in index else set()
    candidates.update(exceptions.get(word, []))
    for suffix, replacement in rules:
        if not word.endswith(suffix):
            continue
        stem = word[:len(word) - len(suffix)]
        candidate = stem + replacement
        if candidate in index:
            candidates.add(candidate)
    return candidates or {word}
def read_index(loc):
    """Read the set of single-word entries from the index file at ``loc``.

    Lines that start with a space are skipped, as are blank lines. The first
    whitespace-separated field of every other line is the entry; entries
    containing an underscore are left out.

    Returns:
        A set of entry strings.
    """
    index = set()
    # The context manager closes the file even if reading fails.
    with open(loc) as file_:
        for line in file_:
            if line.startswith(' '):
                continue
            pieces = line.split()
            if not pieces:
                # Blank line: nothing to index (previously an IndexError).
                continue
            word = pieces[0]
            if '_' not in word:
                index.add(word)
    return index
def read_exc(loc):
    """Read the exception list at ``loc`` into a dict.

    Lines that start with a space are skipped, as are blank lines. On every
    other line the first whitespace-separated field is the inflected form
    and the remaining fields are its lemmas.

    Returns:
        A dict mapping each inflected form to a tuple of lemma strings.
    """
    exceptions = {}
    # The context manager closes the file even if reading fails.
    with open(loc) as file_:
        for line in file_:
            if line.startswith(' '):
                continue
            pieces = line.split()
            if not pieces:
                # Blank line: no entry to record (previously an IndexError).
                continue
            exceptions[pieces[0]] = tuple(pieces[1:])
    return exceptions

View File

@ -36,11 +36,11 @@ cdef struct _Cached:
cdef class Morphologizer:
"""Given a POS tag and a Lexeme, find its lemma and morphological analysis.
"""
def __init__(self, StringStore strings, object lemmatizer, **kwargs):
def __init__(self, StringStore strings, object lemmatizer,
irregulars=None, tag_map=None, tag_names=None):
self.mem = Pool()
self.strings = strings
tag_map = kwargs['tag_map']
self.tag_names = kwargs['tag_names']
self.tag_names = tag_names
self.lemmatizer = lemmatizer
self._cache = PreshMapArray(len(self.tag_names))
self.tags = <PosTag*>self.mem.alloc(len(self.tag_names), sizeof(PosTag))
@ -55,9 +55,16 @@ cdef class Morphologizer:
self.tags[i].morph.person = props.get('person', 0)
self.tags[i].morph.case = props.get('case', 0)
self.tags[i].morph.misc = props.get('misc', 0)
#if path.exists(path.join(data_dir, 'morphs.json')):
# with open(path.join(data_dir, 'morphs.json')) as file_:
# self.load_exceptions(json.load(file_))
if irregulars is not None:
self.load_exceptions(irregulars)
@classmethod
def from_dir(cls, StringStore strings, object lemmatizer, data_dir):
tag_map = None
irregulars = None
tag_names = None
return cls(strings, lemmatizer, tag_map=tag_map, irregulars=irregulars,
tag_names=tag_names)
cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1:
if self.lemmatizer is None:
@ -86,7 +93,6 @@ cdef class Morphologizer:
cached.lemma = self.lemmatize(tag.pos, tokens[i].lex)
cached.morph = tag.morph
self._cache.set(tag.id, tokens[i].lex.sic, <void*>cached)
tokens[i].lemma = cached.lemma
tokens[i].morph = cached.morph

View File

View File

View File

View File

@ -1,169 +0,0 @@
from spacy.context cimport FIELD_IDS, Token
cdef Token P4 = FIELD_IDS.P4
cdef Token P3 = FIELD_IDS.P3
cdef Token P2 = FIELD_IDS.P2
cdef Token P1 = FIELD_IDS.P1
cdef Token N0 = FIELD_IDS.N0
cdef Token N1 = FIELD_IDS.N1
cdef Token N2 = FIELD_IDS.N2
cdef Token N3 = FIELD_IDS.N3
cdef Token N4 = FIELD_IDS.N4
"""
TEMPLATES = (
(N0.sic,),
(N0.cluster,),
(P1.pos,),
(P1.sic,),
(N1.norm,),
(N1.pos,),
(P1.ner,),
(P2.ner,),
(N0.cluster,),
(P1.cluster,),
(N1.cluster,),
(N0.is_alpha,),
(N0.is_digit,),
(N0.is_title,),
(N0.is_upper,),
(N0.is_title, N0.oft_title),
(N0.is_upper, N0.oft_upper),
(P1.cluster, N0.norm),
(N0.norm, N1.cluster),
(P1.ner, N0.pos),
(P2.ner, P1.ner, N0.pos),
(P2.pos, P1.pos, N0.sic),
(N0.sic, N1.pos, N2.pos)
)
"""
LOCAL = (
(N0.sic,),
(P1.sic,),
(N1.sic,),
(P2.sic,),
(N2.sic,),
(P3.sic,),
(N3.sic,),
(P4.sic,),
(N4.sic,),
(P1.sic, N0.sic,),
(N0.sic, N1.sic),
(N0.prefix,),
(N0.suffix,),
(P1.shape,),
(N0.shape,),
(N1.shape,),
(P1.shape, N0.shape,),
(N0.shape, P1.shape,),
(P1.shape, N0.shape, N1.shape),
(N2.shape,),
(P2.shape,),
(P3.shape,),
(N3.shape,),
(P4.shape,),
(N4.shape,),
(P2.norm, P1.norm, N0.norm),
(P1.norm, N0.norm, N1.norm),
(N0.norm, N1.norm, N2.norm)
)
BOOLS = (
(N0.is_title,),
)
HISTORY = (
(P1.ner,),
(P1.ner, N0.sic,),
(P2.ner,),
(P2.ner, P1.ner),
(P2.ner, P1.ner, N0.sic),
(P2.pos, P1.ner, N0.pos),
(P2.ner, P1.pos, N0.pos),
(P3.ner,),
(P4.ner,),
)
POS = (
(P4.pos,),
(P3.pos,),
(P2.pos,),
(P1.pos,),
(N0.pos,),
(N1.pos,),
(N2.pos,),
(N3.pos,),
(N4.pos,),
(P1.pos, N0.pos),
(N0.pos, N1.pos),
(P2.pos, P1.pos, N0.pos),
(P1.pos, N0.pos, N1.pos),
(N0.pos, N1.pos, N2.pos)
)
CLUSTERS = (
(P4.cluster,),
(P3.cluster,),
(P2.cluster,),
(P1.cluster,),
(N0.cluster,),
(N1.cluster,),
(N2.cluster,),
(N3.cluster,),
(N4.cluster,),
(P1.cluster, N0.cluster),
(N0.cluster, N1.cluster),
)
CLUSTER_POS = (
(P1.cluster, N0.pos),
(N0.pos, P1.cluster),
(N0.cluster, N1.pos),
(N0.pos, N1.cluster)
)
GAZ = (
(N0.in_males,),
(N0.in_females,),
(N0.in_surnames,),
(N0.in_places,),
(N0.in_games,),
(N0.in_celebs,),
(N0.in_names,),
(P1.in_males,),
(P1.in_females,),
(P1.in_surnames,),
(P1.in_places,),
(P1.in_games,),
(P1.in_celebs,),
(P1.in_names,),
(N1.in_males,),
(N1.in_females,),
(N1.in_surnames,),
(N1.in_places,),
(N1.in_games,),
(N1.in_celebs,),
(N1.in_names,),
)
TEMPLATES = LOCAL + HISTORY + CLUSTERS + POS + CLUSTER_POS + GAZ + BOOLS

View File

@ -1,15 +0,0 @@
from cymem.cymem cimport Pool
from .structs cimport State, Entity, Move
# C-level API for manipulating the entity-recognition State struct.
# Functions declared `except -1` use -1 as their error return value;
# init_state uses NULL.

# Open a new entity at the current token.
cdef int begin_entity(State* s, label) except -1
# Set the end of the current entity to just past the current token.
cdef int end_entity(State* s) except -1
# Allocate a State for a sentence of `sent_length` tokens from `mem`.
cdef State* init_state(Pool mem, int sent_length) except NULL
# Copy `source` into `dest`, growing dest's arrays when needed.
cdef int copy_state(Pool mem, State* dest, State* source) except -1
# Whether the current entity slot is considered open.
cdef bint entity_is_open(State *s) except -1
cdef int entity_is_sunk(State *s, Move* golds) except -1
# Whether every token has been consumed and no entity is open.
cdef int is_done(State* s) except -1

View File

@ -1,54 +0,0 @@
from libc.string cimport memcpy
cdef int begin_entity(State* s, label) except -1:
    """Open a new one-token entity with tag ``label`` at the current token.

    Advances the entity index ``s.j`` and initialises that slot to span
    ``[s.i, s.i + 1)``.
    """
    cdef int slot = s.j + 1
    s.j = slot
    s.ents[slot].start = s.i
    s.ents[slot].tag = label
    s.ents[slot].end = s.i + 1
cdef int end_entity(State* s) except -1:
    """Extend the current entity so that it ends just past the current token."""
    cdef int closing = s.i + 1
    s.ents[s.j].end = closing
cdef State* init_state(Pool mem, int sent_length) except NULL:
    """Allocate and return a State for a sentence of ``sent_length`` tokens.

    The State itself and its ``ents`` and ``tags`` arrays (one slot per
    token each) are allocated from ``mem``, which therefore owns them.
    """
    cdef State* s = <State*>mem.alloc(1, sizeof(State))
    s.ents = <Entity*>mem.alloc(sent_length, sizeof(Entity))
    s.tags = <int*>mem.alloc(sent_length, sizeof(int))
    s.length = sent_length
    # Fixed: the state was never returned, so the function fell off the end
    # and handed back NULL -- the very value declared as its error signal.
    return s
# Report whether the current entity slot is open, judged solely by its
# `start` field being non-zero.
# NOTE(review): begin_entity sets `start = s.i`, so an entity that begins at
# token 0 has start == 0 and reads as "not open"; and end_entity only updates
# `end`, so a finished entity with a non-zero start still reads as "open".
# Confirm the intended open/closed representation before relying on this.
cdef bint entity_is_open(State *s) except -1:
return s.ents[s.j].start != 0
# Report whether the currently open entity can no longer match the gold
# annotation. Only the "no open entity" case is implemented (returns False);
# with an open entity the function raises StandardError. The intended logic
# is kept below as commented-out code.
cdef int entity_is_sunk(State *s, Move* golds) except -1:
if not entity_is_open(s):
return False
# Unfinished: the gold comparison below is disabled.
raise StandardError
#cdef Entity* ent = &s.ents[s.j]
#cdef Move* gold = &golds[ent.start]
#if gold.action != BEGIN and gold.action != UNIT:
# return True
#elif gold.label != ent.label:
# return True
#else:
# return False
cdef int copy_state(Pool mem, State* dest, State* source) except -1:
    """Copy state ``source`` into state ``dest``.

    When ``dest`` is shorter than ``source`` its ``ents`` and ``tags``
    arrays are first re-allocated from ``mem`` to the source length.
    """
    cdef size_t ents_nbytes = source.length * sizeof(Entity)
    cdef size_t tags_nbytes = source.length * sizeof(int)
    if dest.length < source.length:
        dest.ents = <Entity*>mem.realloc(dest.ents, ents_nbytes)
        dest.tags = <int*>mem.realloc(dest.tags, tags_nbytes)
    memcpy(dest.ents, source.ents, ents_nbytes)
    memcpy(dest.tags, source.tags, tags_nbytes)
    dest.length = source.length
    dest.i = source.i
    dest.j = source.j
    dest.curr = source.curr
cdef int is_done(State* s) except -1:
    """Return true once all tokens are consumed and no entity is open."""
    if s.i < s.length:
        return False
    return not entity_is_open(s)

View File

@ -1,8 +0,0 @@
from cymem.cymem cimport Pool
# C-level declaration of NERAnnotation: per-token gold entity annotation.
cdef class NERAnnotation:
# Owns the three arrays below.
cdef Pool mem
# For each token inside an entity: that entity's start, end and label id.
# Tokens outside every entity hold -1 in all three arrays.
cdef int* starts
cdef int* ends
cdef int* labels
# The (start, end, label) triples the annotation was built from.
cdef readonly list entities

View File

@ -1,94 +0,0 @@
from libc.string cimport memset
cdef class NERAnnotation:
    """Gold entity annotation for one sentence, indexed by token."""
    def __init__(self, entities, length, entity_types):
        """Build the per-token arrays from ``entities``.

        Args:
            entities: iterable of (start, end, label) integer triples; each
                entity covers tokens ``start .. end - 1``.
            length: number of tokens in the sentence.
            entity_types: accepted for interface compatibility; not used here.
        """
        cdef int start, end, label
        cdef int i
        self.mem = Pool()
        self.starts = <int*>self.mem.alloc(length, sizeof(int))
        self.ends = <int*>self.mem.alloc(length, sizeof(int))
        self.labels = <int*>self.mem.alloc(length, sizeof(int))
        self.entities = entities
        # Filling every byte with 0xFF makes each int read as -1, the marker
        # for "token is outside any entity".
        memset(self.starts, -1, sizeof(int) * length)
        memset(self.ends, -1, sizeof(int) * length)
        memset(self.labels, -1, sizeof(int) * length)
        for start, end, label in entities:
            for i in range(start, end):
                self.starts[i] = start
                self.ends[i] = end
                self.labels[i] = label

    @classmethod
    def from_bilous(cls, tag_strs, entity_types):
        """Create an annotation from BILOU tag strings.

        Args:
            tag_strs: one tag per token, e.g. 'B-PER', 'L-PER', 'U-LOC', 'O'.
                'O' and '-' mark tokens outside any entity.
            entity_types: list of known label strings. A label's id is its
                position in this list; labels not yet present are appended,
                so the list may grow.
        """
        entities = []
        start = None
        for i, tag_str in enumerate(tag_strs):
            if tag_str == 'O' or tag_str == '-':
                continue
            # Split on the first hyphen only, so labels that themselves
            # contain a hyphen stay intact.
            move, label_str = tag_str.split('-', 1)
            # Fixed: list.index raises ValueError for a missing item and
            # never returns -1, so the registration branch was unreachable;
            # it also appended the integer id instead of the label string.
            try:
                label = entity_types.index(label_str)
            except ValueError:
                label = len(entity_types)
                entity_types.append(label_str)
            if move == 'U':
                assert start is None
                entities.append((i, i+1, label))
            elif move == 'B':
                assert start is None
                start = i
            elif move == 'L':
                assert start is not None
                entities.append((start, i+1, label))
                start = None
        return cls(entities, len(tag_strs), entity_types)
def read_iob(file_, entity_types, create_tokens):
    """Read IOB-annotated sentences from ``file_``.

    Sentences are separated by blank lines and hold one token per line with
    exactly four whitespace-separated columns: word, pos, chunk, ner.
    Blocks that start with '-DOCSTART-' are skipped.

    Args:
        file_: an open file-like object to read from.
        entity_types: list of label strings, passed on to
            ``NERAnnotation.from_bilous``.
        create_tokens: callable that turns a list of words into a tokens
            object.

    Returns:
        A list of (tokens, NERAnnotation) pairs, one per sentence.
    """
    annotated = []
    for block in file_.read().strip().split('\n\n'):
        if block.startswith('-DOCSTART-'):
            continue
        rows = []
        for line in block.split('\n'):
            word, pos, chunk, ner = line.split()
            rows.append((word, ner))
        words = [word for word, _ in rows]
        bilou = iob_to_bilou([ner for _, ner in rows])
        tokens = create_tokens(words)
        annot = NERAnnotation.from_bilous(bilou, entity_types)
        annotated.append((tokens, annot))
    return annotated
def iob_to_bilou(tags):
    """Convert a sequence of IOB tags into BILOU tags.

    'O' tags are kept. Each entity run becomes 'U-<label>' when it is one
    token long, otherwise 'B-<label>', zero or more 'I-<label>', and a
    closing 'L-<label>'. The input sequence is not modified.
    """
    out = []
    tags = list(tags)
    while tags:
        out.extend(_consume_os(tags))
        out.extend(_consume_ent(tags))
    return out


def _consume_os(tags):
    """Pop and yield the leading 'O' tags of ``tags`` (mutates the list)."""
    while tags and tags[0] == 'O':
        yield tags.pop(0)


def _consume_ent(tags):
    """Pop one entity run off the front of ``tags`` and return it as BILOU.

    The run is the first tag plus every directly following tag equal to its
    'I-' form. Returns an empty list when ``tags`` is empty.
    """
    if not tags:
        return []
    first = tags.pop(0)
    # Fixed: only the move prefix is normalised. Replacing every 'B' in the
    # tag corrupted labels that contain a 'B' (e.g. 'B-BAR' became 'I-IAR').
    target = 'I-' + first[2:] if first.startswith('B-') else first
    length = 1
    while tags and tags[0] == target:
        length += 1
        tags.pop(0)
    label = target[2:]
    if length == 1:
        return ['U-' + label]
    start = 'B-' + label
    end = 'L-' + label
    middle = ['I-%s' % label for _ in range(1, length - 1)]
    return [start] + middle + [end]

View File

@ -1,27 +0,0 @@
from cymem.cymem cimport Pool
from thinc.typedefs cimport class_t
from thinc.typedefs cimport weight_t
from .structs cimport State, Move
# Transition actions of the BILOU entity recogniser. MISSING is the first
# member (value 0); N_ACTIONS is a trailing sentinel that equals the number of
# real members and is used to size ACTION_NAMES.
cpdef enum ActionType:
MISSING
BEGIN
IN
LAST
UNIT
OUT
N_ACTIONS
# Mark as accepted the moves that are valid and agree with the gold moves;
# returns the number accepted (0 is the error return value).
cdef int set_accept_if_oracle(Move* moves, Move* golds, int n, State* s) except 0
# Mark as accepted the moves that are valid in state `s`; returns the number
# accepted (0 is the error return value).
cdef int set_accept_if_valid(Move* moves, int n, State* s) except 0
# Return the highest-scoring accepted move.
cdef Move* best_accepted(Move* moves, weight_t* scores, int n) except NULL
# Apply move `m` to state `s` and advance to the next token.
cdef int transition(State *s, Move* m) except -1
# Fill `moves` from a list of tag-name strings.
cdef int fill_moves(Move* moves, list tag_names) except -1

View File

@ -1,207 +0,0 @@
from __future__ import unicode_literals
from ._state cimport begin_entity
from ._state cimport end_entity
from ._state cimport entity_is_open
from ._state cimport entity_is_sunk
# Lookup table from ActionType value to its one-character tag prefix.
# fill_moves uses it in reverse (ACTION_NAMES.index) to turn a prefix such as
# 'B' back into the action value; '?' stands for MISSING.
ACTION_NAMES = ['' for _ in range(N_ACTIONS)]
ACTION_NAMES[<int>MISSING] = '?'
ACTION_NAMES[<int>BEGIN] = 'B'
ACTION_NAMES[<int>IN] = 'I'
ACTION_NAMES[<int>LAST] = 'L'
ACTION_NAMES[<int>UNIT] = 'U'
ACTION_NAMES[<int>OUT] = 'O'
cdef bint can_begin(State* s, int label):
    """BEGIN is valid only while no entity is open."""
    if entity_is_open(s):
        return False
    return True


cdef bint can_in(State* s, int label):
    """IN is valid inside an open entity that carries the same label."""
    if not entity_is_open(s):
        return False
    return s.curr.label == label


cdef bint can_last(State* s, int label):
    """LAST is valid inside an open entity that carries the same label."""
    if not entity_is_open(s):
        return False
    return s.curr.label == label


cdef bint can_unit(State* s, int label):
    """UNIT is valid only while no entity is open."""
    if entity_is_open(s):
        return False
    return True


cdef bint can_out(State* s, int label):
    """OUT is valid only while no entity is open."""
    if entity_is_open(s):
        return False
    return True
# Decide whether taking action `act` with label `tag` is acceptable given the
# gold action `g_act` with label `g_tag` for the same token.
#   next_act -- the gold action of the following token.
#   is_sunk  -- the value computed by entity_is_sunk for the current state.
# A gold action of MISSING accepts everything. The per-case comments below
# give the rule for each (action, gold action) pair; "P" and "R" presumably
# abbreviate precision and recall -- TODO confirm.
# When `act` is MISSING (or any value not handled below) no branch returns.
cdef bint is_oracle(ActionType act, int tag, ActionType g_act, int g_tag,
ActionType next_act, bint is_sunk):
if g_act == MISSING:
return True
if act == BEGIN:
if g_act == BEGIN:
# B, Gold B --> Label match
return tag == g_tag
else:
# B, Gold I --> False (P)
# B, Gold L --> False (P)
# B, Gold O --> False (P)
# B, Gold U --> False (P)
return False
elif act == IN:
if g_act == BEGIN:
# I, Gold B --> True (P of bad open entity sunk, R of this entity sunk)
return True
elif g_act == IN:
# I, Gold I --> True (label forced by prev, if mismatch, P and R both sunk)
return True
elif g_act == LAST:
# I, Gold L --> True iff this entity sunk and next tag == O
return is_sunk and (next_act == OUT or next_act == MISSING)
elif g_act == OUT:
# I, Gold O --> True iff next tag == O
return next_act == OUT or next_act == MISSING
elif g_act == UNIT:
# I, Gold U --> True iff next tag == O
# NOTE(review): unlike the two cases above, MISSING is not accepted as the
# next action here -- confirm whether that difference is intended.
return next_act == OUT
elif act == LAST:
if g_act == BEGIN:
# L, Gold B --> True
return True
elif g_act == IN:
# L, Gold I --> True iff this entity sunk
return is_sunk
elif g_act == LAST:
# L, Gold L --> True
return True
elif g_act == OUT:
# L, Gold O --> True
return True
elif g_act == UNIT:
# L, Gold U --> True
return True
elif act == OUT:
if g_act == BEGIN:
# O, Gold B --> False
return False
elif g_act == IN:
# O, Gold I --> True
return True
elif g_act == LAST:
# O, Gold L --> True
return True
elif g_act == OUT:
# O, Gold O --> True
return True
elif g_act == UNIT:
# O, Gold U --> False
return False
elif act == UNIT:
if g_act == UNIT:
# U, Gold U --> True iff tag match
return tag == g_tag
else:
# U, Gold B --> False
# U, Gold I --> False
# U, Gold L --> False
# U, Gold O --> False
return False
cdef int set_accept_if_valid(Move* moves, int n_classes, State* s) except 0:
    """Flag every move that is valid in state ``s`` and return how many are.

    ``moves[0]`` is always rejected. For moves 1 .. n_classes-1 the
    ``accept`` field is set from the ``can_*`` predicate that matches the
    move's action; a move with any other action keeps its current flag.
    Asserts that at least one move ends up accepted.
    """
    cdef int accepted = 0
    cdef int idx
    cdef Move* move
    moves[0].accept = False
    for idx in range(1, n_classes):
        move = &moves[idx]
        if move.action == BEGIN:
            move.accept = can_begin(s, move.label)
        elif move.action == IN:
            move.accept = can_in(s, move.label)
        elif move.action == LAST:
            move.accept = can_last(s, move.label)
        elif move.action == UNIT:
            move.accept = can_unit(s, move.label)
        elif move.action == OUT:
            move.accept = can_out(s, move.label)
        accepted += move.accept
    assert accepted != 0
    return accepted
cdef int set_accept_if_oracle(Move* moves, Move* golds, int n_classes, State* s) except 0:
    """Flag the moves that are both valid and consistent with the gold moves.

    Validity is established first via ``set_accept_if_valid``; each valid
    move is then kept only when ``is_oracle`` accepts it against the gold
    move of the current token. Returns the number of accepted moves and
    asserts that it is not zero.
    """
    cdef Move* g = &golds[s.i]
    cdef ActionType next_act
    # Fixed: look ahead only when a following token exists. The old guard
    # (`s.i < s.length`) is always true while a token is being processed, so
    # on the last token golds[s.i + 1] was read past the end of the sentence.
    if s.i + 1 < s.length:
        next_act = <ActionType>golds[s.i + 1].action
    else:
        next_act = OUT
    cdef bint is_sunk = entity_is_sunk(s, golds)
    cdef Move* m
    cdef int n_accept = 0
    cdef int i
    set_accept_if_valid(moves, n_classes, s)
    for i in range(1, n_classes):
        m = &moves[i]
        if not m.accept:
            continue
        m.accept = is_oracle(<ActionType>m.action, m.label, <ActionType>g.action,
                             g.label, next_act, is_sunk)
        n_accept += m.accept
    assert n_accept != 0
    return n_accept
cdef Move* best_accepted(Move* moves, weight_t* scores, int n) except NULL:
    """Return a pointer to the highest-scoring accepted move.

    Moves 1 .. n-1 are considered; the score of move ``k`` is
    ``scores[k - 1]``. On equal scores the earliest move wins. Raises
    StandardError when no move is accepted.
    """
    cdef int first_accept = 1
    while first_accept < n and not moves[first_accept].accept:
        first_accept += 1
    if first_accept >= n:
        raise StandardError
    cdef int best = first_accept
    cdef weight_t score = scores[first_accept-1]
    cdef int k
    for k in range(first_accept+1, n):
        if not moves[k].accept:
            continue
        if scores[k-1] > score:
            score = scores[k-1]
            best = k
    return &moves[best]
cdef int transition(State *s, Move* move) except -1:
    """Apply ``move`` to ``s`` and step to the next token.

    BEGIN opens an entity, LAST closes the current one, UNIT does both;
    IN and OUT leave the entities untouched. In every case the move's class
    is recorded for the current token and ``s.i`` is advanced by one.
    """
    cdef bint opens = move.action == BEGIN or move.action == UNIT
    cdef bint closes = move.action == LAST or move.action == UNIT
    if opens:
        begin_entity(s, move.label)
    if closes:
        end_entity(s)
    s.tags[s.i] = move.clas
    s.i += 1
def get_n_moves(n_tags):
    """Return the number of moves for ``n_tags`` tags: four per tag, plus one."""
    per_tag = 4
    return per_tag * n_tags + 1
cdef int fill_moves(Move* moves, list tag_names) except -1:
    """Fill ``moves[i]`` from ``tag_names[i]`` for every tag name.

    A tag name is either '<action>-<label>' (e.g. 'B-PER'), 'O', or one of
    the placeholders 'NULL' / 'EOL' (treated as the '?' action). Labels are
    numbered in order of first appearance, with '-' reserved as label 0.
    ``moves`` must have room for ``len(tag_names)`` entries.

    Raises:
        StandardError: for a tag name in none of the accepted forms.
    """
    cdef Move* m
    label_names = {'-': 0}
    for i, tag_name in enumerate(tag_names):
        m = &moves[i]
        if '-' in tag_name:
            # Split on the first hyphen only, so a label that itself contains
            # a hyphen no longer fails with an unpacking error.
            action_str, label = tag_name.split('-', 1)
        elif tag_name == 'O':
            action_str = 'O'
            label = '-'
        elif tag_name == 'NULL' or tag_name == 'EOL':
            action_str = '?'
            label = '-'
        else:
            raise StandardError(tag_name)
        m.action = ACTION_NAMES.index(action_str)
        m.label = label_names.setdefault(label, len(label_names))
        m.clas = i

View File

@ -1,155 +0,0 @@
from thinc.typedefs cimport atom_t
from ..typedefs cimport hash_t
from ..tokens cimport Tokens
from ..lexeme cimport Lexeme
from .structs cimport State
cpdef enum:
T_sic
T_cluster
T_norm
T_shape
T_asciied
T_prefix
T_suffix
T_length
T_postype
T_nertype
T_sensetype
T_is_alpha
T_is_ascii
T_is_digit
T_is_lower
T_is_punct
T_is_space
T_is_title
T_is_upper
T_like_url
T_like_number
T_oft_lower
T_oft_title
T_oft_upper
T_in_males
T_in_females
T_in_surnames
T_in_places
T_in_celebs
T_in_names
T_pos
T_sense
T_ner
cpdef enum:
P2_sic
P2_cluster
P2_norm
P2_shape
P2_prefix
P2_suffix
P2_length
P2_postype
P2_is_alpha
P2_is_digit
P2_is_lower
P2_is_punct
P2_is_title
P2_is_upper
P2_like_number
P2_pos
P1_sic
P1_cluster
P1_norm
P1_shape
P1_prefix
P1_suffix
P1_length
P1_postype
P1_is_alpha
P1_is_digit
P1_is_lower
P1_is_punct
P1_is_title
P1_is_upper
P1_like_number
P1_pos
W_sic
W_cluster
W_norm
W_shape
W_prefix
W_suffix
W_length
W_postype
W_is_alpha
W_is_digit
W_is_lower
W_is_punct
W_is_space
W_is_title
W_is_upper
W_like_number
W_pos
N1_sic
N1_cluster
N1_norm
N1_shape
N1_prefix
N1_suffix
N1_length
N1_postype
N1_is_alpha
N1_is_ascii
N1_is_digit
N1_is_lower
N1_is_punct
N1_is_space
N1_is_title
N1_is_upper
N1_like_number
N1_pos
N2_sic
N2_cluster
N2_norm
N2_shape
N2_asciied
N2_prefix
N2_suffix
N2_length
N2_postype
N2_is_alpha
N2_is_digit
N2_is_lower
N2_is_punct
N2_is_space
N2_is_title
N2_is_upper
N2_like_number
N2_pos
N2_sense
E_label
E0_sic
E0_cluster
E0_pos
E1_sic
E1_cluster
E1_pos
E_last_sic
E_last_cluster
E_last_pos
N_FIELDS
cdef int fill_context(atom_t* context, State* s, Tokens tokens) except -1

View File

@ -1,77 +0,0 @@
from libc.string cimport memset
from murmurhash.mrmr cimport hash64
from ._state cimport entity_is_open
from ..lexeme cimport *
# Write the features of one token into `c`, using the T_* enum members as
# offsets from `c`. `lex` supplies the lexical attributes and `pos` the tag.
# Flag features hold the masked bit (`flags & (1 << FLAG)`), i.e. 0 or the bit
# value itself rather than 0/1. T_is_ascii and T_ner are not written here.
# NOTE(review): fill_context passes `&context[P2_sic]`, `&context[P1_sic]`,
# ... as `c`, but the T_* offsets reach up to T_sense while the per-token
# groups declared next to them (P2_*, P1_*, W_*, N1_*, N2_*) are shorter and
# ordered differently (e.g. most have no *_asciied slot). The writes therefore
# appear to spill into the following group, and for the last group possibly
# past N_FIELDS -- confirm the intended layout before using these features.
cdef int _fill_token(atom_t* c, Lexeme* lex, atom_t pos):
c[T_sic] = lex.sic
c[T_cluster] = lex.cluster
# Fall back to the word shape when prob is 0 or below -10.
c[T_norm] = lex.norm if (lex.prob != 0 and lex.prob >= -10) else lex.shape
c[T_shape] = lex.shape
c[T_asciied] = lex.asciied
c[T_prefix] = lex.prefix
c[T_suffix] = lex.suffix
c[T_length] = lex.length
c[T_postype] = lex.postype
c[T_nertype] = 0
c[T_sensetype] = 0
c[T_is_alpha] = lex.flags & (1 << IS_ALPHA)
c[T_is_digit] = lex.flags & (1 << IS_DIGIT)
c[T_is_lower] = lex.flags & (1 << IS_LOWER)
c[T_is_punct] = lex.flags & (1 << IS_PUNCT)
c[T_is_space] = lex.flags & (1 << IS_SPACE)
c[T_is_title] = lex.flags & (1 << IS_TITLE)
c[T_is_upper] = lex.flags & (1 << IS_UPPER)
c[T_like_url] = lex.flags & (1 << LIKE_URL)
c[T_like_number] = lex.flags & (1 << LIKE_NUMBER)
c[T_oft_lower] = lex.flags & (1 << OFT_LOWER)
c[T_oft_title] = lex.flags & (1 << OFT_TITLE)
c[T_oft_upper] = lex.flags & (1 << OFT_UPPER)
c[T_in_males] = lex.flags & (1 << IN_MALES)
c[T_in_females] = lex.flags & (1 << IN_FEMALES)
c[T_in_surnames] = lex.flags & (1 << IN_SURNAMES)
c[T_in_places] = lex.flags & (1 << IN_PLACES)
c[T_in_celebs] = lex.flags & (1 << IN_CELEBS)
c[T_in_names] = lex.flags & (1 << IN_NAMES)
c[T_pos] = pos
c[T_sense] = 0
# Write a reduced, four-value description of a token into c[0..3]:
# sic, cluster, shape and the tag `pos`.
# NOTE(review): no caller of this helper is visible in this file.
cdef int _fill_outer_token(atom_t* c, Lexeme* lex, atom_t pos):
c[0] = lex.sic
c[1] = lex.cluster
c[2] = lex.shape
c[3] = pos
cdef int fill_context(atom_t* context, State* s, Tokens tokens) except -1:
    """Fill ``context`` with the features for the current parser state.

    All ``N_FIELDS`` slots are zeroed first. The five tokens around the
    current position (i-2 .. i+2) are then written, followed -- only when
    an entity is open -- by the entity's label and the sic/cluster/pos of
    its first token, its last token so far, and (if it already spans more
    than one token) its second token. Always returns 1.
    """
    # NOTE(review): positions i-2 .. i+2 are read without a bounds check, so
    # this assumes the token arrays are padded on both sides -- TODO confirm.
    cdef int i = s.i
    memset(context, 0, N_FIELDS * sizeof(atom_t))
    _fill_token(&context[P2_sic], tokens.lex[i-2], tokens.pos[i-2])
    _fill_token(&context[P1_sic], tokens.lex[i-1], tokens.pos[i-1])
    _fill_token(&context[W_sic], tokens.lex[i], tokens.pos[i])
    _fill_token(&context[N1_sic], tokens.lex[i+1], tokens.pos[i+1])
    _fill_token(&context[N2_sic], tokens.lex[i+2], tokens.pos[i+2])
    if not entity_is_open(s):
        return 1
    context[E_label] = s.curr.label
    # First token of the open entity.
    context[E0_sic] = tokens.lex[s.curr.start].sic
    context[E0_cluster] = tokens.lex[s.curr.start].cluster
    context[E0_pos] = tokens.pos[s.curr.start]
    # Most recent token of the open entity.
    context[E_last_sic] = tokens.lex[s.i-1].sic
    context[E_last_cluster] = tokens.lex[s.i-1].cluster
    context[E_last_pos] = tokens.pos[s.i-1]
    # Second token, once the entity is at least two tokens long.
    if (s.curr.start + 1) < s.i:
        context[E1_sic] = tokens.lex[s.curr.start+1].sic
        context[E1_cluster] = tokens.lex[s.curr.start+1].cluster
        context[E1_pos] = tokens.pos[s.curr.start+1]
    return 1

View File

View File

@ -1,107 +0,0 @@
from .context import *
LOCAL = (
(W_sic,),
(P1_sic,),
(N1_sic,),
(P2_sic,),
(N2_sic,),
(P1_sic, W_sic,),
(W_sic, N1_sic),
(W_prefix,),
(W_suffix,),
(P1_shape,),
(W_shape,),
(N1_shape,),
(P1_shape, W_shape,),
(W_shape, P1_shape,),
(P1_shape, W_shape, N1_shape),
(N2_shape,),
(P2_shape,),
(P2_norm, P1_norm, W_norm),
(P1_norm, W_norm, N1_norm),
(W_norm, N1_norm, N2_norm)
)
POS = (
(P2_pos,),
(P1_pos,),
(W_pos,),
(N1_pos,),
(N2_pos,),
(P1_pos, W_pos),
(W_pos, N1_pos),
(P2_pos, P1_pos, W_pos),
(P1_pos, W_pos, N1_pos),
(W_pos, N1_pos, N2_pos)
)
CLUSTERS = (
(P2_cluster,),
(P1_cluster,),
(W_cluster,),
(N1_cluster,),
(N2_cluster,),
(P1_cluster, W_cluster),
(W_cluster, N1_cluster),
)
CLUSTER_POS = (
(P1_cluster, W_pos),
(W_pos, P1_cluster),
(W_cluster, N1_pos),
(W_pos, N1_cluster)
)
STATE = (
(E0_sic,),
(E0_cluster,),
(E0_pos,),
(E_last_sic,),
(E_last_cluster,),
(E_last_pos,),
(E0_sic, W_sic),
(E0_cluster, W_cluster),
(E0_pos, W_pos),
(E_last_sic, W_sic),
(E_last_pos, W_pos),
(E0_pos, E_last_pos, W_pos),
(E0_cluster, E_last_cluster, W_cluster),
(E0_sic, E_last_sic),
(E0_pos, E_last_pos),
(E0_cluster, E_last_cluster),
(E0_pos, E_last_cluster),
(E0_cluster, E_last_pos),
(E1_sic,),
(E1_cluster,),
(E1_pos,),
(E0_sic, E1_sic),
(E0_sic, E1_pos,),
(E0_pos, E1_sic,),
(E0_pos, E1_pos),
(E_label,),
(E_label, W_sic),
(E_label, W_pos),
(E_label, W_cluster),
(E_label, W_shape),
(E_label, E_last_sic),
(E_label, E0_pos, E_last_pos),
)
TEMPLATES = LOCAL + CLUSTERS + POS + CLUSTER_POS + STATE

View File

@ -1,29 +0,0 @@
from cymem.cymem cimport Pool
from thinc.features cimport Extractor
from thinc.learner cimport LinearModel
from thinc.typedefs cimport *
from ..tokens cimport Tokens
from ..typedefs cimport *
from .structs cimport Move
from .annot cimport NERAnnotation
# C-level attribute and method declarations for the NER parser extension type.
cdef class NERParser:
# Memory pool; the implementation allocates the C arrays below from it.
cdef Pool mem
# Feature extractor built from the configured templates.
cdef Extractor extractor
# Linear model that scores the candidate moves.
cdef LinearModel model
# NOTE(review): the visible implementation never assigns tag_names, yet it
# reads len(self.tag_names) -- confirm whether this attribute is still used.
cdef readonly list tag_names
# Entity type names read from the model's config.json.
cdef readonly list entity_types
# Number of moves, computed by get_n_moves() from the entity types.
cdef readonly int n_classes
# Array of n_classes moves, populated by fill_moves().
cdef Move* _moves
# Scratch buffers reused for every token: context atoms, extracted feature
# ids, their values, and the per-class scores.
cdef atom_t* _context
cdef feat_t* _feats
cdef weight_t* _values
cdef weight_t* _scores
# NOTE(review): the implementation defines these as
# `cpdef int train(self, Tokens tokens, gold_classes)` and
# `cpdef int set_tags(self, Tokens tokens) except -1`; the declarations here
# disagree with that (return type, second argument) -- confirm which is current.
cpdef list train(self, Tokens tokens, NERAnnotation annot)
cpdef list set_tags(self, Tokens tokens)

View File

@ -1,81 +0,0 @@
cimport cython
import random
import os
from os import path
import shutil
import json
from thinc.features cimport ConjFeat
from ..context cimport fill_context
from ..context cimport N_FIELDS
from .moves cimport Move
from .moves cimport fill_moves, transition, best_accepted
from .moves cimport set_accept_if_valid, set_accept_if_oracle
from .moves import get_n_moves
from ._state cimport State
from ._state cimport init_state
cdef class NERParser:
def __init__(self, model_dir):
    """Set up the parser from the files in `model_dir`.

    Reads `config.json` (keys `templates` and `entity_types`), builds the
    feature extractor and the move table, loads the weights file `model` if
    it exists, and allocates the per-token scratch buffers.
    """
    self.mem = Pool()
    # Use a context manager so the config file handle is closed promptly
    # instead of being left to the garbage collector.
    with open(path.join(model_dir, 'config.json')) as file_:
        cfg = json.load(file_)
    templates = cfg['templates']
    self.entity_types = cfg['entity_types']
    self.extractor = Extractor(templates, [ConjFeat] * len(templates))
    self.n_classes = get_n_moves(len(self.entity_types))
    self._moves = <Move*>self.mem.alloc(self.n_classes, sizeof(Move))
    # fill_moves is declared as (moves, n, entity_types): pass the size of
    # the array and the entity type names themselves.
    fill_moves(self._moves, self.n_classes, self.entity_types)
    # Size the model from the move table. `tag_names` is never assigned on
    # this class, so len(self.tag_names) could not be evaluated.
    # NOTE(review): best_accepted() reads scores[i-1] for move i, so
    # n_classes - 1 scores would suffice -- confirm the intended class count.
    self.model = LinearModel(self.n_classes)
    if path.exists(path.join(model_dir, 'model')):
        self.model.load(path.join(model_dir, 'model'))
    self._context = <atom_t*>self.mem.alloc(N_FIELDS, sizeof(atom_t))
    # One spare slot beyond extractor.n in both feature buffers.
    self._feats = <feat_t*>self.mem.alloc(self.extractor.n+1, sizeof(feat_t))
    self._values = <weight_t*>self.mem.alloc(self.extractor.n+1, sizeof(weight_t))
    self._scores = <weight_t*>self.mem.alloc(self.model.nr_class, sizeof(weight_t))
cpdef int train(self, Tokens tokens, gold_classes):
    """Update the model from one sentence.

    `gold_classes` is a sequence of move class ids. For each state the
    best-scoring valid move (the guess) is compared with the best-scoring
    oracle move (the gold); when they differ, the features are counted +1
    for the gold class and -1 for the guessed class and the model is
    updated. The guessed move is then applied to the state.
    """
    cdef Pool mem = Pool()
    cdef State* s = init_state(mem, tokens.length)
    cdef Move* golds = <Move*>mem.alloc(len(gold_classes), sizeof(Move))
    cdef Move* guess
    cdef Move* gold
    for i, clas in enumerate(gold_classes):
        # The move table lives in `_moves` (there is no `moves` attribute),
        # and fill_moves() stores class id i at index i, so index by `clas`
        # directly. The Move struct's id field is called `clas`.
        golds[i] = self._moves[clas]
        assert golds[i].clas == clas
    while s.i < tokens.length:
        fill_context(self._context, s.i, tokens)
        self.extractor.extract(self._feats, self._values, self._context, NULL)
        self.model.score(self._scores, self._feats, self._values)
        set_accept_if_valid(self._moves, self.n_classes, s)
        guess = best_accepted(self._moves, self._scores, self.n_classes)
        # TODO: this call does not match the declared signature
        # set_accept_if_oracle(moves, n, s, g_starts, g_ends, g_labels);
        # it needs gold start/end/label arrays that this method does not have.
        set_accept_if_oracle(self._moves, golds, self.n_classes, s) # TODO
        gold = best_accepted(self._moves, self._scores, self.n_classes)
        if guess.clas == gold.clas:
            self.model.update({})
            # NOTE(review): this ends training on the sentence at the first
            # correct guess, before the move is applied -- confirm that a
            # plain "skip the update" was not intended instead.
            return 0
        counts = {guess.clas: {}, gold.clas: {}}
        self.extractor.count(counts[gold.clas], self._feats, 1)
        self.extractor.count(counts[guess.clas], self._feats, -1)
        self.model.update(counts)
        transition(s, guess)
        tokens.ner[s.i-1] = s.tags[s.i-1]
cpdef int set_tags(self, Tokens tokens) except -1:
    """Tag `tokens` greedily with the current model.

    At each step the best-scoring valid move is applied and the tag of the
    token just before the state's position is copied into `tokens.ner`.
    """
    cdef Pool mem = Pool()
    cdef State* state = init_state(mem, tokens.length)
    cdef Move* chosen
    # NOTE(review): the loop stops as soon as state.i reaches the sentence
    # length, so an entity still open at that point is not REDUCEd here.
    while state.i < tokens.length:
        fill_context(self._context, state.i, tokens)
        self.extractor.extract(self._feats, self._values, self._context, NULL)
        self.model.score(self._scores, self._feats, self._values)
        set_accept_if_valid(self._moves, self.n_classes, state)
        chosen = best_accepted(self._moves, self._scores, self.n_classes)
        transition(state, chosen)
        tokens.ner[state.i-1] = state.tags[state.i-1]

View File

@ -1,26 +0,0 @@
from cymem.cymem cimport Pool
from thinc.typedefs cimport class_t
from thinc.typedefs cimport weight_t
from .structs cimport State, Move
# Kinds of move the NER transition system can make.
cpdef enum ActionType:
# Placeholder action of the reserved class 0; never accepted.
MISSING
# Consume the current token into an entity (opening one if none is open).
SHIFT
# Close the currently open entity; does not consume a token.
REDUCE
# Consume the current token outside of any entity.
OUT
# Number of action types above; used to size ACTION_NAMES.
N_ACTIONS
# Flag the moves that agree with the gold annotation; returns the number of
# accepted moves (a return of 0 signals an error).
cdef int set_accept_if_oracle(Move* moves, int n, State* s,
int* g_starts, int* g_ends, int* g_labels) except 0
# Flag the moves that are legal in state `s`; returns the number accepted
# (a return of 0 signals an error).
cdef int set_accept_if_valid(Move* moves, int n, State* s) except 0
# Pointer to the highest-scoring accepted move; NULL signals an error.
cdef Move* best_accepted(Move* moves, weight_t* scores, int n) except NULL
# Apply move `m` to state `s` in place.
cdef int transition(State *s, Move* m) except -1
# Populate the move table for the given entity types.
cdef int fill_moves(Move* moves, int n, list entity_types) except -1

View File

@ -1,161 +0,0 @@
from __future__ import unicode_literals
from cymem.cymem cimport Pool
from thinc.typedefs cimport class_t
from thinc.typedefs cimport weight_t
from ._state cimport begin_entity
from ._state cimport end_entity
from ._state cimport entity_is_open
# One-character display name for each ActionType, indexed by the enum value.
ACTION_NAMES = [''] * N_ACTIONS
ACTION_NAMES[<int>OUT] = 'O'
ACTION_NAMES[<int>REDUCE] = 'R'
ACTION_NAMES[<int>SHIFT] = 'S'
ACTION_NAMES[<int>MISSING] = '?'
# Mark which moves agree with the gold-standard entities in state `s`.
# Sets moves[i].accept for i in 1..n-1 (moves[0] is always rejected) and
# returns the number of accepted moves; the final assert and the `except 0`
# clause both treat "nothing accepted" as an error.
# g_starts, g_ends and g_labels are indexed by token position.
# presumably g_starts[i] == i means a gold entity begins at token i, and
# g_ends[i] / g_labels[i] describe that entity -- confirm with the caller.
cdef int set_accept_if_oracle(Move* moves, int n, State* s,
int* g_starts, int* g_ends, int* g_labels) except 0:
# Decision table (S = SHIFT, R = REDUCE, O = OUT):
# If there is a current entity (O invalid):
# if the cost is not sunk (start matches, end is i-1 or greater):
# - If i-1 == gold.end --> R=True, S=False
# - Shift if end >= i --> S=True, R=False
# else
# - If i == gold.start --> R=True, S=False
# - Else --> R=True, S=True
# Else (R invalid):
# if start == gold.start: S=True, O=False
# else: O=True, S=False
# NOTE(review): the table says "i-1 == gold.end" but the code below tests
# g_end == s.i -- confirm whether gold ends are inclusive or exclusive.
if entity_is_open(s):
g_start = g_starts[s.curr.start]
g_end = g_ends[s.curr.start]
accept_o = False
if g_start == s.curr.start and g_end == s.i:
accept_r = True
accept_s = False
elif g_start == s.curr.start and g_end > s.i:
accept_s = True
s_label = s.curr.label
accept_r = False
# NOTE(review): g_starts[s.i] is read here even when s.i == s.length
# (entity still open at the end of the sentence) -- confirm the arrays are
# long enough for that index.
elif g_starts[s.i] == s.i:
accept_r = True
accept_s = False
else:
accept_r = True
accept_s = True
s_label = s.curr.label
else:
accept_r = False
if g_starts[s.i] == s.i:
accept_s = True
s_label = g_labels[s.i]
accept_o = False
else:
accept_o = True
accept_s = False
n_accept = 0
moves[0].accept = False
for i in range(1, n):
m = &moves[i]
if m.action == SHIFT:
# s_label is only assigned on the branches that set accept_s to True; the
# short-circuit `and` keeps it from being read otherwise.
m.accept = accept_s and m.label == s_label
elif m.action == REDUCE:
m.accept = accept_r
elif m.action == OUT:
m.accept = accept_o
n_accept += m.accept
assert n_accept != 0
return n_accept
cdef int set_accept_if_valid(Move* moves, int n, State* s) except 0:
    """Flag each move in moves[1:n] as legal or not in state `s`.

    SHIFT needs a remaining token and, if an entity is open, a matching
    label. REDUCE needs an open entity. OUT needs a remaining token and no
    open entity. moves[0] is always rejected. Returns the number of accepted
    moves; because of `except 0`, returning zero is treated as an error.
    """
    cdef int idx
    cdef Move* m
    cdef bint has_open = entity_is_open(s)
    cdef bint exhausted = s.i >= s.length
    cdef int total = 0
    moves[0].accept = False
    for idx in range(1, n):
        m = &moves[idx]
        if m.action == SHIFT:
            if exhausted:
                m.accept = False
            else:
                # With an entity open, only a SHIFT carrying its label may continue it.
                m.accept = not (has_open and m.label != s.curr.label)
        elif m.action == REDUCE:
            m.accept = has_open
        elif m.action == OUT:
            m.accept = not exhausted and not has_open
        total += m.accept
    return total
cdef Move* best_accepted(Move* moves, weight_t* scores, int n) except NULL:
    """Return a pointer to the accepted move in moves[1:n] with the top score.

    The score of move i is scores[i-1]. On ties the earliest accepted move
    wins. Raises StandardError when no move is accepted.
    """
    cdef int i
    cdef int best = -1
    cdef weight_t best_score = 0
    for i in range(1, n):
        if not moves[i].accept:
            continue
        # The first accepted move seeds the search; later ones must beat it strictly.
        if best == -1 or scores[i-1] > best_score:
            best = i
            best_score = scores[i-1]
    if best == -1:
        raise StandardError
    return &moves[best]
# Apply `move` to state `s` in place.
# OUT and SHIFT consume the current token (s.i advances); REDUCE copies the
# current entity into s.ents, advances s.j and resets s.curr, without
# consuming a token. Any other action raises ValueError.
cdef int transition(State *s, Move* move) except -1:
# Record the class of the move against the current token position.
# NOTE(review): for a REDUCE taken when s.i == s.length this writes
# s.tags[s.length] -- confirm the tags array has room for that slot.
s.tags[s.i] = move.clas
if move.action == OUT:
s.i += 1
elif move.action == SHIFT:
# NOTE(review): indentation is lost in this view, so it is unclear whether
# the label assignment belongs inside the `if not entity_is_open(s)` body
# together with the start assignment -- confirm against the original file.
if not entity_is_open(s):
s.curr.start = s.i
s.curr.label = move.label
s.i += 1
elif move.action == REDUCE:
# s.i already points past the last shifted token, so `end` is exclusive.
s.curr.end = s.i
s.ents[s.j] = s.curr
s.j += 1
# Reset the current entity to its "no entity" values.
s.curr.start = 0
s.curr.label = -1
s.curr.end = 0
else:
raise ValueError(move.action)
def get_n_moves(n_tags):
    """Return the size of the move table for `n_tags` entity types.

    The table holds one SHIFT move per entity type plus three fixed moves:
    the reserved MISSING class, OUT and REDUCE.
    """
    n_fixed_moves = 3
    return n_fixed_moves + n_tags
cdef int fill_moves(Move* moves, int n, list entity_types) except -1:
    """Populate the move table for the given entity types.

    Layout: index 0 is the reserved MISSING move, followed by one SHIFT move
    per entry of `entity_types`, then OUT, then REDUCE. Every move's `clas`
    equals its index. `n` is not consulted, so the caller must provide room
    for len(entity_types) + 3 moves.
    """
    cdef Move* m
    cdef int i = 0
    label_names = {'-': 0}
    # Reserve class 0
    m = &moves[i]
    m.clas = i
    m.action = MISSING
    m.label = 0
    i += 1
    for entity_type in entity_types:
        m = &moves[i]
        m.action = SHIFT
        # New entity types get the next free label id; repeats reuse theirs.
        m.label = label_names.setdefault(entity_type, len(label_names))
        m.clas = i
        i += 1
    m = &moves[i]
    m.clas = i
    m.action = OUT
    m.label = 0
    i += 1
    m = &moves[i]
    m.action = REDUCE
    m.clas = i
    m.label = 0
    i += 1
cdef bint is_final(State* s):
    """True once every token is consumed and no entity is left open."""
    if s.i != s.length:
        return False
    return not entity_is_open(s)

View File

@ -1,16 +0,0 @@
from cymem.cymem cimport Pool
from .structs cimport Move, State
# C-level attribute declarations for PyState, a Python-accessible wrapper
# around the C State struct and the move table.
cdef class PyState:
# Pool passed to init_state and used to allocate _moves and _golds.
cdef Pool mem
# Tag names given to __init__; _get_move finds a move by its position here.
cdef readonly list tag_names
# len(tag_names); also the number of entries allocated for _moves.
cdef readonly int n_classes
# NOTE(review): not assigned anywhere in the visible implementation --
# confirm it is still needed.
cdef readonly dict moves_by_name
# Move table with n_classes entries.
cdef Move* _moves
# Gold moves, one slot per token (allocated with n_tokens entries).
cdef Move* _golds
# The wrapped parse state.
cdef State* _s
# Look a move up by name; an unknown name raises instead of returning NULL.
cdef Move* _get_move(self, unicode move_name) except NULL

View File

@ -1,60 +0,0 @@
from __future__ import unicode_literals
from ._state cimport init_state
from ._state cimport entity_is_open
from .bilou_moves cimport fill_moves
from .bilou_moves cimport transition
from .bilou_moves cimport set_accept_if_valid, set_accept_if_oracle
from .bilou_moves import get_n_moves
from .bilou_moves import ACTION_NAMES
cdef class PyState:
    """Expose the C parse state and the move functions to Python code."""

    def __init__(self, tag_names, n_tokens):
        """Allocate a move table for `tag_names` and a state for `n_tokens` tokens."""
        self.mem = Pool()
        self.tag_names = tag_names
        self.n_classes = len(tag_names)
        assert self.n_classes != 0
        self._moves = <Move*>self.mem.alloc(self.n_classes, sizeof(Move))
        fill_moves(self._moves, tag_names)
        self._s = init_state(self.mem, n_tokens)
        self._golds = <Move*>self.mem.alloc(n_tokens, sizeof(Move))

    cdef Move* _get_move(self, unicode move_name) except NULL:
        # list.index raises ValueError for an unknown name.
        cdef int position = self.tag_names.index(move_name)
        return &self._moves[position]

    def set_golds(self, list gold_names):
        """Copy the named moves into the gold array, one per position."""
        cdef Move* named
        # NOTE(review): no bounds check -- _golds holds n_tokens entries, so
        # gold_names is assumed to be no longer than that; confirm.
        for position, gold_name in enumerate(gold_names):
            named = self._get_move(gold_name)
            self._golds[position] = named[0]

    def transition(self, unicode move_name):
        """Apply the named move to the wrapped state."""
        cdef Move* named = self._get_move(move_name)
        transition(self._s, named)

    def is_valid(self, unicode move_name):
        """Whether the named move is legal in the current state."""
        cdef Move* named = self._get_move(move_name)
        set_accept_if_valid(self._moves, self.n_classes, self._s)
        return named.accept

    def is_gold(self, unicode move_name):
        """Whether the named move agrees with the gold moves in the current state."""
        cdef Move* named = self._get_move(move_name)
        set_accept_if_oracle(self._moves, self._golds, self.n_classes, self._s)
        return named.accept

    property ent:
        # The entity currently being built.
        def __get__(self):
            return self._s.curr

    property n_ents:
        # Value of the state's `j` counter.
        def __get__(self):
            return self._s.j

    property i:
        # Index of the next token to process.
        def __get__(self):
            return self._s.i

    property open_entity:
        # Whether an entity is currently open.
        def __get__(self):
            return entity_is_open(self._s)

View File

@ -1,23 +0,0 @@
from thinc.typedefs cimport class_t
# A labelled span of tokens.
cdef struct Entity:
# Index of the first token of the span.
int start
# presumably exclusive: the moves code in this commit sets it to the state's
# token index at the time the entity is closed -- confirm.
int end
# Entity label id; the moves code resets it to -1 after closing an entity.
int label
# Parser state for one sentence.
cdef struct State:
# The entity currently being built.
Entity curr
# Completed entities; `j` counts how many have been stored.
Entity* ents
# Per-token array holding the class id of the move recorded at each position.
int* tags
# Index of the next token to process.
int i
# Number of completed entities stored in `ents`.
int j
# Number of tokens in the sentence.
int length
# One entry of the move table.
cdef struct Move:
# Class id of the move.
class_t clas
# Kind of move; holds an ActionType value in the moves code of this commit.
int action
# Entity label id for the move (0 when the move carries no label).
int label
# Scratch flag set by the set_accept_if_* functions for the current state.
bint accept

View File

View File

@ -1,41 +0,0 @@
from spacy.context cimport FIELD_IDS, Token
# Short aliases for the token slots of the context window, taken from FIELD_IDS.
# presumably P2/P1 are the two previous tokens, N0 the current token and
# N1/N2 the two following tokens -- confirm in spacy.context.
# NOTE(review): confirm that `cpdef` on a module-level variable is accepted by
# the Cython version in use.
cpdef Token P2 = FIELD_IDS.P2
cpdef Token P1 = FIELD_IDS.P1
cpdef Token N0 = FIELD_IDS.N0
cpdef Token N1 = FIELD_IDS.N1
cpdef Token N2 = FIELD_IDS.N2
# Feature templates for the POS tagger. Each inner tuple lists the token
# fields that make up one template; all but two are single-field templates.
TEMPLATES = (
# Form, normalised form and affixes of the current token.
(N0.sic,),
(N0.norm,),
(N0.suffix,),
(N0.prefix,),
# POS of the previous tokens, alone, as a pair, and with the current norm.
(P1.pos,),
(P2.pos,),
(P1.pos, P2.pos),
(P1.pos, N0.norm),
# Norms and suffixes of the neighbouring tokens.
(P1.norm,),
(P1.suffix,),
(P2.norm,),
(N1.norm,),
(N1.suffix,),
(N2.norm,),
# Shape and cluster fields.
(N0.shape,),
(N0.cluster,),
(N1.cluster,),
(N2.cluster,),
(P1.cluster,),
(P2.cluster,),
# Boolean-style lexical attributes.
(N0.oft_upper,),
(N0.oft_title,),
(N0.postype,),
(P1.like_url,),
(N1.like_number,),
(N1.like_url,),
)

View File

@ -1,153 +0,0 @@
from __future__ import unicode_literals
from . import util
from . import tokens
from .en import EN
def read_gold(file_, tag_list, col):
    """Read gold-standard tags and align them to the tokenizer's output.

    `file_` holds paragraphs separated by blank lines. In each paragraph the
    first line is the raw text, the second line is the gold tokenisation
    (not used here), and every further line is a whitespace-separated record
    whose first column is the token's character offset, whose second column
    is the token text, and whose column `col` is the tag.

    The raw text is tokenized with `EN.tokenize`; a token receives the tag of
    the gold record that starts at the same character offset, and 'NULL' if
    there is none. Tags are then encoded with `_encode_pos`, which may append
    new tags to `tag_list`.

    Returns a list of `(tokens, tag_ids)` pairs, one per paragraph.
    """
    paras = file_.read().strip().split('\n\n')
    golds = []
    tag_ids = dict((tag, i) for i, tag in enumerate(tag_list))
    for para in paras:
        if not para.strip():
            continue
        lines = para.strip().split('\n')
        raw = lines.pop(0)
        # Discard the gold tokenisation line; alignment is done by offset.
        lines.pop(0)
        # Named `doc` so the module-level `tokens` import is not shadowed.
        doc = EN.tokenize(raw)
        conll_toks = []
        for line in lines:
            pieces = line.split()
            conll_toks.append((int(pieces[0]), len(pieces[1]), pieces[col]))
        tags = []
        # Walk the gold records with an index instead of popping from the
        # front of the list, which costs O(n) per pop.
        next_gold = 0
        n_gold = len(conll_toks)
        for token in doc:
            # Skip gold records that start before this token: they can no
            # longer be aligned. The original compared the offset with the
            # whole record tuple, so stale records were never skipped and
            # blocked every later alignment.
            while next_gold < n_gold and conll_toks[next_gold][0] < token.idx:
                next_gold += 1
            if next_gold < n_gold and conll_toks[next_gold][0] == token.idx:
                tags.append(conll_toks[next_gold][2])
                next_gold += 1
            else:
                tags.append('NULL')
        assert len(tags) == len(doc)
        tags = [_encode_pos(t, tag_ids, tag_list) for t in tags]
        golds.append((doc, tags))
    return golds
def _encode_pos(tag, tag_ids, tag_list):
if tag == '-':
return 0
if tag not in tag_ids:
tag_ids[tag] = len(tag_list)
tag_list.append(tag)
return tag_ids[tag]
# One "source-tag universal-tag" pair per line. Some source tags ('!', '#',
# '$', ',') occur twice; the later row wins when the table is parsed.
# presumably the single-character rows near the end belong to a second,
# Twitter-style tagset -- confirm.
_UNIV_TAG_TABLE = """
NULL NULL
HYPH .
ADD X
NFP .
AFX X
XX X
BES VERB
HVS VERB
GW X
! .
# .
$ .
'' .
( .
) .
, .
-LRB- .
-RRB- .
. .
: .
? .
CC CONJ
CD NUM
CD|RB X
DT DET
EX DET
FW X
IN ADP
IN|RP ADP
JJ ADJ
JJR ADJ
JJRJR ADJ
JJS ADJ
JJ|RB ADJ
JJ|VBG ADJ
LS X
MD VERB
NN NOUN
NNP NOUN
NNPS NOUN
NNS NOUN
NN|NNS NOUN
NN|SYM NOUN
NN|VBG NOUN
NP NOUN
PDT DET
POS PRT
PRP PRON
PRP$ PRON
PRP|VBP PRON
PRT PRT
RB ADV
RBR ADV
RBS ADV
RB|RP ADV
RB|VBG ADV
RN X
RP PRT
SYM X
TO PRT
UH X
VB VERB
VBD VERB
VBD|VBN VERB
VBG VERB
VBG|NN VERB
VBN VERB
VBP VERB
VBP|TO VERB
VBZ VERB
VP VERB
WDT DET
WH X
WP PRON
WP$ PRON
WRB ADV
! PRT
# X
$ NUM
& CONJ
, .
@ X
A ADJ
D DET
E X
G X
L PRT
M PRT
N NOUN
O PRON
P ADP
R ADV
S NOUN
T PRT
U X
V VERB
X PRT
Y PRT
Z NOUN
^ NOUN
~ X
`` .
EOL EOL"""

# Parsed once at import time rather than on every call to ptb_to_univ().
_UNIV_TAG_MAP = dict(
    tuple(line.split()) for line in _UNIV_TAG_TABLE.strip().split('\n')
)


def ptb_to_univ(tag):
    """Map a fine-grained POS tag to its universal tag.

    Raises KeyError if `tag` is not in the table.
    """
    return _UNIV_TAG_MAP[tag]

View File

@ -8,7 +8,7 @@ from .structs cimport Utf8Str, UniStr
cdef inline void slice_unicode(UniStr* s, Py_UNICODE* chars, int start, int end) nogil:
s.chars = &chars[start]
s.n = end - start
s.key = hash64(s.chars, s.n * sizeof(Py_UNICODE), 0)
s.key = hash64(s.chars, <int>(s.n * sizeof(Py_UNICODE)), 0)
cdef class StringStore:

View File

@ -1,6 +1,6 @@
from libc.stdint cimport uint8_t, uint32_t
from .typedefs cimport flags_t, attr_t, id_t, hash_t
from .typedefs cimport flags_t, attr_t, id_t, hash_t, univ_tag_t
cdef struct Lexeme:
@ -34,7 +34,7 @@ cdef struct Morphology:
cdef struct PosTag:
Morphology morph
int id
int pos
univ_tag_t pos
cdef struct TokenC:

View File

@ -2,7 +2,7 @@ from libc.stdint cimport uint32_t
from cymem.cymem cimport Pool
from ..tokens cimport TokenC
from ..structs cimport TokenC
cdef struct State:
@ -20,7 +20,8 @@ cdef int pop_stack(State *s) except -1
cdef int push_stack(State *s) except -1
cdef bint has_head(const TokenC* t) nogil
cdef inline bint has_head(const TokenC* t) nogil:
return t.head != 0
cdef inline int get_idx(const State* s, const TokenC* t) nogil:
@ -70,29 +71,14 @@ cdef inline bint is_final(const State *s) nogil:
return at_eol(s) # The stack will be attached to root anyway
cdef int children_in_buffer(const State *s, const int head, const int* gold) except -1
cdef int head_in_buffer(const State *s, const int child, const int* gold) except -1
cdef int children_in_stack(const State *s, const int head, const int* gold) except -1
cdef int head_in_stack(const State *s, const int child, const int* gold) except -1
cdef int children_in_buffer(const State *s, const int head, int* gold) except -1
cdef int head_in_buffer(const State *s, const int child, int* gold) except -1
cdef int children_in_stack(const State *s, const int head, int* gold) except -1
cdef int head_in_stack(const State *s, const int child, int* gold) except -1
cdef State* init_state(Pool mem, TokenC* sent, const int sent_length) except NULL
cdef int count_left_kids(const TokenC* head) nogil
cdef int count_right_kids(const TokenC* head) nogil
# From https://en.wikipedia.org/wiki/Hamming_weight
cdef inline uint32_t _popcount(uint32_t x) nogil:
    """Count the bits that are set in x."""
    cdef int n_set = 0
    # Each pass clears the lowest set bit, so the loop runs once per set bit.
    while x:
        x = x & (x - 1)
        n_set += 1
    return n_set
cdef inline uint32_t _nth_significant_bit(uint32_t bits, int n) nogil:
cdef int i

View File

@ -3,32 +3,24 @@ from libc.string cimport memmove
from cymem.cymem cimport Pool
from ..lexeme cimport EMPTY_LEXEME
from ..tokens cimport TokenC
DEF PADDING = 5
DEF NON_MONOTONIC = True
cdef int add_dep(State *s, int head, int child, int label) except -1:
cdef int dist = head - child
s.sent[child].head = dist
s.sent[child].head = head - child
s.sent[child].dep_tag = label
# Keep a bit-vector tracking child dependencies. If a word has a child at
# offset i from it, set that bit (tracking left and right separately)
if child > head:
s.sent[head].r_kids |= 1 << (-dist)
s.sent[head].r_kids |= 1 << (-s.sent[child].head)
else:
s.sent[head].l_kids |= 1 << dist
s.sent[head].l_kids |= 1 << s.sent[child].head
cdef int pop_stack(State *s) except -1:
assert s.stack_len >= 1
s.stack_len -= 1
s.stack -= 1
if s.stack_len == 0 and not at_eol(s):
push_stack(s)
cdef int push_stack(State *s) except -1:
assert s.i < s.sent_len
@ -36,14 +28,9 @@ cdef int push_stack(State *s) except -1:
s.stack[0] = s.i
s.stack_len += 1
s.i += 1
if at_eol(s):
while s.stack_len != 0:
if not has_head(get_s0(s)):
get_s0(s).dep_tag = 0
pop_stack(s)
cdef int children_in_buffer(const State *s, int head, const int* gold) except -1:
cdef int children_in_buffer(const State *s, int head, int* gold) except -1:
# Golds holds an array of head offsets --- the head of word i is i - golds[i]
# Iterate over the tokens of the queue, and check whether their gold head is
# our target
@ -55,21 +42,20 @@ cdef int children_in_buffer(const State *s, int head, const int* gold) except -1
return n
cdef int head_in_buffer(const State *s, const int child, const int* gold) except -1:
cdef int head_in_buffer(const State *s, const int child, int* gold) except -1:
return gold[child] >= s.i
cdef int children_in_stack(const State *s, const int head, const int* gold) except -1:
cdef int children_in_stack(const State *s, const int head, int* gold) except -1:
cdef int i
cdef int n = 0
for i in range(s.stack_len):
if gold[s.stack[-i]] == head:
if NON_MONOTONIC or not has_head(get_s0(s)):
n += 1
n += 1
return n
cdef int head_in_stack(const State *s, const int child, const int* gold) except -1:
cdef int head_in_stack(const State *s, const int child, int* gold) except -1:
cdef int i
for i in range(s.stack_len):
if gold[child] == s.stack[-i]:
@ -86,7 +72,7 @@ cdef const TokenC* get_left(const State* s, const TokenC* head, const int idx) n
if child >= s.sent:
return child
else:
return NULL
return s.sent - 1
cdef const TokenC* get_right(const State* s, const TokenC* head, const int idx) nogil:
@ -98,20 +84,10 @@ cdef const TokenC* get_right(const State* s, const TokenC* head, const int idx)
if child < (s.sent + s.sent_len):
return child
else:
return NULL
return s.sent - 1
cdef bint has_head(const TokenC* t) nogil:
return t.head != 0
cdef int count_left_kids(const TokenC* head) nogil:
return _popcount(head.l_kids)
cdef int count_right_kids(const TokenC* head) nogil:
return _popcount(head.r_kids)
DEF PADDING = 5
cdef State* init_state(Pool mem, TokenC* sent, const int sent_length) except NULL:
@ -126,5 +102,4 @@ cdef State* init_state(Pool mem, TokenC* sent, const int sent_length) except NUL
s.stack_len = 0
s.i = 0
s.sent_len = sent_length
push_stack(s)
return s

View File

@ -7,11 +7,8 @@ from ._state cimport State
cdef struct Transition:
int clas
int move
int label
int cost
weight_t score
cdef class TransitionSystem:
@ -21,8 +18,7 @@ cdef class TransitionSystem:
cdef const Transition* _moves
cdef Transition best_valid(self, const weight_t* scores, const State* s) except *
cdef Transition best_gold(self, Transition* guess, const weight_t* scores,
const State* s,
const int* gold_heads, const int* gold_labels) except *
cdef Transition best_valid(self, const weight_t* scores, const State* s) except -1
cdef Transition best_gold(self, const weight_t* scores, const State* s,
int* gold_heads, int* gold_labels) except -1
cdef int transition(self, State *s, const Transition* t) except -1

View File

@ -7,8 +7,6 @@ from ._state cimport head_in_stack, children_in_stack
from ..tokens cimport TokenC
DEF NON_MONOTONIC = True
cdef enum:
SHIFT
@ -27,30 +25,22 @@ cdef inline bint _can_right(const State* s) nogil:
cdef inline bint _can_left(const State* s) nogil:
if NON_MONOTONIC:
return s.stack_len >= 1
else:
return s.stack_len >= 1 and not has_head(get_s0(s))
return s.stack_len >= 1 and not has_head(get_s0(s))
cdef inline bint _can_reduce(const State* s) nogil:
if NON_MONOTONIC:
return s.stack_len >= 2
else:
return s.stack_len >= 2 and has_head(get_s0(s))
return s.stack_len >= 2 and has_head(get_s0(s))
cdef int _shift_cost(const State* s, const int* gold) except -1:
cdef int _shift_cost(const State* s, int* gold) except -1:
assert not at_eol(s)
cost = 0
cost += head_in_stack(s, s.i, gold)
cost += children_in_stack(s, s.i, gold)
if NON_MONOTONIC:
cost += gold[s.stack[0]] == s.i
return cost
cdef int _right_cost(const State* s, const int* gold) except -1:
cdef int _right_cost(const State* s, int* gold) except -1:
assert s.stack_len >= 1
cost = 0
if gold[s.i] == s.stack[0]:
@ -58,12 +48,10 @@ cdef int _right_cost(const State* s, const int* gold) except -1:
cost += head_in_buffer(s, s.i, gold)
cost += children_in_stack(s, s.i, gold)
cost += head_in_stack(s, s.i, gold)
if NON_MONOTONIC:
cost += gold[s.stack[0]] == s.i
return cost
cdef int _left_cost(const State* s, const int* gold) except -1:
cdef int _left_cost(const State* s, int* gold) except -1:
assert s.stack_len >= 1
cost = 0
if gold[s.stack[0]] == s.i:
@ -71,17 +59,11 @@ cdef int _left_cost(const State* s, const int* gold) except -1:
cost += head_in_buffer(s, s.stack[0], gold)
cost += children_in_buffer(s, s.stack[0], gold)
if NON_MONOTONIC and s.stack_len >= 2:
cost += gold[s.stack[0]] == s.stack[-1]
return cost
cdef int _reduce_cost(const State* s, const int* gold) except -1:
cdef int cost = 0
cost += children_in_buffer(s, s.stack[0], gold)
if NON_MONOTONIC:
cost += head_in_buffer(s, s.stack[0], gold)
return cost
cdef int _reduce_cost(const State* s, int* gold) except -1:
return children_in_buffer(s, s.stack[0], gold)
cdef class TransitionSystem:
@ -91,40 +73,38 @@ cdef class TransitionSystem:
right_labels.sort()
if 'ROOT' in right_labels:
right_labels.pop(right_labels.index('ROOT'))
if 'dep' in right_labels:
right_labels.pop(right_labels.index('dep'))
if 'ROOT' in left_labels:
left_labels.pop(left_labels.index('ROOT'))
if 'dep' in left_labels:
left_labels.pop(left_labels.index('dep'))
self.n_moves = 2 + len(left_labels) + len(right_labels)
moves = <Transition*>self.mem.alloc(self.n_moves, sizeof(Transition))
cdef int i = 0
moves[i].move = SHIFT
moves[i].label = 0
moves[i].clas = i
i += 1
moves[i].move = REDUCE
moves[i].label = 0
moves[i].clas = i
i += 1
self.label_ids = {'ROOT': 0}
self.label_ids = {'ROOT': 0, 'dep': -1}
cdef int label_id
for label_str in left_labels:
label_id = self.label_ids.setdefault(label_str, len(self.label_ids))
moves[i].move = LEFT
moves[i].label = label_id
moves[i].clas = i
i += 1
for label_str in right_labels:
label_id = self.label_ids.setdefault(label_str, len(self.label_ids))
moves[i].move = RIGHT
moves[i].label = label_id
moves[i].clas = i
i += 1
self._moves = moves
cdef int transition(self, State *s, const Transition* t) except -1:
cdef int transition(self, State *s, const int clas) except -1:
cdef const Transition* t = &self._moves[clas]
if t.move == SHIFT:
# Set the dep label, in case we need it after we reduce
if NON_MONOTONIC:
get_s0(s).dep_tag = t.label
push_stack(s)
elif t.move == LEFT:
add_dep(s, s.i, s.stack[0], t.label)
@ -133,12 +113,11 @@ cdef class TransitionSystem:
add_dep(s, s.stack[0], s.i, t.label)
push_stack(s)
elif t.move == REDUCE:
add_dep(s, s.stack[-1], s.stack[0], get_s0(s).dep_tag)
pop_stack(s)
else:
raise StandardError(t.move)
cdef Transition best_valid(self, const weight_t* scores, const State* s) except *:
cdef int best_valid(self, const weight_t* scores, const State* s) except -1:
cdef bint[N_MOVES] valid
valid[SHIFT] = _can_shift(s)
valid[LEFT] = _can_left(s)
@ -147,61 +126,59 @@ cdef class TransitionSystem:
cdef int best = -1
cdef weight_t score = 0
cdef weight_t best_r_score = -9000
cdef int best_r_label = -1
cdef int i
for i in range(self.n_moves):
if valid[self._moves[i].move] and (best == -1 or scores[i] > score):
best = i
score = scores[i]
if self._moves[i].move == RIGHT and scores[i] > best_r_score:
best_r_label = self._moves[i].label
assert best >= 0
cdef Transition t = self._moves[best]
t.score = score
if t.move == SHIFT:
t.label = best_r_label
return t
return best
cdef Transition best_gold(self, Transition* guess, const weight_t* scores,
const State* s,
const int* gold_heads, const int* gold_labels) except *:
# If we can create a gold dependency, only one action can be correct
cdef int best_gold(self, const weight_t* scores, const State* s,
int* gold_heads, int* gold_labels) except -1:
cdef int[N_MOVES] unl_costs
unl_costs[SHIFT] = _shift_cost(s, gold_heads) if _can_shift(s) else -1
unl_costs[LEFT] = _left_cost(s, gold_heads) if _can_left(s) else -1
unl_costs[RIGHT] = _right_cost(s, gold_heads) if _can_right(s) else -1
unl_costs[REDUCE] = _reduce_cost(s, gold_heads) if _can_reduce(s) else -1
guess.cost = unl_costs[guess.move]
cdef Transition t
cdef int target_label
cdef int i
if gold_heads[s.stack[0]] == s.i:
target_label = gold_labels[s.stack[0]]
if guess.move == LEFT:
guess.cost += guess.label != target_label
for i in range(self.n_moves):
t = self._moves[i]
if t.move == LEFT and t.label == target_label:
return t
elif gold_heads[s.i] == s.stack[0]:
target_label = gold_labels[s.i]
if guess.move == RIGHT:
guess.cost += guess.label != target_label
for i in range(self.n_moves):
t = self._moves[i]
if t.move == RIGHT and t.label == target_label:
return t
cdef int cost
cdef int move
cdef int label
cdef int best = -1
cdef weight_t score = -9000
cdef int i
for i in range(self.n_moves):
t = self._moves[i]
if unl_costs[t.move] == 0 and (best == -1 or scores[i] > score):
best = i
score = scores[i]
t = self._moves[best]
t.score = score
assert best >= 0
return t
move = self._moves[i].move
label = self._moves[i].label
if unl_costs[move] == 0:
if move == SHIFT or move == REDUCE:
cost = 0
elif move == LEFT:
if gold_heads[s.stack[0]] == s.i and gold_labels[s.stack[0]] != -1:
cost = label != gold_labels[s.stack[0]]
else:
cost = 0
elif move == RIGHT:
if gold_heads[s.i] == s.stack[0] and gold_labels[s.i] != -1:
cost = label != gold_labels[s.i]
else:
cost = 0
else:
raise StandardError("Unknown Move")
if cost == 0 and (best == -1 or scores[i] > score):
best = i
score = scores[i]
if best < 0:
print unl_costs[SHIFT], unl_costs[REDUCE], unl_costs[LEFT], unl_costs[RIGHT]
print s.stack_len
print has_head(get_s0(s))
print s.sent[s.stack[0]].head
print s.stack[0], s.i
print gold_heads[s.stack[0]], gold_heads[s.i]
print gold_labels[s.i]
print children_in_buffer(s, s.stack[0], gold_heads)
print head_in_buffer(s, s.stack[0], gold_heads)
raise StandardError
return best

View File

@ -2,6 +2,8 @@
# cython: embedsignature=True
from __future__ import unicode_literals
from os import path
from cython.operator cimport dereference as deref
from cython.operator cimport preincrement as preinc
@ -28,6 +30,17 @@ cdef class Tokenizer:
self.vocab = Vocab(self.get_props)
self._load_special_tokenization(rules)
@classmethod
def from_dir(cls, Vocab vocab, object data_dir):
    """Build a Tokenizer from the language data stored in `data_dir`.

    The rules and the prefix, suffix and infix patterns come from
    `util.read_lang_data(data_dir)`.

    Raises IOError if `data_dir` does not exist or is not a directory.
    """
    if not path.exists(data_dir):
        raise IOError("Directory %s not found -- cannot load Tokenizer." % data_dir)
    if not path.isdir(data_dir):
        raise IOError("Path %s is a file, not a dir -- cannot load Tokenizer." % data_dir)
    # No further assert: both conditions were checked (and raised on) above.
    rules, prefix_re, suffix_re, infix_re = util.read_lang_data(data_dir)
    return cls(vocab, rules, prefix_re, suffix_re, infix_re)
cpdef Tokens tokens_from_list(self, list strings):
cdef int length = sum([len(s) for s in strings])
cdef Tokens tokens = Tokens(self.vocab.strings, length)

View File

@ -1,6 +1,26 @@
from libc.stdint cimport uint16_t, uint32_t, uint64_t, uintptr_t
from libc.stdint cimport uint8_t
# Google universal tag set
# Coarse-grained part-of-speech tags. Members take consecutive values starting
# at 0, so NO_TAG is 0 and N_UNIV_TAGS equals the number of members before it.
cpdef enum univ_tag_t:
# Zero value; presumably "no tag assigned" -- confirm.
NO_TAG
ADJ
ADV
ADP
CONJ
DET
NOUN
NUM
PRON
PRT
VERB
X
PUNCT
EOL
# Not a tag: count of the members above.
N_UNIV_TAGS
ctypedef uint64_t hash_t
ctypedef char* utf8_t
ctypedef uint32_t attr_t
@ -10,11 +30,3 @@ ctypedef uint16_t len_t
ctypedef uint16_t tag_t
# Morphological features of a token, one byte per feature.
# presumably each field stores a value of the corresponding per-language enum
# (number, tense/aspect, gender, person, case, misc) -- confirm.
cdef struct Morphology:
uint8_t number
uint8_t tenspect # Tense/aspect/voice
uint8_t mood
uint8_t gender
uint8_t person
uint8_t case
uint8_t misc

View File

@ -1,34 +0,0 @@
from preshed.maps cimport PreshMap
from cymem.cymem cimport Pool
from murmurhash.mrmr cimport hash64
from .typedefs cimport utf8_t, id_t, hash_t
# An interned UTF-8 byte string, as stored by StringStore.
cdef struct Utf8Str:
# Integer id of the string (its index in StringStore.strings).
id_t i
# 64-bit hash of the bytes; the key used in StringStore's map.
hash_t key
# The bytes themselves; StringStore copies exactly `length` bytes, with no
# terminating NUL added.
utf8_t chars
# Number of bytes in `chars`.
int length
# A view onto a slice of a unicode buffer, together with its hash.
cdef struct UniStr:
# Pointer to the first character of the slice (not a copy).
Py_UNICODE* chars
# Number of characters in the slice.
size_t n
# 64-bit hash of the slice's raw character data.
hash_t key
# Point `s` at chars[start:end] and hash that slice.
# No characters are copied: `s.chars` aliases the caller's buffer.
cdef inline void slice_unicode(UniStr* s, Py_UNICODE* chars, int start, int end) nogil:
    s.chars = &chars[start]
    s.n = end - start
    # The byte count is a size_t expression; narrow it explicitly to the int
    # that hash64 takes, as the updated copy of this function does.
    s.key = hash64(s.chars, <int>(s.n * sizeof(Py_UNICODE)), 0)
# C-level declarations for StringStore, which interns byte strings and maps
# them to integer ids.
cdef class StringStore:
# Pool that owns the `strings` array and the copied string data.
cdef Pool mem
# Maps the 64-bit hash of a string to its id.
cdef PreshMap _map
# Array of interned strings, indexed by id; slot 0 is left unused.
cdef Utf8Str* strings
# Next free id (starts at 1 because slot 0 is reserved).
cdef int size
# Capacity of `strings`; doubled when `size` reaches it.
cdef int _resize_at
# Return the interned record for chars[:length], adding it if it is new.
cdef const Utf8Str* intern(self, char* chars, int length) except NULL

View File

@ -1,80 +0,0 @@
from libc.string cimport memcpy
from murmurhash.mrmr cimport hash64
import codecs
SEPARATOR = '\n|-SEP-|\n'
cdef class StringStore:
    """Intern strings as UTF-8 bytes and map them to integer ids, and back."""

    def __init__(self):
        self.mem = Pool()
        self._map = PreshMap()
        self._resize_at = 10000
        self.strings = <Utf8Str*>self.mem.alloc(self._resize_at, sizeof(Utf8Str))
        # Slot 0 is reserved, so ids start at 1.
        self.size = 1

    # NOTE(review): this property has the same name as the C-level `size`
    # attribute declared in the .pxd -- confirm this compiles as intended.
    property size:
        def __get__(self):
            return self.size-1

    def __getitem__(self, object string_or_id):
        """Map an id to its bytes, or a bytes/unicode string to its id.

        Raises IndexError for an id outside 1..size-1 and TypeError for any
        other argument type.
        """
        cdef bytes byte_string
        cdef const Utf8Str* utf8str
        if isinstance(string_or_id, (int, long)):
            if string_or_id < 1 or string_or_id >= self.size:
                raise IndexError(string_or_id)
            utf8str = &self.strings[<int>string_or_id]
            return utf8str.chars[:utf8str.length]
        if isinstance(string_or_id, bytes):
            utf8str = self.intern(<char*>string_or_id, len(string_or_id))
            return utf8str.i
        if isinstance(string_or_id, unicode):
            byte_string = string_or_id.encode('utf8')
            utf8str = self.intern(<char*>byte_string, len(byte_string))
            return utf8str.i
        raise TypeError(type(string_or_id))

    cdef const Utf8Str* intern(self, char* chars, int length) except NULL:
        # 0 means missing, but we don't bother offsetting the index. We waste
        # slot 0 to simplify the code, because it doesn't matter.
        assert length != 0
        cdef hash_t key = hash64(chars, length * sizeof(char), 0)
        cdef void* found = self._map.get(key)
        cdef size_t slot
        if found != NULL:
            # Already interned: the map value is the string's id.
            return &self.strings[<size_t>found]
        if self.size == self._resize_at:
            # Array is full: double its capacity.
            self._resize_at *= 2
            self.strings = <Utf8Str*>self.mem.realloc(self.strings, self._resize_at * sizeof(Utf8Str))
        slot = self.size
        self.strings[slot].i = self.size
        self.strings[slot].key = key
        self.strings[slot].chars = <char*>self.mem.alloc(length, sizeof(char))
        memcpy(self.strings[slot].chars, chars, length)
        self.strings[slot].length = length
        self._map.set(key, <void*>self.size)
        self.size += 1
        return &self.strings[slot]

    def dump(self, loc):
        """Write every slot, including the unused slot 0, to `loc`, joined by SEPARATOR."""
        cdef Utf8Str* entry
        cdef bytes raw_bytes
        decoded = []
        for slot in range(self.size):
            entry = &self.strings[slot]
            raw_bytes = entry.chars[:entry.length]
            decoded.append(raw_bytes.decode('utf8'))
        with codecs.open(loc, 'w', 'utf8') as file_:
            file_.write(SEPARATOR.join(decoded))

    def load(self, loc):
        """Intern the strings stored in `loc`, in file order."""
        cdef unicode text
        cdef bytes encoded
        with codecs.open(loc, 'r', 'utf8') as file_:
            entries = file_.read().split(SEPARATOR)
        # The first entry is the placeholder written for slot 0.
        for text in entries[1:]:
            encoded = text.encode('utf8')
            self.intern(encoded, len(encoded))

View File

@ -11,8 +11,7 @@ def utf8open(loc, mode='r'):
return codecs.open(loc, mode, 'utf8')
def read_lang_data(name):
data_dir = path.join(DATA_DIR, name)
def read_lang_data(data_dir):
with open(path.join(data_dir, 'specials.json')) as file_:
tokenization = ujson.load(file_)
prefix = read_prefix(data_dir)

View File

@ -19,6 +19,17 @@ cdef class Vocab:
self.lexemes.push_back(&EMPTY_LEXEME)
self.get_lex_props = get_props
@classmethod
def from_dir(cls, object data_dir, object get_lex_props=None):
    """Build a Vocab and load its data from `data_dir`.

    The string table is loaded from `<data_dir>/strings` and the lexemes from
    `<data_dir>/lexemes`. `get_lex_props` is passed to the constructor.

    Raises IOError if `data_dir` does not exist or is not a directory.
    """
    if not path.exists(data_dir):
        raise IOError("Directory %s not found -- cannot load Vocab." % data_dir)
    if not path.isdir(data_dir):
        raise IOError("Path %s is a file, not a dir -- cannot load Vocab." % data_dir)
    # Forward the caller's callback. The name `get_props` used here before
    # is not a parameter of this method.
    cdef Vocab self = cls(get_lex_props)
    self.strings.load(path.join(data_dir, 'strings'))
    self.load(path.join(data_dir, 'lexemes'))
    return self
def __len__(self):
return self.lexemes.size()