Mirror of https://github.com/explosion/spaCy.git (synced 2025-01-27 09:44:36 +03:00)

commit 99bbbb6feb (parent 7b68f911cf)

    Work on morphological processing

spacy/en.pxd | 51
@@ -5,6 +5,57 @@ from .tokens cimport Tokens
 from .tokens cimport TokenC
 
+
+cpdef enum en_person_t:
+    NO_PERSON
+    FIRST
+    SECOND
+    THIRD
+
+
+cpdef enum en_number_t:
+    NO_NUMBER
+    SINGULAR
+    PLURAL
+    MASS
+    CARDINAL
+    ORDINAL
+
+
+cpdef enum en_gender_t:
+    NO_GENDER
+    MASCULINE
+    FEMININE
+
+
+cpdef enum en_tenspect_t:
+    NO_TENSE
+    BASE_VERB
+    PRESENT
+    PAST
+    PASSIVE
+    ING
+    MODAL
+
+
+cpdef enum en_case_t:
+    NO_CASE
+    NOMINATIVE
+    ACCUSATIVE
+    GENITIVE
+    DEMONYM
+
+
+cpdef enum misc_t:
+    NO_MISC
+    COMPARATIVE
+    SUPERLATIVE
+    RELATIVE
+    NAME
+    URL
+    EMAIL
+    EMOTICON
+
 
 # Flags
 cpdef enum FlagID:
     IS_ALPHA
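
Note: the enum blocks above give each morphological feature a small integer code, and every enum reserves its 0 value (NO_PERSON, NO_NUMBER, ...) for "unset", so zero-initialised token data carries no morphology. A rough pure-Python mirror of that convention (IntEnum and these class names are illustrative only; the real declarations are the C-level enums above):

from enum import IntEnum

# Mirrors the cpdef enums in spacy/en.pxd: each feature's 0 value means
# "unset", so a zero-filled Morphology record asserts nothing.
class EnPerson(IntEnum):
    NO_PERSON = 0
    FIRST = 1
    SECOND = 2
    THIRD = 3

class EnNumber(IntEnum):
    NO_NUMBER = 0
    SINGULAR = 1
    PLURAL = 2
    MASS = 3
    CARDINAL = 4
    ORDINAL = 5

# Features for a third-person singular form; anything unnamed stays 0.
features = {'person': EnPerson.THIRD, 'number': EnNumber.SINGULAR}
assert all(v != 0 for v in features.values())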

spacy/en.pyx | 73
@@ -35,6 +35,63 @@ from __future__ import unicode_literals
 cimport lang
 from .typedefs cimport flags_t
 import orth
+from .tagger cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT, VERB
+from .tagger cimport X, PUNCT, EOL
+
+
+POS_TAGS = {
+    'NULL': (NO_TAG, {}),
+    'EOL': (EOL, {}),
+    'CC': (CONJ, {}),
+    'CD': (NUM, {}),
+    'DT': (DET, {}),
+    'EX': (DET, {}),
+    'FW': (X, {}),
+    'IN': (ADP, {}),
+    'JJ': (ADJ, {}),
+    'JJR': (ADJ, {'misc': COMPARATIVE}),
+    'JJS': (ADJ, {'misc': SUPERLATIVE}),
+    'LS': (X, {}),
+    'MD': (VERB, {'tenspect': MODAL}),
+    'NN': (NOUN, {}),
+    'NNS': (NOUN, {'number': PLURAL}),
+    'NNP': (NOUN, {'misc': NAME}),
+    'NNPS': (NOUN, {'misc': NAME, 'number': PLURAL}),
+    'PDT': (DET, {}),
+    'POS': (PRT, {'case': GENITIVE}),
+    'PRP': (NOUN, {}),
+    'PRP$': (NOUN, {'case': GENITIVE}),
+    'RB': (ADV, {}),
+    'RBR': (ADV, {'misc': COMPARATIVE}),
+    'RBS': (ADV, {'misc': SUPERLATIVE}),
+    'RP': (PRT, {}),
+    'SYM': (X, {}),
+    'TO': (PRT, {}),
+    'UH': (X, {}),
+    'VB': (VERB, {}),
+    'VBD': (VERB, {'tenspect': PAST}),
+    'VBG': (VERB, {'tenspect': ING}),
+    'VBN': (VERB, {'tenspect': PASSIVE}),
+    'VBP': (VERB, {'tenspect': PRESENT}),
+    'VBZ': (VERB, {'tenspect': PRESENT, 'person': THIRD}),
+    'WDT': (DET, {'misc': RELATIVE}),
+    'WP': (PRON, {'misc': RELATIVE}),
+    'WP$': (PRON, {'misc': RELATIVE, 'case': GENITIVE}),
+    'WRB': (ADV, {'misc': RELATIVE}),
+    '!': (PUNCT, {}),
+    '#': (PUNCT, {}),
+    '$': (PUNCT, {}),
+    "''": (PUNCT, {}),
+    "(": (PUNCT, {}),
+    ")": (PUNCT, {}),
+    "-LRB-": (PUNCT, {}),
+    "-RRB-": (PUNCT, {}),
+    ".": (PUNCT, {}),
+    ",": (PUNCT, {}),
+    "``": (PUNCT, {}),
+    ":": (PUNCT, {}),
+    "?": (PUNCT, {}),
+}
 
 
 POS_TEMPLATES = (
@@ -91,19 +148,25 @@ cdef class English(Language):
     def set_pos(self, Tokens tokens):
         cdef int i
         cdef atom_t[N_CONTEXT_FIELDS] context
+        cdef TokenC* t = tokens.data
         for i in range(tokens.length):
-            fill_pos_context(context, i, tokens.data)
-            tokens.data[i].pos = self.pos_tagger.predict(context)
+            fill_pos_context(context, i, t)
+            t[i].pos = self.pos_tagger.predict(context)
+            #self.morphalyser.set_token(&t[i])
 
     def train_pos(self, Tokens tokens, golds):
         cdef int i
         cdef atom_t[N_CONTEXT_FIELDS] context
         c = 0
+        cdef TokenC* t = tokens.data
        for i in range(tokens.length):
-            fill_pos_context(context, i, tokens.data)
-            tokens.data[i].pos = self.pos_tagger.predict(context, [golds[i]])
-            c += tokens.data[i].pos == golds[i]
+            fill_pos_context(context, i, t)
+            t[i].pos = self.pos_tagger.predict(context, [golds[i]])
+            t[i].morph = self.pos_tagger.tags[t[i].pos].morph
+            #self.analyse_morph(&t[i].lemma, &t[i].morph, t[i].pos, t[i].lex)
+            c += t[i].pos == golds[i]
         return c
 
 
+
 EN = English('en')
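
Note: POS_TAGS decomposes each Penn Treebank tag into a coarse universal tag plus a dict of morphological features, and set_pos/train_pos then copy the pre-computed Morphology for the predicted tag onto the token. A minimal pure-Python sketch of that lookup (the integer constants are toy stand-ins for the cimported enum values):

# Toy stand-ins for the cimported enum constants (C-level in the real code).
NOUN, VERB, PLURAL, PAST, PRESENT, THIRD = range(1, 7)

# A small subset of the POS_TAGS table above.
POS_TAGS = {
    'NN':  (NOUN, {}),
    'NNS': (NOUN, {'number': PLURAL}),
    'VBD': (VERB, {'tenspect': PAST}),
    'VBZ': (VERB, {'tenspect': PRESENT, 'person': THIRD}),
}

def analyse(ptb_tag):
    """Split a fine-grained PTB tag into (coarse POS, morphological features)."""
    return POS_TAGS[ptb_tag]

print(analyse('NNS'))   # (1, {'number': 3})
print(analyse('VBZ'))   # (2, {'tenspect': 5, 'person': 6})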

spacy/lang.pxd
@@ -2,20 +2,20 @@ from libcpp.vector cimport vector
 
 from cpython cimport Py_UNICODE_ISSPACE, Py_UNICODE_ISALPHA, Py_UNICODE_ISUPPER
 
-from preshed.maps cimport PreshMap
+from preshed.maps cimport PreshMap, PreshMapArray
 from cymem.cymem cimport Pool
 
 from .typedefs cimport hash_t
 from .tokens cimport Tokens, TokenC
 from .lexeme cimport Lexeme
 from .tagger cimport Tagger
+from .tagger cimport PosTag
 from .utf8string cimport StringStore, UniStr
 
 
 cdef class Lexicon:
     cpdef public get_lex_props
     cdef Pool mem
-    cpdef readonly size_t size
     cpdef readonly StringStore strings
     cdef vector[Lexeme*] lexemes
@@ -29,13 +29,17 @@ cdef class Language:
     cdef readonly unicode name
     cdef PreshMap _cache
     cdef PreshMap _specials
+    cdef PreshMapArray _lemmas
     cpdef readonly Lexicon lexicon
     cpdef readonly Tagger pos_tagger
+    cpdef readonly object lemmatizer
 
     cdef object _prefix_re
     cdef object _suffix_re
     cdef object _infix_re
 
+    cdef int lemmatize(self, const PosTag* pos, const Lexeme* lex) except -1
+
     cpdef Tokens tokens_from_list(self, list strings)
     cpdef Tokens tokenize(self, unicode text)

spacy/lang.pyx
@@ -14,6 +14,7 @@ from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
 from cymem.cymem cimport Pool
 from murmurhash.mrmr cimport hash64
 from preshed.maps cimport PreshMap
+from .lemmatizer import Lemmatizer
 
 from .lexeme cimport Lexeme
 from .lexeme cimport EMPTY_LEXEME
@@ -26,6 +27,8 @@ from . import util
 from .util import read_lang_data
 from .tokens import Tokens
 
+from .tagger cimport NOUN, VERB, ADJ, N_UNIV_TAGS
+
 
 cdef class Language:
     def __init__(self, name):
@@ -39,14 +42,40 @@ cdef class Language:
         self._infix_re = re.compile(infix)
         self.lexicon = Lexicon(self.get_props)
         self._load_special_tokenization(rules)
+        self._lemmas = PreshMapArray(N_UNIV_TAGS)
         self.pos_tagger = None
+        self.lemmatizer = None
 
     def load(self):
+        self.lemmatizer = Lemmatizer(path.join(util.DATA_DIR, 'wordnet'))
         self.lexicon.load(path.join(util.DATA_DIR, self.name, 'lexemes'))
         self.lexicon.strings.load(path.join(util.DATA_DIR, self.name, 'strings'))
         if path.exists(path.join(util.DATA_DIR, self.name, 'pos')):
             self.pos_tagger = Tagger(path.join(util.DATA_DIR, self.name, 'pos'))
 
+    cdef int lemmatize(self, const PosTag* pos, const Lexeme* lex) except -1:
+        if self.lemmatizer is None:
+            return lex.sic
+        if pos.pos != NOUN and pos.pos != VERB and pos.pos != ADJ:
+            return lex.sic
+        cdef int lemma = <int><size_t>self._lemmas.get(pos.pos, lex.sic)
+        if lemma != 0:
+            return lemma
+        cdef bytes py_string = self.lexicon.strings[lex.sic]
+        cdef set lemma_strings
+        cdef bytes lemma_string
+        if pos.pos == NOUN:
+            lemma_strings = self.lemmatizer.noun(py_string)
+        elif pos.pos == VERB:
+            lemma_strings = self.lemmatizer.verb(py_string)
+        else:
+            assert pos.pos == ADJ
+            lemma_strings = self.lemmatizer.adj(py_string)
+        lemma_string = sorted(lemma_strings)[0]
+        lemma = self.lexicon.strings.intern(lemma_string, len(lemma_string)).i
+        self._lemmas.set(pos.pos, lex.sic, <void*>lemma)
+        return lemma
+
     cpdef Tokens tokens_from_list(self, list strings):
         cdef int length = sum([len(s) for s in strings])
         cdef Tokens tokens = Tokens(self.lexicon.strings, length)
@@ -254,9 +283,11 @@ cdef class Lexicon:
         self._map = PreshMap(2 ** 20)
         self.strings = StringStore()
         self.lexemes.push_back(&EMPTY_LEXEME)
-        self.size = 2
         self.get_lex_props = get_props
 
+    def __len__(self):
+        return self.lexemes.size()
+
     cdef const Lexeme* get(self, Pool mem, UniStr* string) except NULL:
         '''Get a pointer to a Lexeme from the lexicon, creating a new Lexeme
         if necessary, using memory acquired from the given pool. If the pool
@@ -269,14 +300,13 @@ cdef class Lexicon:
             mem = self.mem
         cdef unicode py_string = string.chars[:string.n]
         lex = <Lexeme*>mem.alloc(sizeof(Lexeme), 1)
-        lex[0] = lexeme_init(self.size, py_string, string.key, self.strings,
+        lex[0] = lexeme_init(self.lexemes.size(), py_string, string.key, self.strings,
                              self.get_lex_props(py_string))
         if mem is self.mem:
             self._map.set(string.key, lex)
             while self.lexemes.size() < (lex.id + 1):
                 self.lexemes.push_back(&EMPTY_LEXEME)
             self.lexemes[lex.id] = lex
-            self.size += 1
         else:
             lex[0].id = 1
         return lex
@@ -302,6 +332,8 @@ cdef class Lexicon:
         a dict if the operator is called from Python.
         '''
         if type(id_or_string) == int:
+            if id_or_string >= self.lexemes.size():
+                raise IndexError
             return self.lexemes.at(id_or_string)[0]
         cdef UniStr string
         slice_unicode(&string, id_or_string, 0, len(id_or_string))
@@ -359,5 +391,4 @@ cdef class Lexicon:
             self.lexemes.push_back(&EMPTY_LEXEME)
         self.lexemes[lexeme.id] = lexeme
         i += 1
-        self.size += 1
         fclose(fp)
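
Note: Language.lemmatize above memoises lemmas per (universal POS, orth id) in a PreshMapArray, treats 0 as "not computed yet", and only lemmatizes nouns, verbs and adjectives, interning the alphabetically first candidate. A pure-Python sketch of the same control flow (plain dicts stand in for PreshMapArray and the StringStore; ToyLemmatizer is hypothetical):

NOUN, VERB, ADJ = 1, 2, 3

class LemmaCache(object):
    """Sketch of Language.lemmatize: memoise by (pos, word id)."""
    def __init__(self, lemmatizer, strings):
        self.lemmatizer = lemmatizer  # exposes .noun/.verb/.adj -> set of strings
        self.strings = strings        # string interner: str -> nonzero int id
        self._lemmas = {}             # stands in for PreshMapArray(N_UNIV_TAGS)

    def lemmatize(self, pos, sic, word):
        # Only open-class words are lemmatized; everything else keeps its form.
        if pos not in (NOUN, VERB, ADJ):
            return sic
        cached = self._lemmas.get((pos, sic), 0)  # 0 == "not computed yet"
        if cached != 0:
            return cached
        get_candidates = {NOUN: self.lemmatizer.noun,
                          VERB: self.lemmatizer.verb,
                          ADJ: self.lemmatizer.adj}[pos]
        lemma_string = sorted(get_candidates(word))[0]  # deterministic pick
        lemma = self.strings.setdefault(lemma_string, len(self.strings) + 1)
        self._lemmas[(pos, sic)] = lemma
        return lemma

class ToyLemmatizer(object):  # hypothetical stand-in for the WordNet lemmatizer
    def noun(self, s): return {s.rstrip('s')}
    def verb(self, s): return {s}
    def adj(self, s): return {s}

cache = LemmaCache(ToyLemmatizer(), {})
print(cache.lemmatize(NOUN, sic=7, word='ponies'))  # 1: freshly interned
print(cache.lemmatize(NOUN, sic=7, word='ponies'))  # 1: served from the cache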

spacy/lemmatizer.py
@@ -53,6 +53,7 @@ class Lemmatizer(object):
 
 
 def lemmatize(string, index, exceptions, rules):
+    string = string.lower()
     forms = []
     if string in index:
         forms.append(string)
@@ -62,6 +63,8 @@ def lemmatize(string, index, exceptions, rules):
         form = string[:len(string) - len(old)] + new
         if form in index:
             forms.append(form)
+    if not forms:
+        forms.append(string)
     return set(forms)
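
Note: the two added lines make lemmatize() case-insensitive and guarantee a non-empty result. Only part of the function body is visible in these hunks; the sketch below is a self-contained approximation that assumes the elided middle applies an exception table and suffix rules in the usual WordNet-morphy fashion:

def lemmatize(string, index, exceptions, rules):
    string = string.lower()                    # added in this commit
    forms = []
    if string in index:
        forms.append(string)
    forms.extend(exceptions.get(string, []))   # assumed shape of the elided middle
    for old, new in rules:
        if string.endswith(old):
            form = string[:len(string) - len(old)] + new
            if form in index:
                forms.append(form)
    if not forms:                              # added in this commit: never empty
        forms.append(string)
    return set(forms)

index = {'aardvark', 'ox'}
rules = [('s', ''), ('es', '')]
print(lemmatize('Aardvarks', index, {}, rules))           # {'aardvark'}
print(lemmatize('oxen', index, {'oxen': ['ox']}, rules))  # {'ox'}
print(lemmatize('xyzzy', index, {}, rules))               # {'xyzzy'}: the new fallback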

(unnamed file in this view: PTB-to-universal tag mapping)
@@ -147,6 +147,7 @@ Y PRT
 Z NOUN
 ^ NOUN
 ~ X
-`` .""".strip().split('\n'))
+`` .
+EOL EOL""".strip().split('\n'))
     return mapping[tag]

spacy/tagger.pxd
@@ -1,11 +1,40 @@
+from libc.stdint cimport uint8_t
+
 from cymem.cymem cimport Pool
 
 from thinc.learner cimport LinearModel
 from thinc.features cimport Extractor
 from thinc.typedefs cimport atom_t, feat_t, weight_t, class_t
 
+from preshed.maps cimport PreshMapArray
+
 from .typedefs cimport hash_t
-from .tokens cimport Tokens
+from .tokens cimport Tokens, Morphology
 
 
+# Google universal tag set
+cdef enum univ_tag_t:
+    NO_TAG
+    ADJ
+    ADV
+    ADP
+    CONJ
+    DET
+    NOUN
+    NUM
+    PRON
+    PRT
+    VERB
+    X
+    PUNCT
+    EOL
+    N_UNIV_TAGS
+
+
+cdef struct PosTag:
+    Morphology morph
+    int id
+    univ_tag_t pos
+
+
 cdef class Tagger:
@@ -16,4 +45,5 @@ cdef class Tagger:
     cpdef readonly LinearModel model
 
     cpdef readonly list tag_names
+    cdef PosTag* tags
     cdef dict tagdict

spacy/tagger.pyx
@@ -12,13 +12,14 @@ import cython
 from thinc.features cimport Feature, count_feats
 
 
-def setup_model_dir(tag_names, tag_counts, templates, model_dir):
+def setup_model_dir(tag_names, tag_map, tag_counts, templates, model_dir):
     if path.exists(model_dir):
         shutil.rmtree(model_dir)
     os.mkdir(model_dir)
     config = {
         'templates': templates,
         'tag_names': tag_names,
+        'tag_map': tag_map,
         'tag_counts': tag_counts,
     }
     with open(path.join(model_dir, 'config.json'), 'w') as file_:
@@ -33,16 +34,31 @@ cdef class Tagger:
         self.mem = Pool()
         cfg = json.load(open(path.join(model_dir, 'config.json')))
         templates = cfg['templates']
+        tag_map = cfg['tag_map']
+        univ_counts = {}
+        cdef unicode tag
+        cdef unicode univ_tag
         self.tag_names = cfg['tag_names']
+        self.tags = <PosTag*>self.mem.alloc(len(self.tag_names), sizeof(PosTag))
+        for i, tag in enumerate(self.tag_names):
+            pos, props = tag_map[tag]
+            self.tags[i].id = i
+            self.tags[i].pos = pos
+            self.tags[i].morph.number = props.get('number', 0)
+            self.tags[i].morph.tenspect = props.get('tenspect', 0)
+            self.tags[i].morph.mood = props.get('mood', 0)
+            self.tags[i].morph.gender = props.get('gender', 0)
+            self.tags[i].morph.person = props.get('person', 0)
+            self.tags[i].morph.case = props.get('case', 0)
+            self.tags[i].morph.misc = props.get('misc', 0)
         self.tagdict = _make_tag_dict(cfg['tag_counts'])
         self.extractor = Extractor(templates)
         self.model = LinearModel(len(self.tag_names), self.extractor.n_templ+2)
         if path.exists(path.join(model_dir, 'model')):
             self.model.load(path.join(model_dir, 'model'))
 
-    cdef class_t predict(self, const atom_t* context, object golds=None) except *:
-        """Predict the tag of tokens[i]. The tagger remembers the features and
-        prediction, in case you later call tell_answer.
+    cdef class_t predict(self, atom_t* context, object golds=None) except *:
+        """Predict the tag of tokens[i].
 
         >>> tokens = EN.tokenize(u'An example sentence.')
         >>> tag = EN.pos_tagger.predict(0, tokens)
@@ -69,6 +85,24 @@ cdef class Tagger:
         return tag_id
 
 
+UNIV_TAGS = {
+    'NULL': NO_TAG,
+    'ADJ': ADJ,
+    'ADV': ADV,
+    'ADP': ADP,
+    'CONJ': CONJ,
+    'DET': DET,
+    'NOUN': NOUN,
+    'NUM': NUM,
+    'PRON': PRON,
+    'PRT': PRT,
+    'VERB': VERB,
+    'X': X,
+    '.': PUNCT,
+    'EOL': EOL
+}
+
+
 def _make_tag_dict(counts):
     freq_thresh = 50
     ambiguity_thresh = 0.98
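
Note: the tagger now persists the tag_map in config.json and rebuilds a PosTag table from it at load time, so a predicted tag id immediately yields its universal POS and Morphology. A plain-Python sketch of that round trip (dicts stand in for the C PosTag structs; the config layout follows the code above):

import json, os, tempfile

MORPH_FIELDS = ('number', 'tenspect', 'mood', 'gender', 'person', 'case', 'misc')

def setup_model_dir(tag_names, tag_map, tag_counts, templates, model_dir):
    # Mirrors the config written above: tag_map now travels with the model.
    config = {'templates': templates, 'tag_names': tag_names,
              'tag_map': tag_map, 'tag_counts': tag_counts}
    with open(os.path.join(model_dir, 'config.json'), 'w') as file_:
        json.dump(config, file_)

def load_tag_table(model_dir):
    # Mirrors Tagger.__init__: one entry per tag; unnamed features default to 0.
    cfg = json.load(open(os.path.join(model_dir, 'config.json')))
    tags = []
    for i, tag in enumerate(cfg['tag_names']):
        pos, props = cfg['tag_map'][tag]
        tags.append({'id': i, 'pos': pos,
                     'morph': {f: props.get(f, 0) for f in MORPH_FIELDS}})
    return tags

model_dir = tempfile.mkdtemp()
setup_model_dir(['NNS'], {'NNS': (1, {'number': 2})}, {}, [], model_dir)
print(load_tag_table(model_dir))
# [{'id': 0, 'pos': 1, 'morph': {'number': 2, 'tenspect': 0, ...}}]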

spacy/tokens.pxd
@@ -5,14 +5,29 @@ from cymem.cymem cimport Pool
 from thinc.typedefs cimport atom_t
 
 from .lexeme cimport Lexeme
 
 from .typedefs cimport flags_t
 from .utf8string cimport StringStore
+from libc.stdint cimport uint8_t, uint16_t
+
+
+cdef struct Morphology:
+    uint8_t number
+    uint8_t tenspect    # Tense/aspect/voice
+    uint8_t mood
+    uint8_t gender
+    uint8_t person
+    uint8_t case
+    uint8_t misc
 
 
 cdef struct TokenC:
     const Lexeme* lex
+    Morphology morph
     int idx
     int pos
+    int lemma
     int sense
@@ -37,7 +52,7 @@ cdef class Token:
     cdef public int i
     cdef public int idx
     cdef public int pos
-    cdef public int ner
+    cdef int lemma
 
     cdef public atom_t id
     cdef public atom_t cluster
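
Note: Morphology stores each feature in a single byte, so the whole analysis adds only seven bytes to TokenC and can be copied by value (as train_pos does with t[i].morph). A ctypes mirror of the layout, for illustration only:

import ctypes

class Morphology(ctypes.Structure):
    # One byte per feature; 0 always means "feature unset".
    _fields_ = [('number', ctypes.c_uint8),
                ('tenspect', ctypes.c_uint8),   # tense/aspect/voice
                ('mood', ctypes.c_uint8),
                ('gender', ctypes.c_uint8),
                ('person', ctypes.c_uint8),
                ('case', ctypes.c_uint8),
                ('misc', ctypes.c_uint8)]

m = Morphology(number=2, person=3)   # e.g. PLURAL, THIRD
assert ctypes.sizeof(Morphology) == 7
assert m.mood == 0                   # untouched fields default to "unset"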

spacy/tokens.pyx
@@ -51,7 +51,7 @@ cdef class Tokens:
     def __getitem__(self, i):
         bounds_check(i, self.length, PADDING)
         return Token(self._string_store, i, self.data[i].idx, self.data[i].pos,
-                     self.data[i].sense, self.data[i].lex[0])
+                     self.data[i].lemma, self.data[i].lex[0])
 
     def __iter__(self):
         for i in range(self.length):
@@ -128,14 +128,15 @@ cdef class Tokens:
 
 @cython.freelist(64)
 cdef class Token:
-    def __init__(self, StringStore string_store, int i, int idx, int pos, int ner,
+    def __init__(self, StringStore string_store, int i, int idx, int pos, int lemma,
                  dict lex):
         self._string_store = string_store
         self.idx = idx
         self.pos = pos
-        self.ner = ner
         self.i = i
         self.id = lex['id']
+
+        self.lemma = lemma
 
         self.cluster = lex['cluster']
         self.length = lex['length']
@@ -156,3 +157,10 @@ cdef class Token:
             return ''
         cdef bytes utf8string = self._string_store[self.sic]
         return utf8string.decode('utf8')
+
+    property lemma:
+        def __get__(self):
+            if self.lemma == 0:
+                return self.string
+            cdef bytes utf8string = self._string_store[self.lemma]
+            return utf8string.decode('utf8')
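
Note: the new lemma property decodes the interned lemma id back to text, falling back to the surface string while the id is still 0 (no lemma assigned yet). Assuming the EN object and the API shown in this commit's docstrings, a session would look roughly like this (a sketch against the 2014-era API, not something current spaCy exposes):

# Hypothetical session; the names (EN, load, tokenize, set_pos) all appear
# in the diffs above.
from spacy.en import EN

EN.load()
tokens = EN.tokenize(u'The oxen were grazing.')
EN.set_pos(tokens)
for token in tokens:
    # token.lemma falls back to token.string until a lemma id has been set.
    print(token.string, token.lemma)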