* Work on morphological processing

This commit is contained in:
Matthew Honnibal 2014-12-08 21:12:15 +11:00
parent 7b68f911cf
commit 99bbbb6feb
10 changed files with 261 additions and 21 deletions

View File

@ -5,6 +5,57 @@ from .tokens cimport Tokens
from .tokens cimport TokenC from .tokens cimport TokenC
# Grammatical person values stored in Morphology.person.
# NO_PERSON is the first member, so it takes the value 0; the tagger uses 0
# as the default when a tag's property dict has no 'person' entry.
cpdef enum en_person_t:
    NO_PERSON
    FIRST
    SECOND
    THIRD
# Grammatical number values stored in Morphology.number.
# NO_NUMBER is the first member (value 0), matching the tagger's default of 0
# for tags whose property dict has no 'number' entry.
cpdef enum en_number_t:
    NO_NUMBER
    SINGULAR
    PLURAL
    MASS
    CARDINAL
    ORDINAL
# Grammatical gender values stored in Morphology.gender.
# NO_GENDER is the first member (value 0), i.e. the "unset" default.
cpdef enum en_gender_t:
    NO_GENDER
    MASCULINE
    FEMININE
# Combined tense/aspect/voice values stored in Morphology.tenspect.
# NO_TENSE is the first member (value 0), i.e. the "unset" default.
# The tag table uses MODAL for MD, PAST for VBD, ING for VBG, PASSIVE for VBN
# and PRESENT for VBP/VBZ.
cpdef enum en_tenspect_t:
    NO_TENSE
    BASE_VERB
    PRESENT
    PAST
    PASSIVE
    ING
    MODAL
# Case values stored in Morphology.case.
# NO_CASE is the first member (value 0), i.e. the "unset" default.
# GENITIVE is the only member the tag table currently assigns (POS, PRP$, WP$).
cpdef enum en_case_t:
    NO_CASE
    NOMINATIVE
    ACCUSATIVE
    GENITIVE
    DEMONYM
# Miscellaneous features stored in Morphology.misc.
# NO_MISC is the first member (value 0), i.e. the "unset" default.
# The tag table assigns COMPARATIVE, SUPERLATIVE, RELATIVE and NAME.
# NOTE(review): URL, EMAIL and EMOTICON are not referenced in the visible
# hunks -- confirm where they are set.
cpdef enum misc_t:
    NO_MISC
    COMPARATIVE
    SUPERLATIVE
    RELATIVE
    NAME
    URL
    EMAIL
    EMOTICON
# Flags # Flags
cpdef enum FlagID: cpdef enum FlagID:
IS_ALPHA IS_ALPHA

View File

@ -35,6 +35,63 @@ from __future__ import unicode_literals
cimport lang cimport lang
from .typedefs cimport flags_t from .typedefs cimport flags_t
import orth import orth
from .tagger cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT, VERB
from .tagger cimport X, PUNCT, EOL
# Maps each fine-grained (Penn Treebank style) tag to a pair:
#   (universal tag, {morphology field name: enum value})
# The property dict keys correspond to fields of the Morphology struct
# ('number', 'tenspect', 'person', 'case', 'misc'); fields that are absent
# are filled with 0 by the tagger.
#
# Personal and possessive pronouns (PRP, PRP$) are mapped to PRON, in line
# with the wh-pronouns (WP, WP$) below. Mapping them to NOUN would make
# Language.lemmatize() run the noun lemmatizer over pronouns, because it
# lemmatizes every token whose universal tag is NOUN.
POS_TAGS = {
    'NULL': (NO_TAG, {}),
    'EOL': (EOL, {}),
    'CC': (CONJ, {}),
    'CD': (NUM, {}),
    'DT': (DET, {}),
    'EX': (DET, {}),
    'FW': (X, {}),
    'IN': (ADP, {}),
    'JJ': (ADJ, {}),
    'JJR': (ADJ, {'misc': COMPARATIVE}),
    'JJS': (ADJ, {'misc': SUPERLATIVE}),
    'LS': (X, {}),
    'MD': (VERB, {'tenspect': MODAL}),
    'NN': (NOUN, {}),
    'NNS': (NOUN, {'number': PLURAL}),
    'NNP': (NOUN, {'misc': NAME}),
    'NNPS': (NOUN, {'misc': NAME, 'number': PLURAL}),
    'PDT': (DET, {}),
    'POS': (PRT, {'case': GENITIVE}),
    'PRP': (PRON, {}),
    'PRP$': (PRON, {'case': GENITIVE}),
    'RB': (ADV, {}),
    'RBR': (ADV, {'misc': COMPARATIVE}),
    'RBS': (ADV, {'misc': SUPERLATIVE}),
    'RP': (PRT, {}),
    'SYM': (X, {}),
    'TO': (PRT, {}),
    'UH': (X, {}),
    'VB': (VERB, {}),
    'VBD': (VERB, {'tenspect': PAST}),
    'VBG': (VERB, {'tenspect': ING}),
    'VBN': (VERB, {'tenspect': PASSIVE}),
    'VBP': (VERB, {'tenspect': PRESENT}),
    'VBZ': (VERB, {'tenspect': PRESENT, 'person': THIRD}),
    'WDT': (DET, {'misc': RELATIVE}),
    'WP': (PRON, {'misc': RELATIVE}),
    'WP$': (PRON, {'misc': RELATIVE, 'case': GENITIVE}),
    'WRB': (ADV, {'misc': RELATIVE}),
    '!': (PUNCT, {}),
    '#': (PUNCT, {}),
    '$': (PUNCT, {}),
    "''": (PUNCT, {}),
    "(": (PUNCT, {}),
    ")": (PUNCT, {}),
    "-LRB-": (PUNCT, {}),
    "-RRB-": (PUNCT, {}),
    ".": (PUNCT, {}),
    ",": (PUNCT, {}),
    "``": (PUNCT, {}),
    ":": (PUNCT, {}),
    "?": (PUNCT, {}),
}
POS_TEMPLATES = ( POS_TEMPLATES = (
@ -91,19 +148,25 @@ cdef class English(Language):
def set_pos(self, Tokens tokens): def set_pos(self, Tokens tokens):
cdef int i cdef int i
cdef atom_t[N_CONTEXT_FIELDS] context cdef atom_t[N_CONTEXT_FIELDS] context
cdef TokenC* t = tokens.data
for i in range(tokens.length): for i in range(tokens.length):
fill_pos_context(context, i, tokens.data) fill_pos_context(context, i, t)
tokens.data[i].pos = self.pos_tagger.predict(context) t[i].pos = self.pos_tagger.predict(context)
#self.morphalyser.set_token(&t[i])
def train_pos(self, Tokens tokens, golds): def train_pos(self, Tokens tokens, golds):
cdef int i cdef int i
cdef atom_t[N_CONTEXT_FIELDS] context cdef atom_t[N_CONTEXT_FIELDS] context
c = 0 c = 0
cdef TokenC* t = tokens.data
for i in range(tokens.length): for i in range(tokens.length):
fill_pos_context(context, i, tokens.data) fill_pos_context(context, i, t)
tokens.data[i].pos = self.pos_tagger.predict(context, [golds[i]]) t[i].pos = self.pos_tagger.predict(context, [golds[i]])
c += tokens.data[i].pos == golds[i] t[i].morph = self.pos_tagger.tags[t[i].pos].morph
#self.analyse_morph(&t[i].lemma, &t[i].morph, t[i].pos, t[i].lex)
c += t[i].pos == golds[i]
return c return c
EN = English('en') EN = English('en')

View File

@ -2,20 +2,20 @@ from libcpp.vector cimport vector
from cpython cimport Py_UNICODE_ISSPACE, Py_UNICODE_ISALPHA, Py_UNICODE_ISUPPER from cpython cimport Py_UNICODE_ISSPACE, Py_UNICODE_ISALPHA, Py_UNICODE_ISUPPER
from preshed.maps cimport PreshMap from preshed.maps cimport PreshMap, PreshMapArray
from cymem.cymem cimport Pool from cymem.cymem cimport Pool
from .typedefs cimport hash_t from .typedefs cimport hash_t
from .tokens cimport Tokens, TokenC from .tokens cimport Tokens, TokenC
from .lexeme cimport Lexeme from .lexeme cimport Lexeme
from .tagger cimport Tagger from .tagger cimport Tagger
from .tagger cimport PosTag
from .utf8string cimport StringStore, UniStr from .utf8string cimport StringStore, UniStr
cdef class Lexicon: cdef class Lexicon:
cpdef public get_lex_props cpdef public get_lex_props
cdef Pool mem cdef Pool mem
cpdef readonly size_t size
cpdef readonly StringStore strings cpdef readonly StringStore strings
cdef vector[Lexeme*] lexemes cdef vector[Lexeme*] lexemes
@ -29,13 +29,17 @@ cdef class Language:
cdef readonly unicode name cdef readonly unicode name
cdef PreshMap _cache cdef PreshMap _cache
cdef PreshMap _specials cdef PreshMap _specials
cdef PreshMapArray _lemmas
cpdef readonly Lexicon lexicon cpdef readonly Lexicon lexicon
cpdef readonly Tagger pos_tagger cpdef readonly Tagger pos_tagger
cpdef readonly object lemmatizer
cdef object _prefix_re cdef object _prefix_re
cdef object _suffix_re cdef object _suffix_re
cdef object _infix_re cdef object _infix_re
cdef int lemmatize(self, const PosTag* pos, const Lexeme* lex) except -1
cpdef Tokens tokens_from_list(self, list strings) cpdef Tokens tokens_from_list(self, list strings)
cpdef Tokens tokenize(self, unicode text) cpdef Tokens tokenize(self, unicode text)

View File

@ -14,6 +14,7 @@ from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
from cymem.cymem cimport Pool from cymem.cymem cimport Pool
from murmurhash.mrmr cimport hash64 from murmurhash.mrmr cimport hash64
from preshed.maps cimport PreshMap from preshed.maps cimport PreshMap
from .lemmatizer import Lemmatizer
from .lexeme cimport Lexeme from .lexeme cimport Lexeme
from .lexeme cimport EMPTY_LEXEME from .lexeme cimport EMPTY_LEXEME
@ -26,6 +27,8 @@ from . import util
from .util import read_lang_data from .util import read_lang_data
from .tokens import Tokens from .tokens import Tokens
from .tagger cimport NOUN, VERB, ADJ, N_UNIV_TAGS
cdef class Language: cdef class Language:
def __init__(self, name): def __init__(self, name):
@ -39,14 +42,40 @@ cdef class Language:
self._infix_re = re.compile(infix) self._infix_re = re.compile(infix)
self.lexicon = Lexicon(self.get_props) self.lexicon = Lexicon(self.get_props)
self._load_special_tokenization(rules) self._load_special_tokenization(rules)
self._lemmas = PreshMapArray(N_UNIV_TAGS)
self.pos_tagger = None self.pos_tagger = None
self.lemmatizer = None
def load(self): def load(self):
self.lemmatizer = Lemmatizer(path.join(util.DATA_DIR, 'wordnet'))
self.lexicon.load(path.join(util.DATA_DIR, self.name, 'lexemes')) self.lexicon.load(path.join(util.DATA_DIR, self.name, 'lexemes'))
self.lexicon.strings.load(path.join(util.DATA_DIR, self.name, 'strings')) self.lexicon.strings.load(path.join(util.DATA_DIR, self.name, 'strings'))
if path.exists(path.join(util.DATA_DIR, self.name, 'pos')): if path.exists(path.join(util.DATA_DIR, self.name, 'pos')):
self.pos_tagger = Tagger(path.join(util.DATA_DIR, self.name, 'pos')) self.pos_tagger = Tagger(path.join(util.DATA_DIR, self.name, 'pos'))
cdef int lemmatize(self, const PosTag* pos, const Lexeme* lex) except -1:
    """Return the string-store id of the lemma for `lex` under tag `pos`.

    Only NOUN, VERB and ADJ tokens are lemmatized; for any other universal
    tag, or when no lemmatizer has been loaded, the lexeme's own `sic` id
    is returned unchanged. Results are memoised in `self._lemmas`, keyed by
    (universal tag, sic).
    """
    cdef int lemma
    cdef bytes py_string
    cdef bytes lemma_string
    cdef set lemma_strings
    if self.lemmatizer is None:
        return lex.sic
    if pos.pos != NOUN and pos.pos != VERB and pos.pos != ADJ:
        return lex.sic
    # A stored value of 0 doubles as the "not cached yet" marker.
    lemma = <int><size_t>self._lemmas.get(pos.pos, lex.sic)
    if lemma != 0:
        return lemma
    py_string = self.lexicon.strings[lex.sic]
    if pos.pos == NOUN:
        lemma_strings = self.lemmatizer.noun(py_string)
    elif pos.pos == VERB:
        lemma_strings = self.lemmatizer.verb(py_string)
    else:
        assert pos.pos == ADJ
        lemma_strings = self.lemmatizer.adj(py_string)
    # Several candidate lemmas may come back; take the lexicographically
    # smallest so the choice is deterministic.
    lemma_string = min(lemma_strings)
    lemma = self.lexicon.strings.intern(lemma_string, len(lemma_string)).i
    # Widen through size_t before the pointer cast, mirroring the read above,
    # so the int is not cast directly to a pointer of a different width.
    self._lemmas.set(pos.pos, lex.sic, <void*><size_t>lemma)
    return lemma
cpdef Tokens tokens_from_list(self, list strings): cpdef Tokens tokens_from_list(self, list strings):
cdef int length = sum([len(s) for s in strings]) cdef int length = sum([len(s) for s in strings])
cdef Tokens tokens = Tokens(self.lexicon.strings, length) cdef Tokens tokens = Tokens(self.lexicon.strings, length)
@ -254,9 +283,11 @@ cdef class Lexicon:
self._map = PreshMap(2 ** 20) self._map = PreshMap(2 ** 20)
self.strings = StringStore() self.strings = StringStore()
self.lexemes.push_back(&EMPTY_LEXEME) self.lexemes.push_back(&EMPTY_LEXEME)
self.size = 2
self.get_lex_props = get_props self.get_lex_props = get_props
def __len__(self):
    """Return the number of slots in the `lexemes` vector."""
    # NOTE(review): __init__ pushes an EMPTY_LEXEME placeholder into the
    # vector, so this count presumably includes it -- confirm with callers.
    return self.lexemes.size()
cdef const Lexeme* get(self, Pool mem, UniStr* string) except NULL: cdef const Lexeme* get(self, Pool mem, UniStr* string) except NULL:
'''Get a pointer to a Lexeme from the lexicon, creating a new Lexeme '''Get a pointer to a Lexeme from the lexicon, creating a new Lexeme
if necessary, using memory acquired from the given pool. If the pool if necessary, using memory acquired from the given pool. If the pool
@ -269,14 +300,13 @@ cdef class Lexicon:
mem = self.mem mem = self.mem
cdef unicode py_string = string.chars[:string.n] cdef unicode py_string = string.chars[:string.n]
lex = <Lexeme*>mem.alloc(sizeof(Lexeme), 1) lex = <Lexeme*>mem.alloc(sizeof(Lexeme), 1)
lex[0] = lexeme_init(self.size, py_string, string.key, self.strings, lex[0] = lexeme_init(self.lexemes.size(), py_string, string.key, self.strings,
self.get_lex_props(py_string)) self.get_lex_props(py_string))
if mem is self.mem: if mem is self.mem:
self._map.set(string.key, lex) self._map.set(string.key, lex)
while self.lexemes.size() < (lex.id + 1): while self.lexemes.size() < (lex.id + 1):
self.lexemes.push_back(&EMPTY_LEXEME) self.lexemes.push_back(&EMPTY_LEXEME)
self.lexemes[lex.id] = lex self.lexemes[lex.id] = lex
self.size += 1
else: else:
lex[0].id = 1 lex[0].id = 1
return lex return lex
@ -302,6 +332,8 @@ cdef class Lexicon:
a dict if the operator is called from Python. a dict if the operator is called from Python.
''' '''
if type(id_or_string) == int: if type(id_or_string) == int:
if id_or_string >= self.lexemes.size():
raise IndexError
return self.lexemes.at(id_or_string)[0] return self.lexemes.at(id_or_string)[0]
cdef UniStr string cdef UniStr string
slice_unicode(&string, id_or_string, 0, len(id_or_string)) slice_unicode(&string, id_or_string, 0, len(id_or_string))
@ -359,5 +391,4 @@ cdef class Lexicon:
self.lexemes.push_back(&EMPTY_LEXEME) self.lexemes.push_back(&EMPTY_LEXEME)
self.lexemes[lexeme.id] = lexeme self.lexemes[lexeme.id] = lexeme
i += 1 i += 1
self.size += 1
fclose(fp) fclose(fp)

View File

@ -53,6 +53,7 @@ class Lemmatizer(object):
def lemmatize(string, index, exceptions, rules): def lemmatize(string, index, exceptions, rules):
string = string.lower()
forms = [] forms = []
if string in index: if string in index:
forms.append(string) forms.append(string)
@ -62,6 +63,8 @@ def lemmatize(string, index, exceptions, rules):
form = string[:len(string) - len(old)] + new form = string[:len(string) - len(old)] + new
if form in index: if form in index:
forms.append(form) forms.append(form)
if not forms:
forms.append(string)
return set(forms) return set(forms)

View File

@ -147,6 +147,7 @@ Y PRT
Z NOUN Z NOUN
^ NOUN ^ NOUN
~ X ~ X
`` .""".strip().split('\n')) `` .
EOL EOL""".strip().split('\n'))
return mapping[tag] return mapping[tag]

View File

@ -1,11 +1,40 @@
from libc.stdint cimport uint8_t
from cymem.cymem cimport Pool from cymem.cymem cimport Pool
from thinc.learner cimport LinearModel from thinc.learner cimport LinearModel
from thinc.features cimport Extractor from thinc.features cimport Extractor
from thinc.typedefs cimport atom_t, feat_t, weight_t, class_t from thinc.typedefs cimport atom_t, feat_t, weight_t, class_t
from preshed.maps cimport PreshMapArray
from .typedefs cimport hash_t from .typedefs cimport hash_t
from .tokens cimport Tokens from .tokens cimport Tokens, Morphology
# Google universal tag set
# NO_TAG is the first member (value 0). EOL is an extra end-of-line tag.
# N_UNIV_TAGS is last, so it equals the number of tags before it; Language
# uses it to size its per-tag lemma cache (PreshMapArray(N_UNIV_TAGS)).
cdef enum univ_tag_t:
    NO_TAG
    ADJ
    ADV
    ADP
    CONJ
    DET
    NOUN
    NUM
    PRON
    PRT
    VERB
    X
    PUNCT
    EOL
    N_UNIV_TAGS
# One entry per fine-grained tag known to a Tagger.
# Tagger.__init__ fills these from the model's tag map: `id` is the tag's
# index in tag_names, `pos` its universal tag, and `morph` the morphological
# features implied by the tag (fields without a property are set to 0).
cdef struct PosTag:
    Morphology morph
    int id
    univ_tag_t pos
cdef class Tagger: cdef class Tagger:
@ -16,4 +45,5 @@ cdef class Tagger:
cpdef readonly LinearModel model cpdef readonly LinearModel model
cpdef readonly list tag_names cpdef readonly list tag_names
cdef PosTag* tags
cdef dict tagdict cdef dict tagdict

View File

@ -12,13 +12,14 @@ import cython
from thinc.features cimport Feature, count_feats from thinc.features cimport Feature, count_feats
def setup_model_dir(tag_names, tag_counts, templates, model_dir): def setup_model_dir(tag_names, tag_map, tag_counts, templates, model_dir):
if path.exists(model_dir): if path.exists(model_dir):
shutil.rmtree(model_dir) shutil.rmtree(model_dir)
os.mkdir(model_dir) os.mkdir(model_dir)
config = { config = {
'templates': templates, 'templates': templates,
'tag_names': tag_names, 'tag_names': tag_names,
'tag_map': tag_map,
'tag_counts': tag_counts, 'tag_counts': tag_counts,
} }
with open(path.join(model_dir, 'config.json'), 'w') as file_: with open(path.join(model_dir, 'config.json'), 'w') as file_:
@ -33,16 +34,31 @@ cdef class Tagger:
self.mem = Pool() self.mem = Pool()
cfg = json.load(open(path.join(model_dir, 'config.json'))) cfg = json.load(open(path.join(model_dir, 'config.json')))
templates = cfg['templates'] templates = cfg['templates']
tag_map = cfg['tag_map']
univ_counts = {}
cdef unicode tag
cdef unicode univ_tag
self.tag_names = cfg['tag_names'] self.tag_names = cfg['tag_names']
self.tags = <PosTag*>self.mem.alloc(len(self.tag_names), sizeof(PosTag))
for i, tag in enumerate(self.tag_names):
pos, props = tag_map[tag]
self.tags[i].id = i
self.tags[i].pos = pos
self.tags[i].morph.number = props.get('number', 0)
self.tags[i].morph.tenspect = props.get('tenspect', 0)
self.tags[i].morph.mood = props.get('mood', 0)
self.tags[i].morph.gender = props.get('gender', 0)
self.tags[i].morph.person = props.get('person', 0)
self.tags[i].morph.case = props.get('case', 0)
self.tags[i].morph.misc = props.get('misc', 0)
self.tagdict = _make_tag_dict(cfg['tag_counts']) self.tagdict = _make_tag_dict(cfg['tag_counts'])
self.extractor = Extractor(templates) self.extractor = Extractor(templates)
self.model = LinearModel(len(self.tag_names), self.extractor.n_templ+2) self.model = LinearModel(len(self.tag_names), self.extractor.n_templ+2)
if path.exists(path.join(model_dir, 'model')): if path.exists(path.join(model_dir, 'model')):
self.model.load(path.join(model_dir, 'model')) self.model.load(path.join(model_dir, 'model'))
cdef class_t predict(self, const atom_t* context, object golds=None) except *: cdef class_t predict(self, atom_t* context, object golds=None) except *:
"""Predict the tag of tokens[i]. The tagger remembers the features and """Predict the tag of tokens[i].
prediction, in case you later call tell_answer.
>>> tokens = EN.tokenize(u'An example sentence.') >>> tokens = EN.tokenize(u'An example sentence.')
>>> tag = EN.pos_tagger.predict(0, tokens) >>> tag = EN.pos_tagger.predict(0, tokens)
@ -69,6 +85,24 @@ cdef class Tagger:
return tag_id return tag_id
# Maps universal tag names (as strings) to their univ_tag_t enum members.
# Two keys differ from the enum member names: 'NULL' maps to NO_TAG and
# '.' maps to PUNCT.
UNIV_TAGS = {
    'NULL': NO_TAG,
    'ADJ': ADJ,
    'ADV': ADV,
    'ADP': ADP,
    'CONJ': CONJ,
    'DET': DET,
    'NOUN': NOUN,
    'NUM': NUM,
    'PRON': PRON,
    'PRT': PRT,
    'VERB': VERB,
    'X': X,
    '.': PUNCT,
    'EOL': EOL
}
def _make_tag_dict(counts): def _make_tag_dict(counts):
freq_thresh = 50 freq_thresh = 50
ambiguity_thresh = 0.98 ambiguity_thresh = 0.98

View File

@ -5,14 +5,29 @@ from cymem.cymem cimport Pool
from thinc.typedefs cimport atom_t from thinc.typedefs cimport atom_t
from .lexeme cimport Lexeme from .lexeme cimport Lexeme
from .typedefs cimport flags_t from .typedefs cimport flags_t
from .utf8string cimport StringStore from .utf8string cimport StringStore
from libc.stdint cimport uint8_t, uint16_t
# Morphological features of a token, one byte per feature.
# Each field holds a small integer code; the tagger sets a field to 0 when the
# tag's property dict does not mention it.
# NOTE(review): the codes are presumably members of the en_*_t / misc_t enums
# for English -- no enum for `mood` is visible in this view.
cdef struct Morphology:
    uint8_t number
    uint8_t tenspect # Tense/aspect/voice
    uint8_t mood
    uint8_t gender
    uint8_t person
    uint8_t case
    uint8_t misc
cdef struct TokenC: cdef struct TokenC:
const Lexeme* lex const Lexeme* lex
Morphology morph
int idx int idx
int pos int pos
int lemma
int sense int sense
@ -37,7 +52,7 @@ cdef class Token:
cdef public int i cdef public int i
cdef public int idx cdef public int idx
cdef public int pos cdef public int pos
cdef public int ner cdef int lemma
cdef public atom_t id cdef public atom_t id
cdef public atom_t cluster cdef public atom_t cluster

View File

@ -51,7 +51,7 @@ cdef class Tokens:
def __getitem__(self, i): def __getitem__(self, i):
bounds_check(i, self.length, PADDING) bounds_check(i, self.length, PADDING)
return Token(self._string_store, i, self.data[i].idx, self.data[i].pos, return Token(self._string_store, i, self.data[i].idx, self.data[i].pos,
self.data[i].sense, self.data[i].lex[0]) self.data[i].lemma, self.data[i].lex[0])
def __iter__(self): def __iter__(self):
for i in range(self.length): for i in range(self.length):
@ -128,14 +128,15 @@ cdef class Tokens:
@cython.freelist(64) @cython.freelist(64)
cdef class Token: cdef class Token:
def __init__(self, StringStore string_store, int i, int idx, int pos, int ner, def __init__(self, StringStore string_store, int i, int idx, int pos, int lemma,
dict lex): dict lex):
self._string_store = string_store self._string_store = string_store
self.idx = idx self.idx = idx
self.pos = pos self.pos = pos
self.ner = ner
self.i = i self.i = i
self.id = lex['id'] self.id = lex['id']
self.lemma = lemma
self.cluster = lex['cluster'] self.cluster = lex['cluster']
self.length = lex['length'] self.length = lex['length']
@ -156,3 +157,10 @@ cdef class Token:
return '' return ''
cdef bytes utf8string = self._string_store[self.sic] cdef bytes utf8string = self._string_store[self.sic]
return utf8string.decode('utf8') return utf8string.decode('utf8')
property lemma:
    # NOTE(review): the Token declaration hunk also adds a `cdef int lemma`
    # attribute with the same name as this property, and the getter reads
    # `self.lemma` itself. Confirm this compiles and resolves to the stored
    # integer id rather than to this property.
    def __get__(self):
        """Return the token's lemma as a unicode string.

        When the stored lemma id is 0, fall back to the token's own string;
        otherwise look the id up in the string store and decode it as UTF-8.
        """
        if self.lemma == 0:
            return self.string
        cdef bytes utf8string = self._string_store[self.lemma]
        return utf8string.decode('utf8')