* Move morphological analysis into its own module, morphology.pyx

commit 6b34a2f34b (parent b962fe73d7)
Author: Matthew Honnibal
Date:   2014-12-09 21:16:17 +11:00

7 changed files with 135 additions and 97 deletions

spacy/en.pyx

@@ -35,8 +35,8 @@ from __future__ import unicode_literals
 cimport lang
 from .typedefs cimport flags_t
 import orth
-from .tagger cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT, VERB
-from .tagger cimport X, PUNCT, EOL
+from .morphology cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT, VERB
+from .morphology cimport X, PUNCT, EOL
 from .tokens cimport Morphology
@@ -154,8 +154,8 @@ cdef class English(Language):
         for i in range(tokens.length):
             fill_pos_context(context, i, t)
             t[i].pos = self.pos_tagger.predict(context)
-            _merge_morph(&t[i].morph, &self.pos_tagger.tags[t[i].pos].morph)
-            t[i].lemma = self.lemmatize(self.pos_tagger.tags[t[i].pos].pos, t[i].lex)
+            if self.morphologizer:
+                self.morphologizer.set_morph(i, t)

     def train_pos(self, Tokens tokens, golds):
         cdef int i
@@ -165,8 +165,8 @@ cdef class English(Language):
         for i in range(tokens.length):
             fill_pos_context(context, i, t)
             t[i].pos = self.pos_tagger.predict(context, [golds[i]])
-            _merge_morph(&t[i].morph, &self.pos_tagger.tags[t[i].pos].morph)
-            t[i].lemma = self.lemmatize(self.pos_tagger.tags[t[i].pos].pos, t[i].lex)
+            if self.morphologizer:
+                self.morphologizer.set_morph(i, t)
             c += t[i].pos == golds[i]
         return c
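
The net effect on English: after predicting a tag id, the class defers lemmatization
and morphology to the Morphologizer instead of doing the work inline. A minimal
pure-Python sketch of the new control flow (the plain-Python token objects and the
fill_pos_context parameter are hypothetical stand-ins for the TokenC array and the
Cython feature extractor above; this is not the real implementation):

    # Sketch only: predict the fine-grained tag, then let the Morphologizer
    # assign lemma + morphological features for that tag.
    def set_pos(tokens, pos_tagger, morphologizer, fill_pos_context):
        for i in range(len(tokens)):
            context = fill_pos_context(tokens, i)   # feature extraction
            tokens[i].pos = pos_tagger.predict(context)
            if morphologizer is not None:
                morphologizer.set_morph(i, tokens)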

spacy/lang.pxd

@@ -2,15 +2,15 @@ from libcpp.vector cimport vector
 from cpython cimport Py_UNICODE_ISSPACE, Py_UNICODE_ISALPHA, Py_UNICODE_ISUPPER
-from preshed.maps cimport PreshMap, PreshMapArray
+from preshed.maps cimport PreshMap
 from cymem.cymem cimport Pool

 from .typedefs cimport hash_t
 from .tokens cimport Tokens, TokenC
 from .lexeme cimport Lexeme
 from .tagger cimport Tagger
-from .tagger cimport univ_tag_t
 from .utf8string cimport StringStore, UniStr
+from .morphology cimport Morphologizer


 cdef union LexemesOrTokens:
@@ -40,17 +40,14 @@ cdef class Language:
     cdef readonly unicode name
     cdef PreshMap _cache
     cdef PreshMap _specials
-    cdef PreshMapArray _lemmas
     cpdef readonly Lexicon lexicon
     cpdef readonly Tagger pos_tagger
-    cpdef readonly object lemmatizer
+    cpdef readonly Morphologizer morphologizer

     cdef object _prefix_re
     cdef object _suffix_re
     cdef object _infix_re

-    cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1
-
     cpdef Tokens tokens_from_list(self, list strings)
     cpdef Tokens tokenize(self, unicode text)

spacy/lang.pyx

@@ -14,7 +14,6 @@ from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
 from cymem.cymem cimport Pool
 from murmurhash.mrmr cimport hash64
 from preshed.maps cimport PreshMap
-from .lemmatizer import Lemmatizer

 from .lexeme cimport Lexeme
 from .lexeme cimport EMPTY_LEXEME
@@ -26,8 +25,6 @@ from .utf8string cimport slice_unicode
 from . import util
 from .util import read_lang_data
 from .tokens import Tokens
-from .tagger cimport NOUN, VERB, ADJ, N_UNIV_TAGS
-
 from .tokens cimport Morphology
@@ -43,39 +40,16 @@ cdef class Language:
         self._infix_re = re.compile(infix)
         self.lexicon = Lexicon(self.get_props)
         self._load_special_tokenization(rules)
-        self._lemmas = PreshMapArray(N_UNIV_TAGS)
         self.pos_tagger = None
-        self.lemmatizer = None
+        self.morphologizer = None

     def load(self):
-        self.lemmatizer = Lemmatizer(path.join(util.DATA_DIR, 'wordnet'))
         self.lexicon.load(path.join(util.DATA_DIR, self.name, 'lexemes'))
         self.lexicon.strings.load(path.join(util.DATA_DIR, self.name, 'strings'))
         if path.exists(path.join(util.DATA_DIR, self.name, 'pos')):
             self.pos_tagger = Tagger(path.join(util.DATA_DIR, self.name, 'pos'))
-
-    cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1:
-        if self.lemmatizer is None:
-            return lex.sic
-        if pos != NOUN and pos != VERB and pos != ADJ:
-            return lex.sic
-        cdef int lemma = <int><size_t>self._lemmas.get(pos, lex.sic)
-        if lemma != 0:
-            return lemma
-        cdef bytes py_string = self.lexicon.strings[lex.sic]
-        cdef set lemma_strings
-        cdef bytes lemma_string
-        if pos == NOUN:
-            lemma_strings = self.lemmatizer.noun(py_string)
-        elif pos == VERB:
-            lemma_strings = self.lemmatizer.verb(py_string)
-        else:
-            assert pos == ADJ
-            lemma_strings = self.lemmatizer.adj(py_string)
-        lemma_string = sorted(lemma_strings)[0]
-        lemma = self.lexicon.strings.intern(lemma_string, len(lemma_string)).i
-        self._lemmas.set(pos, lex.sic, <void*>lemma)
-        return lemma
+            self.morphologizer = Morphologizer(self.lexicon.strings,
+                                               path.join(util.DATA_DIR, self.name))

     cpdef Tokens tokens_from_list(self, list strings):
         cdef int length = sum([len(s) for s in strings])
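
Read together, the path.join calls in load() above imply an on-disk layout along
these lines (inferred from the diff; 'en' stands in for whatever Language.name is):

    DATA_DIR/
        wordnet/            # WordNet files consumed by the Lemmatizer
        en/                 # one directory per language (Language.name)
            lexemes         # lexicon, loaded by Lexicon.load()
            strings         # string store
            pos/            # presence of this directory enables tagging
                config.json # tag_names, tag_map, templates, tag_counts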

spacy/morphology.pxd (new file, 42 lines)

@@ -0,0 +1,42 @@
+from .tokens cimport TokenC, Morphology
+from .lexeme cimport Lexeme
+from .utf8string cimport StringStore
+
+from preshed.maps cimport PreshMapArray
+from cymem.cymem cimport Pool
+
+
+# Google universal tag set
+cpdef enum univ_tag_t:
+    NO_TAG
+    ADJ
+    ADV
+    ADP
+    CONJ
+    DET
+    NOUN
+    NUM
+    PRON
+    PRT
+    VERB
+    X
+    PUNCT
+    EOL
+    N_UNIV_TAGS
+
+
+cdef struct PosTag:
+    Morphology morph
+    int id
+    univ_tag_t pos
+
+
+cdef class Morphologizer:
+    cdef Pool mem
+    cdef StringStore strings
+    cdef object lemmatizer
+    cdef PosTag* tags
+
+    cdef PreshMapArray _morph
+    cdef PreshMapArray _lemmas
+
+    cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1
+    cdef int set_morph(self, const int i, TokenC* tokens) except -1
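
Morphologizer.__init__ (below) unpacks each tag_map entry from the tagger's
config.json as `pos, props = tag_map[tag]`, reading the PosTag fields declared
here. A hypothetical entry, assuming the universal POS is stored as the integer
value of univ_tag_t (NOUN == 6, VERB == 10 in the enum order above) and that any
feature omitted from props defaults to 0; the tag names and feature codes are
illustrative, not taken from spaCy's shipped data:

    "tag_map": {
        "NNS": [6,  {"number": 2}],
        "VBD": [10, {"tenspect": 1}]
    }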

spacy/morphology.pyx (new file, 81 lines)

@@ -0,0 +1,81 @@
+from os import path
+import json
+
+from .lemmatizer import Lemmatizer
+
+
+UNIV_TAGS = {
+    'NULL': NO_TAG,
+    'ADJ': ADJ,
+    'ADV': ADV,
+    'ADP': ADP,
+    'CONJ': CONJ,
+    'DET': DET,
+    'NOUN': NOUN,
+    'NUM': NUM,
+    'PRON': PRON,
+    'PRT': PRT,
+    'VERB': VERB,
+    'X': X,
+    '.': PUNCT,
+    'EOL': EOL
+}
+
+
+cdef class Morphologizer:
+    """Given a POS tag and a Lexeme, find its lemma and morphological analysis.
+    """
+    def __init__(self, StringStore strings, data_dir):
+        self.mem = Pool()
+        self.strings = strings
+        cfg = json.load(open(path.join(data_dir, 'pos', 'config.json')))
+        tag_map = cfg['tag_map']
+        tag_names = cfg['tag_names']
+        self.lemmatizer = Lemmatizer(path.join(data_dir, '..', 'wordnet'))
+        self._lemmas = PreshMapArray(N_UNIV_TAGS)
+        self._morph = PreshMapArray(len(tag_names))
+        self.tags = <PosTag*>self.mem.alloc(len(tag_names), sizeof(PosTag))
+        for i, tag in enumerate(tag_names):
+            pos, props = tag_map[tag]
+            self.tags[i].id = i
+            self.tags[i].pos = pos
+            self.tags[i].morph.number = props.get('number', 0)
+            self.tags[i].morph.tenspect = props.get('tenspect', 0)
+            self.tags[i].morph.mood = props.get('mood', 0)
+            self.tags[i].morph.gender = props.get('gender', 0)
+            self.tags[i].morph.person = props.get('person', 0)
+            self.tags[i].morph.case = props.get('case', 0)
+            self.tags[i].morph.misc = props.get('misc', 0)
+
+    cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1:
+        if self.lemmatizer is None:
+            return lex.sic
+        if pos != NOUN and pos != VERB and pos != ADJ:
+            return lex.sic
+        cdef int lemma = <int><size_t>self._lemmas.get(pos, lex.sic)
+        if lemma != 0:
+            return lemma
+        cdef bytes py_string = self.strings[lex.sic]
+        cdef set lemma_strings
+        cdef bytes lemma_string
+        if pos == NOUN:
+            lemma_strings = self.lemmatizer.noun(py_string)
+        elif pos == VERB:
+            lemma_strings = self.lemmatizer.verb(py_string)
+        else:
+            assert pos == ADJ
+            lemma_strings = self.lemmatizer.adj(py_string)
+        lemma_string = sorted(lemma_strings)[0]
+        lemma = self.strings.intern(lemma_string, len(lemma_string)).i
+        self._lemmas.set(pos, lex.sic, <void*>lemma)
+        return lemma
+
+    cdef int set_morph(self, const int i, TokenC* tokens) except -1:
+        cdef const PosTag* tag = &self.tags[tokens[i].pos]
+        tokens[i].lemma = self.lemmatize(tag.pos, tokens[i].lex)
+        morph = <Morphology*>self._morph.get(tag.id, tokens[i].lemma)
+        if morph is NULL:
+            self._morph.set(tag.id, tokens[i].lemma, <void*>&tag.morph)
+            tokens[i].morph = tag.morph
+        else:
+            tokens[i].morph = morph[0]
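
For readers newer to Cython, the lemma cache above can be restated in plain
Python: lemmas are memoized per (universal POS, orth id), and only NOUN, VERB
and ADJ are sent to the WordNet lemmatizer; every other tag keeps its surface
form. A dict-based paraphrase (attribute names mirror the Cython; PreshMapArray
becomes an ordinary dict, so this is a sketch, not the implementation):

    def lemmatize(self, pos, orth):
        if pos not in (NOUN, VERB, ADJ):
            return orth                         # no lemma for non-content tags
        key = (pos, orth)
        if key in self._lemmas:                 # cache hit
            return self._lemmas[key]
        string = self.strings[orth]
        analyse = {NOUN: self.lemmatizer.noun,
                   VERB: self.lemmatizer.verb,
                   ADJ: self.lemmatizer.adj}[pos]
        lemma_string = sorted(analyse(string))[0]   # alphabetically-first candidate
        lemma = self.strings.intern(lemma_string, len(lemma_string)).i
        self._lemmas[key] = lemma
        return lemma

The sorted(...)[0] step makes the lemma deterministic when WordNet returns
several candidates, and caching the interned id means each (POS, word) pair
hits the lemmatizer at most once.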

spacy/tagger.pxd

@@ -12,31 +12,6 @@ from .typedefs cimport hash_t
 from .tokens cimport Tokens, Morphology

-
-# Google universal tag set
-cdef enum univ_tag_t:
-    NO_TAG
-    ADJ
-    ADV
-    ADP
-    CONJ
-    DET
-    NOUN
-    NUM
-    PRON
-    PRT
-    VERB
-    X
-    PUNCT
-    EOL
-    N_UNIV_TAGS
-
-
-cdef struct PosTag:
-    Morphology morph
-    int id
-    univ_tag_t pos
-

 cdef class Tagger:
     cdef class_t predict(self, const atom_t* context, object golds=*) except *
@@ -45,5 +20,4 @@ cdef class Tagger:
     cpdef readonly LinearModel model
     cpdef readonly list tag_names
-    cdef PosTag* tags
     cdef dict tagdict

spacy/tagger.pyx

@@ -34,23 +34,10 @@ cdef class Tagger:
         self.mem = Pool()
         cfg = json.load(open(path.join(model_dir, 'config.json')))
         templates = cfg['templates']
-        tag_map = cfg['tag_map']
         univ_counts = {}
         cdef unicode tag
         cdef unicode univ_tag
         self.tag_names = cfg['tag_names']
-        self.tags = <PosTag*>self.mem.alloc(len(self.tag_names), sizeof(PosTag))
-        for i, tag in enumerate(self.tag_names):
-            pos, props = tag_map[tag]
-            self.tags[i].id = i
-            self.tags[i].pos = pos
-            self.tags[i].morph.number = props.get('number', 0)
-            self.tags[i].morph.tenspect = props.get('tenspect', 0)
-            self.tags[i].morph.mood = props.get('mood', 0)
-            self.tags[i].morph.gender = props.get('gender', 0)
-            self.tags[i].morph.person = props.get('person', 0)
-            self.tags[i].morph.case = props.get('case', 0)
-            self.tags[i].morph.misc = props.get('misc', 0)
         self.tagdict = _make_tag_dict(cfg['tag_counts'])
         self.extractor = Extractor(templates)
         self.model = LinearModel(len(self.tag_names), self.extractor.n_templ+2)
@@ -85,23 +72,6 @@ cdef class Tagger:
         return tag_id

-
-UNIV_TAGS = {
-    'NULL': NO_TAG,
-    'ADJ': ADJ,
-    'ADV': ADV,
-    'ADP': ADP,
-    'CONJ': CONJ,
-    'DET': DET,
-    'NOUN': NOUN,
-    'NUM': NUM,
-    'PRON': PRON,
-    'PRT': PRT,
-    'VERB': VERB,
-    'X': X,
-    '.': PUNCT,
-    'EOL': EOL
-}
-

 def _make_tag_dict(counts):
     freq_thresh = 50