* Refactor morphology.pyx

This commit is contained in:
Matthew Honnibal 2014-12-20 07:27:28 +11:00
parent 4c6ce7ee84
commit 4e30195c6d
2 changed files with 14 additions and 40 deletions

View File

@ -1,36 +1,9 @@
from .tokens cimport TokenC
from .lexeme cimport Lexeme
from .utf8string cimport StringStore
from .typedefs cimport id_t, Morphology
from preshed.maps cimport PreshMapArray
from cymem.cymem cimport Pool from cymem.cymem cimport Pool
from preshed.maps cimport PreshMapArray
from .structs cimport TokenC, Lexeme, Morphology, PosTag
# Google universal tag set from .strings cimport StringStore
cpdef enum univ_tag_t: from .typedefs cimport id_t, univ_tag_t
NO_TAG
ADJ
ADV
ADP
CONJ
DET
NOUN
NUM
PRON
PRT
VERB
X
PUNCT
EOL
N_UNIV_TAGS
cdef struct PosTag:
Morphology morph
int id
univ_tag_t pos
cdef class Morphologizer: cdef class Morphologizer:

View File

@ -4,7 +4,9 @@ from os import path
import json import json
from .lemmatizer import Lemmatizer from .lemmatizer import Lemmatizer
from .typedefs cimport id_t from .typedefs cimport id_t, univ_tag_t
from .typedefs cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT
from .typedefs cimport VERB, X, PUNCT, EOL
from . import util from . import util
@ -34,13 +36,12 @@ cdef struct _Cached:
cdef class Morphologizer: cdef class Morphologizer:
"""Given a POS tag and a Lexeme, find its lemma and morphological analysis. """Given a POS tag and a Lexeme, find its lemma and morphological analysis.
""" """
def __init__(self, StringStore strings, data_dir): def __init__(self, StringStore strings, object lemmatizer, **kwargs):
self.mem = Pool() self.mem = Pool()
self.strings = strings self.strings = strings
cfg = json.load(open(path.join(data_dir, 'config.json'))) tag_map = kwargs['tag_map']
tag_map = cfg['tag_map'] self.tag_names = kwargs['tag_names']
self.tag_names = cfg['tag_names'] self.lemmatizer = lemmatizer
self.lemmatizer = Lemmatizer(path.join(util.DATA_DIR, 'wordnet'))
self._cache = PreshMapArray(len(self.tag_names)) self._cache = PreshMapArray(len(self.tag_names))
self.tags = <PosTag*>self.mem.alloc(len(self.tag_names), sizeof(PosTag)) self.tags = <PosTag*>self.mem.alloc(len(self.tag_names), sizeof(PosTag))
for i, tag in enumerate(self.tag_names): for i, tag in enumerate(self.tag_names):
@ -54,9 +55,9 @@ cdef class Morphologizer:
self.tags[i].morph.person = props.get('person', 0) self.tags[i].morph.person = props.get('person', 0)
self.tags[i].morph.case = props.get('case', 0) self.tags[i].morph.case = props.get('case', 0)
self.tags[i].morph.misc = props.get('misc', 0) self.tags[i].morph.misc = props.get('misc', 0)
if path.exists(path.join(data_dir, 'morphs.json')): #if path.exists(path.join(data_dir, 'morphs.json')):
with open(path.join(data_dir, 'morphs.json')) as file_: # with open(path.join(data_dir, 'morphs.json')) as file_:
self.load_exceptions(json.load(file_)) # self.load_exceptions(json.load(file_))
cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1: cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1:
if self.lemmatizer is None: if self.lemmatizer is None: