* More work on language-independent parsing

This commit is contained in:
Matthew Honnibal 2015-08-28 03:44:54 +02:00
parent c2307fa9ee
commit 534e3dda3c
5 changed files with 18 additions and 22 deletions

View File

@ -6,7 +6,6 @@ except ImportError:
import json
from .tokenizer import Tokenizer
from .morphology import Morphology
from .vocab import Vocab
from .syntax.parser import Parser
from .tagger import Tagger
@ -132,16 +131,12 @@ class Language(object):
def default_data_dir(cls):
return path.join(path.dirname(__file__), 'data')
@classmethod
def default_morphology(cls, data_dir):
return Morphology.from_dir(data_dir)
@classmethod
def default_vectors(cls, data_dir):
return None
@classmethod
def default_vocab(cls, data_dir=None, get_lex_attr=None, vectors=None, morphology=None):
def default_vocab(cls, data_dir=None, get_lex_attr=None, vectors=None):
if data_dir is None:
data_dir = cls.default_data_dir()
if vectors is None:

View File

@ -22,7 +22,7 @@ cdef struct MorphAnalysisC:
cdef class Morphology:
cdef readonly Pool mem
cdef readonly object strings
cdef readonly StringStore strings
cdef public object lemmatizer
cdef public object n_tags
cdef public object reverse_index

View File

@ -11,20 +11,15 @@ from .parts_of_speech cimport ADJ, VERB, NOUN
cdef class Morphology:
@classmethod
def from_dir(cls, data_dir, lemmatizer=None):
tag_map = json.load(open(path.join(data_dir, 'tag_map.json')))
if lemmatizer is None:
lemmatizer = Lemmatizer.from_dir(data_dir)
return cls(tag_map, {}, lemmatizer)
def __init__(self, string_store, tag_map, lemmatizer):
def __init__(self, StringStore string_store, tag_map, lemmatizer):
self.mem = Pool()
self.strings = string_store
self.lemmatizer = lemmatizer
self.n_tags = len(tag_map)
self.n_tags = len(tag_map) + 1
self.tag_names = tuple(sorted(tag_map.keys()))
self.reverse_index = {}
self.rich_tags = <RichTagC*>self.mem.alloc(self.n_tags, sizeof(RichTagC))
for i, (tag_str, props) in enumerate(sorted(tag_map.items())):
self.rich_tags[i].id = i
self.rich_tags[i].name = self.strings[tag_str]
@ -33,12 +28,16 @@ cdef class Morphology:
self._cache = PreshMapArray(self.n_tags)
cdef int assign_tag(self, TokenC* token, tag) except -1:
cdef int tag_id = self.strings[tag] if isinstance(tag, basestring) else tag
cdef int tag_id
if isinstance(tag, basestring):
tag_id = self.reverse_index[self.strings[tag]]
else:
tag_id = tag
analysis = <MorphAnalysisC*>self._cache.get(tag_id, token.lex.orth)
if analysis is NULL:
analysis = <MorphAnalysisC*>self.mem.alloc(1, sizeof(MorphAnalysisC))
analysis.tag = self.rich_tags[tag_id]
analysis.lemma = self.lemmatize(tag, token.lex.orth)
analysis.lemma = self.lemmatize(analysis.tag.pos, token.lex.orth)
token.lemma = analysis.lemma
token.pos = analysis.tag.pos
token.tag = analysis.tag.name

View File

@ -104,7 +104,7 @@ cdef class Tagger:
@classmethod
def blank(cls, vocab, templates):
model = Model(vocab.n_tags, templates, model_loc=None)
model = Model(vocab.morphology.n_tags, templates, model_loc=None)
return cls(vocab, model)
@classmethod
@ -113,7 +113,7 @@ cdef class Tagger:
templates = json.loads(open(path.join(data_dir, 'templates.json')))
else:
templates = cls.default_templates()
model = Model(vocab.n_tags, templates, data_dir)
model = Model(vocab.morphology.n_tags, templates, data_dir)
return cls(vocab, model)
def __init__(self, Vocab vocab, model):
@ -128,7 +128,7 @@ cdef class Tagger:
@property
def tag_names(self):
return self.vocab.tag_names
return self.vocab.morphology.tag_names
def __call__(self, Doc tokens):
"""Apply the tagger, setting the POS tags onto the Doc object.

View File

@ -49,13 +49,15 @@ cdef class Vocab:
self._serializer = None
@classmethod
def from_dir(cls, data_dir, get_lex_attr=None, morphology=None, vectors=None):
def from_dir(cls, data_dir, get_lex_attr=None, vectors=None):
if not path.exists(data_dir):
raise IOError("Directory %s not found -- cannot load Vocab." % data_dir)
if not path.isdir(data_dir):
raise IOError("Path %s is a file, not a dir -- cannot load Vocab." % data_dir)
tag_map = json.load(open(path.join(data_dir, 'tag_map.json')))
cdef Vocab self = cls(get_lex_attr=get_lex_attr, vectors=vectors, tag_map=tag_map)
self.load_lexemes(path.join(data_dir, 'strings.txt'), path.join(data_dir, 'lexemes.bin'))
if vectors is None and path.exists(path.join(data_dir, 'vec.bin')):
self.repvec_length = self.load_rep_vectors(path.join(data_dir, 'vec.bin'))