* More work on language-independent parsing

This commit is contained in:
Matthew Honnibal 2015-08-28 03:44:54 +02:00
parent c2307fa9ee
commit 534e3dda3c
5 changed files with 18 additions and 22 deletions

View File

@ -6,7 +6,6 @@ except ImportError:
import json import json
from .tokenizer import Tokenizer from .tokenizer import Tokenizer
from .morphology import Morphology
from .vocab import Vocab from .vocab import Vocab
from .syntax.parser import Parser from .syntax.parser import Parser
from .tagger import Tagger from .tagger import Tagger
@ -132,16 +131,12 @@ class Language(object):
def default_data_dir(cls): def default_data_dir(cls):
return path.join(path.dirname(__file__), 'data') return path.join(path.dirname(__file__), 'data')
@classmethod
def default_morphology(cls, data_dir):
return Morphology.from_dir(data_dir)
@classmethod @classmethod
def default_vectors(cls, data_dir): def default_vectors(cls, data_dir):
return None return None
@classmethod @classmethod
def default_vocab(cls, data_dir=None, get_lex_attr=None, vectors=None, morphology=None): def default_vocab(cls, data_dir=None, get_lex_attr=None, vectors=None):
if data_dir is None: if data_dir is None:
data_dir = cls.default_data_dir() data_dir = cls.default_data_dir()
if vectors is None: if vectors is None:

View File

@ -22,7 +22,7 @@ cdef struct MorphAnalysisC:
cdef class Morphology: cdef class Morphology:
cdef readonly Pool mem cdef readonly Pool mem
cdef readonly object strings cdef readonly StringStore strings
cdef public object lemmatizer cdef public object lemmatizer
cdef public object n_tags cdef public object n_tags
cdef public object reverse_index cdef public object reverse_index

View File

@ -11,20 +11,15 @@ from .parts_of_speech cimport ADJ, VERB, NOUN
cdef class Morphology: cdef class Morphology:
@classmethod def __init__(self, StringStore string_store, tag_map, lemmatizer):
def from_dir(cls, data_dir, lemmatizer=None):
tag_map = json.load(open(path.join(data_dir, 'tag_map.json')))
if lemmatizer is None:
lemmatizer = Lemmatizer.from_dir(data_dir)
return cls(tag_map, {}, lemmatizer)
def __init__(self, string_store, tag_map, lemmatizer):
self.mem = Pool() self.mem = Pool()
self.strings = string_store self.strings = string_store
self.lemmatizer = lemmatizer self.lemmatizer = lemmatizer
self.n_tags = len(tag_map) self.n_tags = len(tag_map) + 1
self.tag_names = tuple(sorted(tag_map.keys())) self.tag_names = tuple(sorted(tag_map.keys()))
self.reverse_index = {} self.reverse_index = {}
self.rich_tags = <RichTagC*>self.mem.alloc(self.n_tags, sizeof(RichTagC))
for i, (tag_str, props) in enumerate(sorted(tag_map.items())): for i, (tag_str, props) in enumerate(sorted(tag_map.items())):
self.rich_tags[i].id = i self.rich_tags[i].id = i
self.rich_tags[i].name = self.strings[tag_str] self.rich_tags[i].name = self.strings[tag_str]
@ -33,12 +28,16 @@ cdef class Morphology:
self._cache = PreshMapArray(self.n_tags) self._cache = PreshMapArray(self.n_tags)
cdef int assign_tag(self, TokenC* token, tag) except -1: cdef int assign_tag(self, TokenC* token, tag) except -1:
cdef int tag_id = self.strings[tag] if isinstance(tag, basestring) else tag cdef int tag_id
if isinstance(tag, basestring):
tag_id = self.reverse_index[self.strings[tag]]
else:
tag_id = tag
analysis = <MorphAnalysisC*>self._cache.get(tag_id, token.lex.orth) analysis = <MorphAnalysisC*>self._cache.get(tag_id, token.lex.orth)
if analysis is NULL: if analysis is NULL:
analysis = <MorphAnalysisC*>self.mem.alloc(1, sizeof(MorphAnalysisC)) analysis = <MorphAnalysisC*>self.mem.alloc(1, sizeof(MorphAnalysisC))
analysis.tag = self.rich_tags[tag_id] analysis.tag = self.rich_tags[tag_id]
analysis.lemma = self.lemmatize(tag, token.lex.orth) analysis.lemma = self.lemmatize(analysis.tag.pos, token.lex.orth)
token.lemma = analysis.lemma token.lemma = analysis.lemma
token.pos = analysis.tag.pos token.pos = analysis.tag.pos
token.tag = analysis.tag.name token.tag = analysis.tag.name

View File

@ -104,7 +104,7 @@ cdef class Tagger:
@classmethod @classmethod
def blank(cls, vocab, templates): def blank(cls, vocab, templates):
model = Model(vocab.n_tags, templates, model_loc=None) model = Model(vocab.morphology.n_tags, templates, model_loc=None)
return cls(vocab, model) return cls(vocab, model)
@classmethod @classmethod
@ -113,7 +113,7 @@ cdef class Tagger:
templates = json.loads(open(path.join(data_dir, 'templates.json'))) templates = json.loads(open(path.join(data_dir, 'templates.json')))
else: else:
templates = cls.default_templates() templates = cls.default_templates()
model = Model(vocab.n_tags, templates, data_dir) model = Model(vocab.morphology.n_tags, templates, data_dir)
return cls(vocab, model) return cls(vocab, model)
def __init__(self, Vocab vocab, model): def __init__(self, Vocab vocab, model):
@ -128,7 +128,7 @@ cdef class Tagger:
@property @property
def tag_names(self): def tag_names(self):
return self.vocab.tag_names return self.vocab.morphology.tag_names
def __call__(self, Doc tokens): def __call__(self, Doc tokens):
"""Apply the tagger, setting the POS tags onto the Doc object. """Apply the tagger, setting the POS tags onto the Doc object.

View File

@ -49,13 +49,15 @@ cdef class Vocab:
self._serializer = None self._serializer = None
@classmethod @classmethod
def from_dir(cls, data_dir, get_lex_attr=None, morphology=None, vectors=None): def from_dir(cls, data_dir, get_lex_attr=None, vectors=None):
if not path.exists(data_dir): if not path.exists(data_dir):
raise IOError("Directory %s not found -- cannot load Vocab." % data_dir) raise IOError("Directory %s not found -- cannot load Vocab." % data_dir)
if not path.isdir(data_dir): if not path.isdir(data_dir):
raise IOError("Path %s is a file, not a dir -- cannot load Vocab." % data_dir) raise IOError("Path %s is a file, not a dir -- cannot load Vocab." % data_dir)
tag_map = json.load(open(path.join(data_dir, 'tag_map.json'))) tag_map = json.load(open(path.join(data_dir, 'tag_map.json')))
cdef Vocab self = cls(get_lex_attr=get_lex_attr, vectors=vectors, tag_map=tag_map) cdef Vocab self = cls(get_lex_attr=get_lex_attr, vectors=vectors, tag_map=tag_map)
self.load_lexemes(path.join(data_dir, 'strings.txt'), path.join(data_dir, 'lexemes.bin')) self.load_lexemes(path.join(data_dir, 'strings.txt'), path.join(data_dir, 'lexemes.bin'))
if vectors is None and path.exists(path.join(data_dir, 'vec.bin')): if vectors is None and path.exists(path.join(data_dir, 'vec.bin')):
self.repvec_length = self.load_rep_vectors(path.join(data_dir, 'vec.bin')) self.repvec_length = self.load_rep_vectors(path.join(data_dir, 'vec.bin'))