* More work on language-generic parsing

Matthew Honnibal 2015-08-28 02:02:33 +02:00
parent 86c4a8e3e2
commit c2307fa9ee
12 changed files with 129 additions and 222 deletions

spacy/fi/__init__.py (new file)

@@ -0,0 +1,11 @@
+from __future__ import unicode_literals, print_function
+
+from os import path
+
+from ..language import Language
+
+
+class Finnish(Language):
+    @classmethod
+    def default_data_dir(cls):
+        return path.join(path.dirname(__file__), 'data')
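Because default_data_dir resolves against the module's own __file__, this three-line override makes spacy/fi/__init__.py point at spacy/fi/data rather than the base class's directory: that is the whole language-generic pattern. A sketch of how a caller might select a language class by code, assuming a hypothetical registry (the commit itself only adds the class):

    from spacy.language import Language
    from spacy.fi import Finnish

    # Hypothetical lookup table, not part of this commit.
    LANGUAGES = {'fi': Finnish}

    def get_language(code):
        # Fall back to the generic base class for unknown codes.
        return LANGUAGES.get(code, Language)

    print(get_language('fi').default_data_dir())  # .../spacy/fi/data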

spacy/language.py

@@ -148,13 +148,10 @@ class Language(object):
         vectors = cls.default_vectors(data_dir)
         if get_lex_attr is None:
             get_lex_attr = cls.default_lex_attrs(data_dir)
-        if morphology is None:
-            morphology = cls.default_morphology(path.join(data_dir, 'vocab'))
         return Vocab.from_dir(
             path.join(data_dir, 'vocab'),
             get_lex_attr=get_lex_attr,
-            vectors=vectors,
-            morphology=morphology)
+            vectors=vectors)
 
     @classmethod
     def default_tokenizer(cls, vocab, data_dir):

spacy/morphology.pxd

@@ -1,18 +1,41 @@
+from cymem.cymem cimport Pool
+from preshed.maps cimport PreshMapArray
+from libc.stdint cimport uint64_t
+
 from .structs cimport TokenC
 from .strings cimport StringStore
+from .typedefs cimport attr_t
+from .parts_of_speech cimport univ_pos_t
+
+
+cdef struct RichTagC:
+    uint64_t morph
+    int id
+    univ_pos_t pos
+    attr_t name
+
+
+cdef struct MorphAnalysisC:
+    RichTagC tag
+    attr_t lemma
+
 
 cdef class Morphology:
+    cdef readonly Pool mem
     cdef readonly object strings
     cdef public object lemmatizer
-    cdef public object tag_map
+    cdef public object n_tags
+    cdef public object reverse_index
     cdef public object tag_names
-    cdef public object tag_ids
-    cdef public int n_tags
 
-    cdef int assign_tag(self, StringStore strings, TokenC* token, int tag) except -1
+    cdef RichTagC* rich_tags
+    cdef PreshMapArray _cache
+
+    cdef int assign_tag(self, TokenC* token, tag) except -1
+    cdef int assign_feature(self, uint64_t* morph, feature, value) except -1
+    cdef int assign_from_dict(self, TokenC* token, props) except -1
 
 #
 #cpdef enum Feature_t:
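Read as data: there is one RichTagC per entry in the tag map (coarse POS, string-store id of the tag name, and a 64-bit feature set), and a MorphAnalysisC pairs such a tag with a lemma, cached per (tag, word form). The same layout in plain Python, purely as an illustration of the declarations above:

    class RichTag(object):
        def __init__(self, id_, name, pos=0, morph=0):
            self.id = id_        # index of the tag in the sorted tag map
            self.name = name     # string-store id of the tag string
            self.pos = pos       # coarse universal part-of-speech
            self.morph = morph   # 64-bit morphological feature set

    class MorphAnalysis(object):
        def __init__(self, tag, lemma):
            self.tag = tag       # the RichTag the analysis came from
            self.lemma = lemma   # string-store id of the lemma

    # The PreshMapArray cache declared above acts like:
    cache = {}  # (tag id, orth id) -> MorphAnalysis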

spacy/morphology.pyx

@@ -6,13 +6,8 @@ try:
 except ImportError:
     import json
 
-from spacy.parts_of_speech import UNIV_POS_NAMES
-
-
-cdef struct MorphAnalysisC:
-    uint64_t[4] features
-    attr_t lemma
-    attr_t pos
+from .parts_of_speech import UNIV_POS_NAMES
+from .parts_of_speech cimport ADJ, VERB, NOUN
 
 
 cdef class Morphology:
@@ -23,32 +18,37 @@ cdef class Morphology:
             lemmatizer = Lemmatizer.from_dir(data_dir)
         return cls(tag_map, {}, lemmatizer)
 
-    def __init__(self, tag_map, fused_tokens, lemmatizer):
+    def __init__(self, string_store, tag_map, lemmatizer):
+        self.mem = Pool()
+        self.strings = string_store
         self.lemmatizer = lemmatizer
-        self.tag_map = tag_map
         self.n_tags = len(tag_map)
         self.tag_names = tuple(sorted(tag_map.keys()))
-        self.tag_ids = {}
-        for i, tag_str in enumerate(self.tag_names):
-            self.tag_ids[tag_str] = i
-        self._cache = PreshMapArray()
+        self.reverse_index = {}
+        for i, (tag_str, props) in enumerate(sorted(tag_map.items())):
+            self.rich_tags[i].id = i
+            self.rich_tags[i].name = self.strings[tag_str]
+            self.rich_tags[i].morph = 0
+            self.reverse_index[self.rich_tags[i].name] = i
+        self._cache = PreshMapArray(self.n_tags)
 
     cdef int assign_tag(self, TokenC* token, tag) except -1:
-        analysis = <MorphAnalysisC*>self._cache.get(tag, token.lex.orth)
+        cdef int tag_id = self.strings[tag] if isinstance(tag, basestring) else tag
+        analysis = <MorphAnalysisC*>self._cache.get(tag_id, token.lex.orth)
         if analysis is NULL:
             analysis = <MorphAnalysisC*>self.mem.alloc(1, sizeof(MorphAnalysisC))
-            cached = self.decode_tag(tag)
-            cached.lemma = self.lemmatize(token.pos, token.lex)
+            analysis.tag = self.rich_tags[tag_id]
+            analysis.lemma = self.lemmatize(tag, token.lex.orth)
         token.lemma = analysis.lemma
-        token.pos = analysis.pos
-        token.tag = analysis.tag
-        token.morph = analysis.features
+        token.pos = analysis.tag.pos
+        token.tag = analysis.tag.name
+        token.morph = analysis.tag.morph
 
-    cdef int assign_feature(self, TokenC* token, feature, value) except -1:
+    cdef int assign_feature(self, uint64_t* morph, feature, value) except -1:
         pass
 
     def load_morph_exceptions(self, dict exc):
-        # Map (form, pos) to (lemma, inflection)
+        # Map (form, pos) to (lemma, rich tag)
         cdef unicode pos_str
         cdef unicode form_str
         cdef unicode lemma_str
@@ -57,121 +57,30 @@ cdef class Morphology:
         cdef int lemma
         cdef attr_t orth
         cdef int pos
-        for pos_str, entries in exc.items():
-            pos = self.tag_names.index(pos_str)
+        for tag_str, entries in exc.items():
+            tag = self.strings[tag_str]
+            rich_tag = self.rich_tags[self.reverse_index[tag]]
             for form_str, props in entries.items():
-                lemma_str = props.get('L', form_str)
-                orth = self.strings[form_str]
                 cached = <MorphAnalysisC*>self.mem.alloc(1, sizeof(MorphAnalysisC))
-                cached.lemma = self.strings[lemma_str]
-                self.set_features(cached, props)
-                self._cache.set(pos, orth, <void*>cached)
+                orth = self.strings[form_str]
+                for name_str, value_str in props.items():
+                    if name_str == 'L':
+                        cached.lemma = self.strings[value_str]
+                    else:
+                        self.assign_feature(&cached.tag.morph, name_str, value_str)
+                if cached.lemma == 0:
+                    cached.lemma = self.lemmatize(rich_tag.pos, orth)
+                self._cache.set(rich_tag.pos, orth, <void*>cached)
 
-    def _load_special_tokenization(self, special_cases):
-        '''Add a special-case tokenization rule.
-        '''
-        cdef int i
-        cdef list substrings
-        cdef unicode chunk
-        cdef unicode form
-        cdef unicode lemma
-        cdef dict props
-        cdef LexemeC** lexemes
-        cdef hash_t hashed
-        for chunk, substrings in sorted(special_cases.items()):
-            tokens = <TokenC*>self.mem.alloc(len(substrings) + 1, sizeof(TokenC))
-            for i, props in enumerate(substrings):
-                # Set the special tokens up to have morphology and lemmas if
-                # specified, otherwise use the part-of-speech tag (if specified)
-                form = props['F']
-                tokens[i].lex = <LexemeC*>self.vocab.get(self.vocab.mem, form)
-                morphology = self.vocab.morphology.decode_dict(props)
-                tokens[i].lemma = morph_analysis.lemma
-                tokens[i].pos = morph_analysis.pos
-                tokens[i].tag = morph_analysis.tag
-                tokens[i].morph = morph_analysis.morph
-            cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
-            cached.length = len(substrings)
-            cached.is_lex = False
-            cached.data.tokens = tokens
-            hashed = hash_string(chunk)
-            self._specials.set(hashed, cached)
-            self._cache.set(hashed, cached)
+    def lemmatize(self, const univ_pos_t pos, attr_t orth):
+        if self.lemmatizer is None:
+            return orth
+        cdef unicode py_string = self.strings[orth]
+        if pos != NOUN and pos != VERB and pos != ADJ:
+            return orth
+        cdef set lemma_strings
+        cdef unicode lemma_string
+        lemma_strings = self.lemmatizer(py_string, pos)
+        lemma_string = sorted(lemma_strings)[0]
+        lemma = self.strings[lemma_string]
+        return lemma
 
-
-#cdef int set_morph_from_dict(Morphology* morph, dict props) except -1:
-#    morph.number = props.get('number', 0)
-#    morph.tenspect = props.get('tenspect', 0)
-#    morph.mood = props.get('mood', 0)
-#    morph.gender = props.get('gender', 0)
-#    morph.person = props.get('person', 0)
-#    morph.case = props.get('case', 0)
-#    morph.misc = props.get('misc', 0)
-#
-#
-#cdef class Morphology:
-#    cdef Pool mem
-#    cdef PreshMap table
-#
-#    def __init__(self, tags, exceptions):
-#        pass
-#
-#    def __getitem__(self, hash_t id_):
-#        pass
-#
-#    cdef const InflectionC* get(self, hash_t key) except NULL:
-#        pass
-#
-#    cdef MorphAnalysis analyse(const TokenC* token) except -1:
-#        cdef struct MorphAnalysis morphology
-#        tokens[i].pos = tag.pos
-#        cached = <_CachedMorph*>self._morph_cache.get(tag.id, tokens[i].lex.orth)
-#        if cached is NULL:
-#            cached = <_CachedMorph*>self.mem.alloc(1, sizeof(_CachedMorph))
-#            cached.lemma = self.lemmatize(tag.pos, tokens[i].lex)
-#            cached.morph = tag.morph
-#            self._morph_cache.set(tag.id, tokens[i].lex.orth, <void*>cached)
-#        tokens[i].lemma = cached.lemma
-#        tokens[i].morph = cached.morph
-#
-#    cdef int lemmatize(self, const univ_pos_t pos, const LexemeC* lex) except -1:
-#        if self.lemmatizer is None:
-#            return lex.orth
-#        cdef unicode py_string = self.strings[lex.orth]
-#        if pos != NOUN and pos != VERB and pos != ADJ:
-#            return lex.orth
-#        cdef set lemma_strings
-#        cdef unicode lemma_string
-#        lemma_strings = self.lemmatizer(py_string, pos)
-#        lemma_string = sorted(lemma_strings)[0]
-#        lemma = self.strings[lemma_string]
-#        return lemma
-#
-#
-#cdef class Inflection:
-#    cdef InflectionC* c
-#
-#    def __init__(self, container, id_):
-#        self.c = container[id_]
-#        self.container = container
-#
-#        for i, feat_id in enumerate(feat_ids):
-#            feature, value = parse_id(feat_id)
-#            self.add_value(feature, value, True)
-#
-#    def has(self, Value_t feat_value_id):
-#        part = feat_value_id % 64
-#        bit = feat_value_id / 64
-#        if self.value_set[part] & bit:
-#            return True
-#        else:
-#            return False
-#
-#    property pos: def __get__(self): return self.c.pos
-#
-#    property id: def __get__(self): return self.c.id
-#
-#    property features:
-#        pass
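The rewritten assign_tag memoises one analysis per (tag, word form): the first token with a given pair pays for the lemmatiser call, every later one is a cache hit. A pure-Python model of that control flow, with a dict standing in for the PreshMapArray and dicts for the structs:

    def assign_tag(cache, rich_tags, lemmatize, token, tag_id):
        key = (tag_id, token['orth'])
        analysis = cache.get(key)
        if analysis is None:
            tag = rich_tags[tag_id]
            analysis = {'tag': tag, 'lemma': lemmatize(tag['pos'], token['orth'])}
            cache[key] = analysis
        # Copy the cached analysis onto the token, as the Cython code does.
        token['lemma'] = analysis['lemma']
        token['pos'] = analysis['tag']['pos']
        token['tag'] = analysis['tag']['name']
        token['morph'] = analysis['tag']['morph']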

spacy/structs.pxd

@@ -25,17 +25,6 @@ cdef struct LexemeC:
     float sentiment
     float l2_norm
 
-cdef struct MorphFeatC:
-    int name
-    int value
-
-
-cdef struct MorphologyC:
-    uint64_t[4] feature_set
-    MorphFeatC* features
-    univ_pos_t pos
-    int n
-
 
 cdef struct Entity:
     int start
@@ -54,8 +43,8 @@ cdef struct Constituent:
 
 cdef struct TokenC:
     const LexemeC* lex
-    const MorphologyC* morph
     const Constituent* ctnt
+    uint64_t morph
     univ_pos_t pos
     bint spacy
     int tag
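The struct change replaces a pointer to a heap-allocated MorphologyC with a single inline uint64_t per token, so a feature set is now a bit-set carried by value. How bits map to features is left open by this commit (assign_feature is still a stub); a sketch under the assumption of one bit per (feature, value) pair:

    # Hypothetical bit allocation -- the commit does not fix one yet.
    FEATURE_BITS = {('Number', 'sing'): 0, ('Tense', 'past'): 1}

    def with_feature(morph, feature, value):
        # Set the bit for this (feature, value) pair in the 64-bit set.
        return morph | (1 << FEATURE_BITS[(feature, value)])

    morph = 0
    morph = with_feature(morph, 'Number', 'sing')
    assert morph & (1 << FEATURE_BITS[('Number', 'sing')])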

spacy/tagger.pyx

@@ -104,7 +104,7 @@ cdef class Tagger:
 
     @classmethod
     def blank(cls, vocab, templates):
-        model = Model(vocab.morphology.n_tags, templates, model_loc=None)
+        model = Model(vocab.n_tags, templates, model_loc=None)
         return cls(vocab, model)
 
     @classmethod
@@ -113,7 +113,7 @@ cdef class Tagger:
             templates = json.loads(open(path.join(data_dir, 'templates.json')))
         else:
             templates = cls.default_templates()
-        model = Model(vocab.morphology.n_tags, templates, data_dir)
+        model = Model(vocab.n_tags, templates, data_dir)
         return cls(vocab, model)
 
     def __init__(self, Vocab vocab, model):
@@ -128,7 +128,7 @@ cdef class Tagger:
 
     @property
     def tag_names(self):
-        return self.vocab.morphology.tag_names
+        return self.vocab.tag_names
 
     def __call__(self, Doc tokens):
         """Apply the tagger, setting the POS tags onto the Doc object.
@@ -143,14 +143,15 @@ cdef class Tagger:
         for i in range(tokens.length):
             if tokens.data[i].pos == 0:
                 guess = self.predict(i, tokens.data)
-                self.vocab.morphology.assign_tag(self.vocab.strings, &tokens.data[i], guess)
+                self.vocab.morphology.assign_tag(&tokens.data[i], guess)
         tokens.is_tagged = True
         tokens._py_tokens = [None] * tokens.length
 
     def tag_from_strings(self, Doc tokens, object tag_strs):
         cdef int i
         for i in range(tokens.length):
-            self.vocab.morphology.assign_tag(self.vocab.strings, &tokens.data[i], tag_strs[i])
+            self.vocab.morphology.assign_tag(&tokens.data[i], tag_strs[i])
         tokens.is_tagged = True
         tokens._py_tokens = [None] * tokens.length
@@ -168,7 +169,9 @@ cdef class Tagger:
         for i in range(tokens.length):
             guess = self.update(i, tokens.data, golds[i])
             loss = golds[i] != -1 and guess != golds[i]
-            self.vocab.morphology.assign_tag(self.vocab.strings, &tokens.data[i], guess)
+
+            self.vocab.morphology.assign_tag(&tokens.data[i], guess)
+
             correct += loss == 0
             self.freqs[TAG][tokens.data[i].tag] += 1
         return correct
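All of these call sites can drop the StringStore argument because Morphology now holds its own reference, and assign_tag accepts either the model's integer tag id or a tag string (as in tag_from_strings), normalising with the isinstance(tag, basestring) check seen in morphology.pyx. The same normalisation in present-day Python, for reference:

    def to_tag_id(strings, tag):
        # strings maps tag names to ids, like the StringStore does.
        if isinstance(tag, str):  # the diff uses Python 2's basestring
            return strings[tag]
        return tag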

spacy/tokenizer.pxd

@@ -7,12 +7,7 @@ from .typedefs cimport hash_t
 from .structs cimport LexemeC, TokenC
 from .strings cimport StringStore
 from .tokens.doc cimport Doc
-from .vocab cimport Vocab, _Cached
+from .vocab cimport Vocab, LexemesOrTokens, _Cached
 
-
-cdef union LexemesOrTokens:
-    const LexemeC* const* lexemes
-    TokenC* tokens
 
 cdef class Tokenizer:

spacy/tokenizer.pyx

@@ -192,9 +192,7 @@ cdef class Tokenizer:
                 tokens.push_back(prefixes[0][i], False)
         if string:
             cache_hit = self._try_cache(hash_string(string), tokens)
-            if cache_hit:
-                pass
-            else:
+            if not cache_hit:
                 match = self.find_infix(string)
                 if match is None:
                     tokens.push_back(self.vocab.get(tokens.mem, string), False)
@@ -253,38 +251,10 @@ cdef class Tokenizer:
         cdef LexemeC** lexemes
         cdef hash_t hashed
         for chunk, substrings in sorted(special_cases.items()):
-            tokens = <TokenC*>self.mem.alloc(len(substrings) + 1, sizeof(TokenC))
-            for i, props in enumerate(substrings):
-                form = props['F']
-                tokens[i].lex = <LexemeC*>self.vocab.get(self.vocab.mem, form)
-                lemma = props.get('L', form)
-                tokens[i].lemma = self.vocab.strings[lemma]
-                #TODO
-                #self.vocab.morphology.assign_from_dict(&tokens[i], props)
             cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
             cached.length = len(substrings)
             cached.is_lex = False
-            cached.data.tokens = tokens
-            hashed = hash_string(chunk)
-            self._specials.set(hashed, cached)
-            self._cache.set(hashed, cached)
-            #if lemma is not None:
-            #    tokens[i].lemma = self.vocab.strings[lemma]
-            #else:
-            #    tokens[i].lemma = 0
-            #if 'pos' in props:
-            #    inflection = self.vocab.morphology.get(props['pos'])
-            #    inflection.assign(&tokens[i])
-            #    # These are defaults, which can be over-ridden by the
-            #    # token-specific props.
-            #    #pos, morph_features = self.vocab.morphology.tag_map[props['pos']]
-            #    #tokens[i].pos = pos
-            #    ## These are defaults, which can be over-ridden by the
-            #    ## token-specific props.
-            #    #set_morph_from_dict(&tokens[i].morph, morph_features)
-            #    #if tokens[i].lemma == 0:
-            #    #    tokens[i].lemma = tokens[i].lex.orth
-            ##set_morph_from_dict(&tokens[i].morph, props)
+            cached.data.tokens = self.vocab.make_fused_token(substrings)
+            key = hash_string(chunk)
+            self._specials.set(key, cached)
+            self._cache.set(key, cached)
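The special cases that now flow through Vocab.make_fused_token are lists of per-token property dicts keyed by 'F' (form), optional 'L' (lemma), 'pos', and 'morph', as read by the vocab.pyx code below. An illustrative rule in that shape (the entry is an example, not data from this commit):

    special_cases = {
        "isn't": [
            {'F': 'is'},
            {'F': "n't", 'L': 'not'},
        ],
    }

    for chunk, substrings in sorted(special_cases.items()):
        print(chunk, '->', [props['F'] for props in substrings])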

spacy/tokens/doc.pxd

@@ -12,11 +12,11 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil
 
 ctypedef const LexemeC* const_Lexeme_ptr
-ctypedef TokenC* TokenC_ptr
+ctypedef const TokenC* const_TokenC_ptr
 
 ctypedef fused LexemeOrToken:
     const_Lexeme_ptr
-    TokenC_ptr
+    const_TokenC_ptr
 
 cdef class Doc:

spacy/tokens/doc.pyx

@@ -209,7 +209,7 @@ cdef class Doc:
         if self.length == self.max_length:
             self._realloc(self.length * 2)
         cdef TokenC* t = &self.data[self.length]
-        if LexemeOrToken is TokenC_ptr:
+        if LexemeOrToken is const_TokenC_ptr:
             t[0] = lex_or_tok[0]
         else:
             t.lex = lex_or_tok
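push_back dispatches at compile time on the fused LexemeOrToken type: a token pointer has its whole struct copied, a lexeme pointer only fills the lex field. The rename to const_TokenC_ptr is what lets the first branch accept the read-only tokens Vocab.make_fused_token returns. The same dispatch sketched in plain Python, with dicts standing in for the structs:

    def push_back(doc, lex_or_tok):
        t = {'lex': None, 'lemma': 0, 'pos': 0, 'tag': 0, 'morph': 0}
        if isinstance(lex_or_tok, dict):   # const_TokenC_ptr branch
            t.update(lex_or_tok)           # t[0] = lex_or_tok[0]
        else:                              # const_Lexeme_ptr branch
            t['lex'] = lex_or_tok          # t.lex = lex_or_tok
        doc.append(t)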

spacy/vocab.pxd

@@ -15,7 +15,7 @@ cdef LexemeC EMPTY_LEXEME
 
 cdef union LexemesOrTokens:
     const LexemeC* const* lexemes
-    TokenC* tokens
+    const TokenC* tokens
 
 cdef struct _Cached:
@@ -37,6 +37,7 @@ cdef class Vocab:
     cdef const LexemeC* get(self, Pool mem, unicode string) except NULL
     cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL
+    cdef const TokenC* make_fused_token(self, substrings) except NULL
 
     cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL
     cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1

spacy/vocab.pyx

@@ -17,6 +17,7 @@ from .strings cimport hash_string
 from .orth cimport word_shape
 from .typedefs cimport attr_t
 from .cfile cimport CFile
+from .lemmatizer import Lemmatizer
 
 from cymem.cymem cimport Address
 from . import util
@@ -36,20 +37,13 @@ EMPTY_LEXEME.repvec = EMPTY_VEC
 cdef class Vocab:
     '''A map container for a language's LexemeC structs.
     '''
-    @classmethod
-    def default_morphology(cls):
-        return Morphology({'VBZ': ['VERB', {}]}, [], None)
-
-    def __init__(self, get_lex_attr=None, morphology=None, vectors=None):
-        self.get_lex_attr = get_lex_attr
-        if morphology is None:
-            morphology = self.default_morphology()
-        self.morphology = morphology
+    def __init__(self, get_lex_attr=None, tag_map=None, vectors=None):
         self.mem = Pool()
         self._by_hash = PreshMap()
         self._by_orth = PreshMap()
         self.strings = StringStore()
+        self.get_lex_attr = get_lex_attr
+        self.morphology = Morphology(self.strings, tag_map, Lemmatizer({}, {}, {}))
 
         self.length = 1
         self._serializer = None
@@ -60,10 +54,9 @@ cdef class Vocab:
             raise IOError("Directory %s not found -- cannot load Vocab." % data_dir)
         if not path.isdir(data_dir):
             raise IOError("Path %s is a file, not a dir -- cannot load Vocab." % data_dir)
-        cdef Vocab self = cls(get_lex_attr=get_lex_attr, vectors=vectors,
-                              morphology=morphology)
-        self.load_lexemes(path.join(data_dir, 'strings.txt'),
-                          path.join(data_dir, 'lexemes.bin'))
+        tag_map = json.load(open(path.join(data_dir, 'tag_map.json')))
+        cdef Vocab self = cls(get_lex_attr=get_lex_attr, vectors=vectors, tag_map=tag_map)
+        self.load_lexemes(path.join(data_dir, 'strings.txt'), path.join(data_dir, 'lexemes.bin'))
         if vectors is None and path.exists(path.join(data_dir, 'vec.bin')):
             self.repvec_length = self.load_rep_vectors(path.join(data_dir, 'vec.bin'))
         return self
@@ -172,6 +165,22 @@ cdef class Vocab:
             orth = id_or_string
         return Lexeme(self, orth)
 
+    cdef const TokenC* make_fused_token(self, substrings) except NULL:
+        cdef int i
+        tokens = <TokenC*>self.mem.alloc(len(substrings) + 1, sizeof(TokenC))
+        for i, props in enumerate(substrings):
+            token = &tokens[i]
+            # Set the special tokens up to have morphology and lemmas if
+            # specified, otherwise use the part-of-speech tag (if specified)
+            token.lex = <LexemeC*>self.get(self.mem, props['F'])
+            if 'pos' in props:
+                self.morphology.assign_tag(token, props['pos'])
+            if 'L' in props:
+                tokens[i].lemma = self.strings[props['L']]
+            for feature, value in props.get('morph', {}).items():
+                self.morphology.assign_feature(&token.morph, feature, value)
+        return tokens
+
     def dump(self, loc):
         if path.exists(loc):
             assert not path.isdir(loc)