* Begin merge of Gazetteer and DE branches

This commit is contained in:
Matthew Honnibal 2015-09-06 19:45:15 +02:00
parent dbf8dce109
commit d2fc104a26
7 changed files with 74 additions and 57 deletions

View File

@ -4,6 +4,7 @@ from .attrs cimport ID, ORTH, LOWER, NORM, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTE
from .structs cimport LexemeC from .structs cimport LexemeC
from .strings cimport StringStore from .strings cimport StringStore
from .vocab cimport Vocab
from numpy cimport ndarray from numpy cimport ndarray
@ -15,21 +16,31 @@ cdef class Lexeme:
cdef readonly Vocab vocab cdef readonly Vocab vocab
cdef readonly attr_t orth cdef readonly attr_t orth
cdef int set_struct_props(Vocab vocab, LexemeC* lex, dict props) except -1: @staticmethod
lex.length = props['length'] cdef inline Lexeme from_ptr(LexemeC* lex, Vocab vocab, int vector_length):
lex.orth = vocab.strings[props['orth']] cdef Lexeme self = Lexeme.__new__(Lexeme, vocab, lex.orth)
lex.lower = vocab.strings[props['lower']] self.c = lex
lex.norm = vocab.strings[props['norm']] self.vocab = vocab
lex.shape = vocab.strings[props['shape']] self.orth = lex.orth
lex.prefix = vocab.strings[props['prefix']]
lex.suffix = vocab.strings[props['suffix']]
lex.cluster = props['cluster'] @staticmethod
lex.prob = props['prob'] cdef inline void set_struct_attr(LexemeC* lex, attr_id_t name, attr_t value) nogil:
lex.sentiment = props['sentiment'] if name < (sizeof(flags_t) * 8):
Lexeme.set_flag(lex, name, value)
lex.flags = props['flags'] elif name == ID:
lex.repvec = empty_vec lex.id = value
elif name == LOWER:
lex.lower = value
elif name == NORM:
lex.norm = value
elif name == SHAPE:
lex.shape = value
elif name == PREFIX:
lex.prefix = value
elif name == SUFFIX:
lex.suffix = value
elif name == CLUSTER:
lex.cluster = value
@staticmethod @staticmethod
cdef inline attr_t get_struct_attr(const LexemeC* lex, attr_id_t feat_name) nogil: cdef inline attr_t get_struct_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
@ -56,5 +67,14 @@ cdef class Lexeme:
else: else:
return 0 return 0
@staticmethod
cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil: cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil:
return lexeme.flags & (1 << flag_id) return lexeme.flags & (1 << flag_id)
@staticmethod
cdef inline bint set_flag(LexemeC* lexeme, attr_id_t flag_id, bint value) nogil:
cdef flags_t one = 1
if value:
lexeme.flags |= one << flag_id
else:
lexeme.flags &= ~(one << flag_id)

View File

@ -26,11 +26,7 @@ cdef class Lexeme:
def __init__(self, Vocab vocab, int orth): def __init__(self, Vocab vocab, int orth):
self.vocab = vocab self.vocab = vocab
self.orth = orth self.orth = orth
self.c = <LexemeC*><void*>vocab.get_by_orth(orth) self.c = <LexemeC*><void*>vocab.get_by_orth(vocab.mem, orth)
property orth:
def __get__(self):
return self.c.orth
property lower: property lower:
def __get__(self): return self.c.lower def __get__(self): return self.c.lower
@ -78,44 +74,44 @@ cdef class Lexeme:
property is_oov: property is_oov:
def __get__(self): return Lexeme.check_flag(self.c, IS_OOV) def __get__(self): return Lexeme.check_flag(self.c, IS_OOV)
def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_OOV, x) def __set__(self, bint x): Lexeme.set_flag(self.c, IS_OOV, x)
property is_alpha: property is_alpha:
def __get__(self): return Lexeme.check_flag(self.c, IS_ALPHA) def __get__(self): return Lexeme.check_flag(self.c, IS_ALPHA)
def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_ALPHA, x) def __set__(self, bint x): Lexeme.set_flag(self.c, IS_ALPHA, x)
property is_ascii: property is_ascii:
def __get__(self): return Lexeme.check_flag(self.c, IS_ASCII) def __get__(self): return Lexeme.check_flag(self.c, IS_ASCII)
def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_ASCII, x) def __set__(self, bint x): Lexeme.set_flag(self.c, IS_ASCII, x)
property is_digit: property is_digit:
def __get__(self): return Lexeme.check_flag(self.c, IS_DIGIT) def __get__(self): return Lexeme.check_flag(self.c, IS_DIGIT)
def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_DIGIT, x) def __set__(self, bint x): Lexeme.set_flag(self.c, IS_DIGIT, x)
property is_lower: property is_lower:
def __get__(self): return Lexeme.check_flag(self.c, IS_LOWER) def __get__(self): return Lexeme.check_flag(self.c, IS_LOWER)
def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_LOWER, x) def __set__(self, bint x): Lexeme.set_flag(self.c, IS_LOWER, x)
property is_title: property is_title:
def __get__(self): return Lexeme.check_flag(self.c, IS_TITLE) def __get__(self): return Lexeme.check_flag(self.c, IS_TITLE)
def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_TITLE, x) def __set__(self, bint x): Lexeme.set_flag(self.c, IS_TITLE, x)
property is_punct: property is_punct:
def __get__(self): return Lexeme.check_flag(self.c, IS_PUNCT) def __get__(self): return Lexeme.check_flag(self.c, IS_PUNCT)
def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_PUNCT, x) def __set__(self, bint x): Lexeme.set_flag(self.c, IS_PUNCT, x)
property is_space: property is_space:
def __get__(self): return Lexeme.check_flag(self.c, IS_SPACE) def __get__(self): return Lexeme.check_flag(self.c, IS_SPACE)
def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_SPACE, x) def __set__(self, bint x): Lexeme.set_flag(self.c, IS_SPACE, x)
property like_url: property like_url:
def __get__(self): return Lexeme.check_flag(self.c, LIKE_URL) def __get__(self): return Lexeme.check_flag(self.c, LIKE_URL)
def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, LIKE_URL, x) def __set__(self, bint x): Lexeme.set_flag(self.c, LIKE_URL, x)
property like_num: property like_num:
def __get__(self): return Lexeme.like_num(self.c, IKE_NUM) def __get__(self): return Lexeme.check_flag(self.c, LIKE_NUM)
def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, LIKE_NUM, x) def __set__(self, bint x): Lexeme.set_flag(self.c, LIKE_NUM, x)
property like_email: property like_email:
def __get__(self): return Lexeme.check_flag(self.c, LIKE_EMAIL) def __get__(self): return Lexeme.check_flag(self.c, LIKE_EMAIL)
def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, LIKE_EMAIL, x) def __set__(self, bint x): Lexeme.set_flag(self.c, LIKE_EMAIL, x)

View File

@ -102,21 +102,22 @@ cdef class Matcher:
cdef readonly int n_patterns cdef readonly int n_patterns
def __init__(self, vocab, patterns): def __init__(self, vocab, patterns):
self.vocab = vocab
self.mem = Pool() self.mem = Pool()
for entity_key, (etype, attrs, specs) in sorted(patterns.items()): for entity_key, (etype, attrs, specs) in sorted(patterns.items()):
self.add(entity_key, etype, attrs, specs) self.add(entity_key, etype, attrs, specs)
def add(self, entity_key, etype, attrs, specs): def add(self, entity_key, etype, attrs, specs):
if isinstance(entity_key, basestring): if isinstance(entity_key, basestring):
entity_key = vocab.strings[entity_key] entity_key = self.vocab.strings[entity_key]
if isinstance(etype, basestring): if isinstance(etype, basestring):
etype = vocab.strings[etype] etype = self.vocab.strings[etype]
elif etype is None: elif etype is None:
etype = -1 etype = -1
# TODO: Do something more clever about multiple patterns for single # TODO: Do something more clever about multiple patterns for single
# entity # entity
for spec in specs: for spec in specs:
spec = _convert_strings(spec, vocab.strings) spec = _convert_strings(spec, self.vocab.strings)
self.patterns.push_back(init_pattern(self.mem, spec, etype)) self.patterns.push_back(init_pattern(self.mem, spec, etype))
@classmethod @classmethod

View File

@ -5,6 +5,7 @@ from libc.stdint cimport uint32_t
import numpy import numpy
import struct import struct
from ..lexeme cimport Lexeme
from ..lexeme cimport EMPTY_LEXEME from ..lexeme cimport EMPTY_LEXEME
from ..typedefs cimport attr_t, flags_t from ..typedefs cimport attr_t, flags_t
from ..attrs cimport attr_id_t from ..attrs cimport attr_id_t
@ -13,8 +14,6 @@ from ..attrs cimport POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE
from ..parts_of_speech import UNIV_POS_NAMES from ..parts_of_speech import UNIV_POS_NAMES
from ..parts_of_speech cimport CONJ, PUNCT, NOUN from ..parts_of_speech cimport CONJ, PUNCT, NOUN
from ..parts_of_speech cimport univ_pos_t from ..parts_of_speech cimport univ_pos_t
from ..lexeme cimport check_flag
from ..lexeme cimport get_attr as get_lex_attr
from .spans cimport Span from .spans cimport Span
from .token cimport Token from .token cimport Token
from ..serialize.bits cimport BitArray from ..serialize.bits cimport BitArray
@ -48,7 +47,7 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil:
elif feat_name == ENT_TYPE: elif feat_name == ENT_TYPE:
return token.ent_type return token.ent_type
else: else:
return get_lex_attr(token.lex, feat_name) return Lexeme.get_struct_attr(token.lex, feat_name)
cdef class Doc: cdef class Doc:

View File

@ -1,6 +1,5 @@
from libc.string cimport memcpy from libc.string cimport memcpy
from cpython.mem cimport PyMem_Malloc, PyMem_Free from cpython.mem cimport PyMem_Malloc, PyMem_Free
from ..lexeme cimport check_flag
# Compiler crashes on memory view coercion without this. Should report bug. # Compiler crashes on memory view coercion without this. Should report bug.
from cython.view cimport array as cvarray from cython.view cimport array as cvarray
cimport numpy as np cimport numpy as np
@ -9,6 +8,7 @@ np.import_array()
import numpy import numpy
from ..lexeme cimport Lexeme
from ..parts_of_speech import UNIV_POS_NAMES from ..parts_of_speech import UNIV_POS_NAMES
from ..attrs cimport LEMMA from ..attrs cimport LEMMA
@ -42,7 +42,7 @@ cdef class Token:
return self.string return self.string
cpdef bint check_flag(self, attr_id_t flag_id) except -1: cpdef bint check_flag(self, attr_id_t flag_id) except -1:
return check_flag(self.c.lex, flag_id) return Lexeme.check_flag(self.c.lex, flag_id)
def nbor(self, int i=1): def nbor(self, int i=1):
return self.doc[self.i+i] return self.doc[self.i+i]
@ -286,37 +286,37 @@ cdef class Token:
return self.vocab.strings[self.c.dep] return self.vocab.strings[self.c.dep]
property is_oov: property is_oov:
def __get__(self): return check_flag(self.c.lex, IS_OOV) def __get__(self): return Lexeme.check_flag(self.c.lex, IS_OOV)
property is_alpha: property is_alpha:
def __get__(self): return check_flag(self.c.lex, IS_ALPHA) def __get__(self): return Lexeme.check_flag(self.c.lex, IS_ALPHA)
property is_ascii: property is_ascii:
def __get__(self): return check_flag(self.c.lex, IS_ASCII) def __get__(self): return Lexeme.check_flag(self.c.lex, IS_ASCII)
property is_digit: property is_digit:
def __get__(self): return check_flag(self.c.lex, IS_DIGIT) def __get__(self): return Lexeme.check_flag(self.c.lex, IS_DIGIT)
property is_lower: property is_lower:
def __get__(self): return check_flag(self.c.lex, IS_LOWER) def __get__(self): return Lexeme.check_flag(self.c.lex, IS_LOWER)
property is_title: property is_title:
def __get__(self): return check_flag(self.c.lex, IS_TITLE) def __get__(self): return Lexeme.check_flag(self.c.lex, IS_TITLE)
property is_punct: property is_punct:
def __get__(self): return check_flag(self.c.lex, IS_PUNCT) def __get__(self): return Lexeme.check_flag(self.c.lex, IS_PUNCT)
property is_space: property is_space:
def __get__(self): return check_flag(self.c.lex, IS_SPACE) def __get__(self): return Lexeme.check_flag(self.c.lex, IS_SPACE)
property like_url: property like_url:
def __get__(self): return check_flag(self.c.lex, LIKE_URL) def __get__(self): return Lexeme.check_flag(self.c.lex, LIKE_URL)
property like_num: property like_num:
def __get__(self): return check_flag(self.c.lex, LIKE_NUM) def __get__(self): return Lexeme.check_flag(self.c.lex, LIKE_NUM)
property like_email: property like_email:
def __get__(self): return check_flag(self.c.lex, LIKE_EMAIL) def __get__(self): return Lexeme.check_flag(self.c.lex, LIKE_EMAIL)
_pos_id_to_string = {id_: string for string, id_ in UNIV_POS_NAMES.items()} _pos_id_to_string = {id_: string for string, id_ in UNIV_POS_NAMES.items()}

View File

@ -37,6 +37,7 @@ cdef class Vocab:
cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL
cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1 cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1
cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL
cdef PreshMap _by_hash cdef PreshMap _by_hash
cdef PreshMap _by_orth cdef PreshMap _by_orth

View File

@ -12,7 +12,6 @@ import math
import json import json
from .lexeme cimport EMPTY_LEXEME from .lexeme cimport EMPTY_LEXEME
from .lexeme cimport set_lex_struct_props
from .lexeme cimport Lexeme from .lexeme cimport Lexeme
from .strings cimport hash_string from .strings cimport hash_string
from .orth cimport word_shape from .orth cimport word_shape
@ -36,12 +35,13 @@ EMPTY_LEXEME.repvec = EMPTY_VEC
cdef class Vocab: cdef class Vocab:
'''A map container for a language's LexemeC structs. '''A map container for a language's LexemeC structs.
''' '''
def __init__(self, data_dir=None, get_lex_attr=None): def __init__(self, data_dir=None, get_lex_attr=None, load_vectors=False):
self.mem = Pool() self.mem = Pool()
self._by_hash = PreshMap() self._by_hash = PreshMap()
self._by_orth = PreshMap() self._by_orth = PreshMap()
self.strings = StringStore() self.strings = StringStore()
self.pos_tags = pos_tags if pos_tags is not None else {} #self.pos_tags = pos_tags if pos_tags is not None else {}
self.pos_tags = {}
self.get_lex_attr = get_lex_attr self.get_lex_attr = get_lex_attr
self.repvec_length = 0 self.repvec_length = 0
@ -112,7 +112,7 @@ cdef class Vocab:
if is_oov: if is_oov:
lex.id = 0 lex.id = 0
else: else:
self._add_lex_to_vocab(key, lex) self._add_lex_to_vocab(hash_string(string), lex)
assert lex != NULL, string assert lex != NULL, string
return lex return lex
@ -125,7 +125,7 @@ cdef class Vocab:
cdef attr_t orth cdef attr_t orth
cdef size_t addr cdef size_t addr
for orth, addr in self._by_orth.items(): for orth, addr in self._by_orth.items():
yield Lexeme.from_ptr(<LexemeC*>addr, self.strings, self.repvec_length) yield Lexeme.from_ptr(<LexemeC*>addr, self, self.repvec_length)
def __getitem__(self, id_or_string): def __getitem__(self, id_or_string):
'''Retrieve a lexeme, given an int ID or a unicode string. If a previously '''Retrieve a lexeme, given an int ID or a unicode string. If a previously
@ -157,7 +157,7 @@ cdef class Vocab:
raise ValueError("Vocab unable to map type: " raise ValueError("Vocab unable to map type: "
"%s. Maps unicode --> Lexeme or " "%s. Maps unicode --> Lexeme or "
"int --> Lexeme" % str(type(id_or_string))) "int --> Lexeme" % str(type(id_or_string)))
return Lexeme.from_ptr(lexeme, self.strings, self.repvec_length) return Lexeme.from_ptr(<LexemeC*><void*>lexeme, self, self.repvec_length)
def dump(self, loc): def dump(self, loc):
if path.exists(loc): if path.exists(loc):