* Work on language-independent refactoring

This commit is contained in:
Matthew Honnibal 2015-08-23 20:49:18 +02:00
parent 3879d28457
commit 6f1743692a
10 changed files with 88 additions and 54 deletions

View File

@@ -41,6 +41,8 @@ def get_lex_props(string, oov_prob=-30, is_oov=False):
'sentiment': 0 'sentiment': 0
} }
get_lex_attr = {}
if_model_present = -1 if_model_present = -1
LOCAL_DATA_DIR = path.join(path.dirname(__file__), 'data') LOCAL_DATA_DIR = path.join(path.dirname(__file__), 'data')

View File

@@ -4,6 +4,7 @@ from .attrs cimport ID, ORTH, LOWER, NORM, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTE
from .structs cimport LexemeC from .structs cimport LexemeC
from .strings cimport StringStore from .strings cimport StringStore
from .vocab cimport Vocab
from numpy cimport ndarray from numpy cimport ndarray
@@ -15,7 +16,8 @@ cdef class Lexeme:
cdef readonly Vocab vocab cdef readonly Vocab vocab
cdef readonly attr_t orth cdef readonly attr_t orth
cdef int set_struct_props(Vocab vocab, LexemeC* lex, dict props) except -1: @staticmethod
cdef inline int set_struct_props(Vocab vocab, LexemeC* lex, dict props) except -1:
lex.length = props['length'] lex.length = props['length']
lex.orth = vocab.strings[props['orth']] lex.orth = vocab.strings[props['orth']]
lex.lower = vocab.strings[props['lower']] lex.lower = vocab.strings[props['lower']]
@@ -29,7 +31,6 @@ cdef class Lexeme:
lex.sentiment = props['sentiment'] lex.sentiment = props['sentiment']
lex.flags = props['flags'] lex.flags = props['flags']
lex.repvec = empty_vec
@staticmethod @staticmethod
cdef inline attr_t get_struct_attr(const LexemeC* lex, attr_id_t feat_name) nogil: cdef inline attr_t get_struct_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
@@ -55,6 +56,34 @@ cdef class Lexeme:
return lex.cluster return lex.cluster
else: else:
return 0 return 0
@staticmethod
cdef inline void set_struct_attr(LexemeC* lex, attr_id_t name, attr_t value) nogil:
if name < (sizeof(flags_t) * 8):
Lexeme.set_flag(lex, name, value)
elif name == ID:
lex.id = value
elif name == LOWER:
lex.lower = value
elif name == NORM:
lex.norm = value
elif name == SHAPE:
lex.shape = value
elif name == PREFIX:
lex.prefix = value
elif name == SUFFIX:
lex.suffix = value
elif name == CLUSTER:
lex.cluster = value
@staticmethod
cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil: cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil:
return lexeme.flags & (1 << flag_id) return lexeme.flags & (1 << flag_id)
@staticmethod
cdef inline bint set_flag(LexemeC* lex, attr_id_t flag_id, int value) nogil:
cdef flags_t one = 1
if value:
lex.flags |= one << flag_id
else:
lex.flags &= ~(one << flag_id)

View File

@@ -26,12 +26,9 @@ cdef class Lexeme:
def __init__(self, Vocab vocab, int orth): def __init__(self, Vocab vocab, int orth):
self.vocab = vocab self.vocab = vocab
self.orth = orth self.orth = orth
self.c = <LexemeC*><void*>vocab.get_by_orth(orth) self.c = <LexemeC*><void*>vocab.get_by_orth(vocab.mem, orth)
assert self.c.orth == orth
property orth:
def __get__(self):
return self.c.orth
property lower: property lower:
def __get__(self): return self.c.lower def __get__(self): return self.c.lower
def __set__(self, int x): self.c.lower = x def __set__(self, int x): self.c.lower = x
@@ -113,7 +110,7 @@ cdef class Lexeme:
def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, LIKE_URL, x) def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, LIKE_URL, x)
property like_num: property like_num:
def __get__(self): return Lexeme.like_num(self.c, IKE_NUM) def __get__(self): return Lexeme.check_flag(self.c, LIKE_NUM)
def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, LIKE_NUM, x) def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, LIKE_NUM, x)
property like_email: property like_email:

View File

@@ -103,20 +103,21 @@ cdef class Matcher:
def __init__(self, vocab, patterns): def __init__(self, vocab, patterns):
self.mem = Pool() self.mem = Pool()
self.vocab = vocab
for entity_key, (etype, attrs, specs) in sorted(patterns.items()): for entity_key, (etype, attrs, specs) in sorted(patterns.items()):
self.add(entity_key, etype, attrs, specs) self.add(entity_key, etype, attrs, specs)
def add(self, entity_key, etype, attrs, specs): def add(self, entity_key, etype, attrs, specs):
if isinstance(entity_key, basestring): if isinstance(entity_key, basestring):
entity_key = vocab.strings[entity_key] entity_key = self.vocab.strings[entity_key]
if isinstance(etype, basestring): if isinstance(etype, basestring):
etype = vocab.strings[etype] etype = self.vocab.strings[etype]
elif etype is None: elif etype is None:
etype = -1 etype = -1
# TODO: Do something more clever about multiple patterns for single # TODO: Do something more clever about multiple patterns for single
# entity # entity
for spec in specs: for spec in specs:
spec = _convert_strings(spec, vocab.strings) spec = _convert_strings(spec, self.vocab.strings)
self.patterns.push_back(init_pattern(self.mem, spec, etype)) self.patterns.push_back(init_pattern(self.mem, spec, etype))
@classmethod @classmethod

View File

@@ -92,6 +92,7 @@ cpdef bint like_url(unicode string):
return False return False
# TODO: This should live in the language.orth
NUM_WORDS = set('zero one two three four five six seven eight nine ten' NUM_WORDS = set('zero one two three four five six seven eight nine ten'
'eleven twelve thirteen fourteen fifteen sixteen seventeen' 'eleven twelve thirteen fourteen fifteen sixteen seventeen'
'eighteen nineteen twenty thirty forty fifty sixty seventy' 'eighteen nineteen twenty thirty forty fifty sixty seventy'

View File

@@ -142,6 +142,8 @@ cdef class StringStore:
def load(self, loc): def load(self, loc):
with codecs.open(loc, 'r', 'utf8') as file_: with codecs.open(loc, 'r', 'utf8') as file_:
strings = file_.read().split(SEPARATOR) strings = file_.read().split(SEPARATOR)
if strings == ['']:
return None
cdef unicode string cdef unicode string
cdef bytes byte_string cdef bytes byte_string
for string in strings: for string in strings:

View File

@@ -12,8 +12,7 @@ from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUST
from ..attrs cimport POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE from ..attrs cimport POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE
from ..parts_of_speech import UNIV_POS_NAMES from ..parts_of_speech import UNIV_POS_NAMES
from ..parts_of_speech cimport CONJ, PUNCT, NOUN from ..parts_of_speech cimport CONJ, PUNCT, NOUN
from ..lexeme cimport check_flag from ..lexeme cimport Lexeme
from ..lexeme cimport get_attr as get_lex_attr
from .spans cimport Span from .spans cimport Span
from .token cimport Token from .token cimport Token
from ..serialize.bits cimport BitArray from ..serialize.bits cimport BitArray
@@ -47,7 +46,7 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil:
elif feat_name == ENT_TYPE: elif feat_name == ENT_TYPE:
return token.ent_type return token.ent_type
else: else:
return get_lex_attr(token.lex, feat_name) return Lexeme.get_struct_attr(token.lex, feat_name)
cdef class Doc: cdef class Doc:
@@ -218,6 +217,7 @@ cdef class Doc:
t.idx = 0 t.idx = 0
else: else:
t.idx = (t-1).idx + (t-1).lex.length + (t-1).spacy t.idx = (t-1).idx + (t-1).lex.length + (t-1).spacy
assert t.lex.orth != 0
t.spacy = has_space t.spacy = has_space
self.length += 1 self.length += 1
self._py_tokens.append(None) self._py_tokens.append(None)

View File

@@ -1,6 +1,5 @@
from libc.string cimport memcpy from libc.string cimport memcpy
from cpython.mem cimport PyMem_Malloc, PyMem_Free from cpython.mem cimport PyMem_Malloc, PyMem_Free
from ..lexeme cimport check_flag
# Compiler crashes on memory view coercion without this. Should report bug. # Compiler crashes on memory view coercion without this. Should report bug.
from cython.view cimport array as cvarray from cython.view cimport array as cvarray
cimport numpy as np cimport numpy as np
@@ -20,6 +19,8 @@ from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
from ..attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP from ..attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
from ..attrs cimport IS_OOV from ..attrs cimport IS_OOV
from ..lexeme cimport Lexeme
cdef class Token: cdef class Token:
"""An individual token --- i.e. a word, a punctuation symbol, etc. Created """An individual token --- i.e. a word, a punctuation symbol, etc. Created
@@ -42,7 +43,7 @@ cdef class Token:
return self.string return self.string
cpdef bint check_flag(self, attr_id_t flag_id) except -1: cpdef bint check_flag(self, attr_id_t flag_id) except -1:
return check_flag(self.c.lex, flag_id) return Lexeme.check_flag(self.c.lex, flag_id)
def nbor(self, int i=1): def nbor(self, int i=1):
return self.doc[self.i+i] return self.doc[self.i+i]
@@ -286,37 +287,37 @@ cdef class Token:
return self.vocab.strings[self.c.dep] return self.vocab.strings[self.c.dep]
property is_oov: property is_oov:
def __get__(self): return check_flag(self.c.lex, IS_OOV) def __get__(self): return Lexeme.check_flag(self.c.lex, IS_OOV)
property is_alpha: property is_alpha:
def __get__(self): return check_flag(self.c.lex, IS_ALPHA) def __get__(self): return Lexeme.check_flag(self.c.lex, IS_ALPHA)
property is_ascii: property is_ascii:
def __get__(self): return check_flag(self.c.lex, IS_ASCII) def __get__(self): return Lexeme.check_flag(self.c.lex, IS_ASCII)
property is_digit: property is_digit:
def __get__(self): return check_flag(self.c.lex, IS_DIGIT) def __get__(self): return Lexeme.check_flag(self.c.lex, IS_DIGIT)
property is_lower: property is_lower:
def __get__(self): return check_flag(self.c.lex, IS_LOWER) def __get__(self): return Lexeme.check_flag(self.c.lex, IS_LOWER)
property is_title: property is_title:
def __get__(self): return check_flag(self.c.lex, IS_TITLE) def __get__(self): return Lexeme.check_flag(self.c.lex, IS_TITLE)
property is_punct: property is_punct:
def __get__(self): return check_flag(self.c.lex, IS_PUNCT) def __get__(self): return Lexeme.check_flag(self.c.lex, IS_PUNCT)
property is_space: property is_space:
def __get__(self): return check_flag(self.c.lex, IS_SPACE) def __get__(self): return Lexeme.check_flag(self.c.lex, IS_SPACE)
property like_url: property like_url:
def __get__(self): return check_flag(self.c.lex, LIKE_URL) def __get__(self): return Lexeme.check_flag(self.c.lex, LIKE_URL)
property like_num: property like_num:
def __get__(self): return check_flag(self.c.lex, LIKE_NUM) def __get__(self): return Lexeme.check_flag(self.c.lex, LIKE_NUM)
property like_email: property like_email:
def __get__(self): return check_flag(self.c.lex, LIKE_EMAIL) def __get__(self): return Lexeme.check_flag(self.c.lex, LIKE_EMAIL)
_pos_id_to_string = {id_: string for string, id_ in UNIV_POS_NAMES.items()} _pos_id_to_string = {id_: string for string, id_ in UNIV_POS_NAMES.items()}

View File

@@ -27,15 +27,16 @@ cdef class Vocab:
cpdef public lexeme_props_getter cpdef public lexeme_props_getter
cdef Pool mem cdef Pool mem
cpdef readonly StringStore strings cpdef readonly StringStore strings
cdef readonly object pos_tags
cdef readonly int length cdef readonly int length
cdef public object _serializer cdef public object _serializer
cdef public object data_dir cdef public object data_dir
cdef public float oov_prob cdef public object get_lex_attr
cdef public object pos_tags
cdef const LexemeC* get(self, Pool mem, unicode string) except NULL cdef const LexemeC* get(self, Pool mem, unicode string) except NULL
cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL
cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL
cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1 cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1
cdef PreshMap _by_hash cdef PreshMap _by_hash

View File

@@ -12,7 +12,6 @@ import math
import json import json
from .lexeme cimport EMPTY_LEXEME from .lexeme cimport EMPTY_LEXEME
from .lexeme cimport set_lex_struct_props
from .lexeme cimport Lexeme from .lexeme cimport Lexeme
from .strings cimport hash_string from .strings cimport hash_string
from .orth cimport word_shape from .orth cimport word_shape
@@ -36,17 +35,15 @@ EMPTY_LEXEME.repvec = EMPTY_VEC
cdef class Vocab: cdef class Vocab:
'''A map container for a language's LexemeC structs. '''A map container for a language's LexemeC structs.
''' '''
def __init__(self, data_dir=None, get_lex_attr=None): def __init__(self, data_dir=None, get_lex_attr=None, load_vectors=True, pos_tags=None):
self.mem = Pool() self.mem = Pool()
self._by_hash = PreshMap() self._by_hash = PreshMap()
self._by_orth = PreshMap() self._by_orth = PreshMap()
self.strings = StringStore() self.strings = StringStore()
self.pos_tags = pos_tags if pos_tags is not None else {}
self.get_lex_attr = get_lex_attr self.get_lex_attr = get_lex_attr
self.repvec_length = 0 self.repvec_length = 0
self.length = 0 self.length = 1
self._add_lex_to_vocab(0, &EMPTY_LEXEME) self.pos_tags = pos_tags
if data_dir is not None: if data_dir is not None:
if not path.exists(data_dir): if not path.exists(data_dir):
raise IOError("Directory %s not found -- cannot load Vocab." % data_dir) raise IOError("Directory %s not found -- cannot load Vocab." % data_dir)
@@ -84,7 +81,10 @@ cdef class Vocab:
cdef LexemeC* lex cdef LexemeC* lex
cdef hash_t key = hash_string(string) cdef hash_t key = hash_string(string)
lex = <LexemeC*>self._by_hash.get(key) lex = <LexemeC*>self._by_hash.get(key)
cdef size_t addr
if lex != NULL: if lex != NULL:
print string, lex.orth, self.strings[string]
assert lex.orth == self.strings[string]
return lex return lex
else: else:
return self._new_lexeme(mem, string) return self._new_lexeme(mem, string)
@@ -103,15 +103,24 @@ cdef class Vocab:
return self._new_lexeme(mem, self.strings[orth]) return self._new_lexeme(mem, self.strings[orth])
cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL: cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL:
cdef hash_t key
cdef bint is_oov = mem is not self.mem cdef bint is_oov = mem is not self.mem
if len(string) < 3: mem = self.mem
mem = self.mem #if len(string) < 3:
# mem = self.mem
lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1) lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1)
for attr, func in self.lex_attr_getters.items(): lex.orth = self.strings[string]
Lexeme.set_struct_attr(lex, attr, func(string)) lex.id = self.length
if self.get_lex_attr is not None:
for attr, func in self.get_lex_attr.items():
value = func(string)
if isinstance(value, unicode):
value = self.strings[value]
Lexeme.set_struct_attr(lex, attr, value)
if is_oov: if is_oov:
lex.id = 0 lex.id = 0
else: else:
key = hash_string(string)
self._add_lex_to_vocab(key, lex) self._add_lex_to_vocab(key, lex)
assert lex != NULL, string assert lex != NULL, string
return lex return lex
@@ -119,13 +128,14 @@ cdef class Vocab:
cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1: cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1:
self._by_hash.set(key, <void*>lex) self._by_hash.set(key, <void*>lex)
self._by_orth.set(lex.orth, <void*>lex) self._by_orth.set(lex.orth, <void*>lex)
print "Add lex", key, lex.orth, self.strings[lex.orth]
self.length += 1 self.length += 1
def __iter__(self): def __iter__(self):
cdef attr_t orth cdef attr_t orth
cdef size_t addr cdef size_t addr
for orth, addr in self._by_orth.items(): for orth, addr in self._by_orth.items():
yield Lexeme.from_ptr(<LexemeC*>addr, self.strings, self.repvec_length) yield Lexeme(self, orth)
def __getitem__(self, id_or_string): def __getitem__(self, id_or_string):
'''Retrieve a lexeme, given an int ID or a unicode string. If a previously '''Retrieve a lexeme, given an int ID or a unicode string. If a previously
@@ -142,22 +152,12 @@ cdef class Vocab:
An instance of the Lexeme Python class, with data copied on An instance of the Lexeme Python class, with data copied on
instantiation. instantiation.
''' '''
cdef const LexemeC* lexeme
cdef attr_t orth cdef attr_t orth
if type(id_or_string) == int: if type(id_or_string) == unicode:
orth = id_or_string orth = self.strings[id_or_string]
lexeme = <LexemeC*>self._by_orth.get(orth)
if lexeme == NULL:
raise KeyError(id_or_string)
assert lexeme.orth == orth, ('%d vs %d' % (lexeme.orth, orth))
elif type(id_or_string) == unicode:
lexeme = self.get(self.mem, id_or_string)
assert lexeme.orth == self.strings[id_or_string]
else: else:
raise ValueError("Vocab unable to map type: " orth = id_or_string
"%s. Maps unicode --> Lexeme or " return Lexeme(self, orth)
"int --> Lexeme" % str(type(id_or_string)))
return Lexeme.from_ptr(lexeme, self.strings, self.repvec_length)
def dump(self, loc): def dump(self, loc):
if path.exists(loc): if path.exists(loc):