Mirror of https://github.com/explosion/spaCy.git (synced 2024-12-24 00:46:28 +03:00)
* Begin merge of Gazetteer and DE branches

parent dbf8dce109
commit d2fc104a26
@@ -4,6 +4,7 @@ from .attrs cimport ID, ORTH, LOWER, NORM, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTE
from .structs cimport LexemeC
from .strings cimport StringStore
from .vocab cimport Vocab

from numpy cimport ndarray

@@ -15,21 +16,31 @@ cdef class Lexeme:
    cdef readonly Vocab vocab
    cdef readonly attr_t orth

    cdef int set_struct_props(Vocab vocab, LexemeC* lex, dict props) except -1:
        lex.length = props['length']
        lex.orth = vocab.strings[props['orth']]
        lex.lower = vocab.strings[props['lower']]
        lex.norm = vocab.strings[props['norm']]
        lex.shape = vocab.strings[props['shape']]
        lex.prefix = vocab.strings[props['prefix']]
        lex.suffix = vocab.strings[props['suffix']]

        lex.cluster = props['cluster']
        lex.prob = props['prob']
        lex.sentiment = props['sentiment']

        lex.flags = props['flags']
        lex.repvec = empty_vec

    @staticmethod
    cdef inline Lexeme from_ptr(LexemeC* lex, Vocab vocab, int vector_length):
        cdef Lexeme self = Lexeme.__new__(Lexeme, vocab, lex.orth)
        self.c = lex
        self.vocab = vocab
        self.orth = lex.orth

    @staticmethod
    cdef inline void set_struct_attr(LexemeC* lex, attr_id_t name, attr_t value) nogil:
        if name < (sizeof(flags_t) * 8):
            Lexeme.set_flag(lex, name, value)
        elif name == ID:
            lex.id = value
        elif name == LOWER:
            lex.lower = value
        elif name == NORM:
            lex.norm = value
        elif name == SHAPE:
            lex.shape = value
        elif name == PREFIX:
            lex.prefix = value
        elif name == SUFFIX:
            lex.suffix = value
        elif name == CLUSTER:
            lex.cluster = value

    @staticmethod
    cdef inline attr_t get_struct_attr(const LexemeC* lex, attr_id_t feat_name) nogil:

@@ -56,5 +67,14 @@ cdef class Lexeme:
        else:
            return 0

    @staticmethod
    cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil:
        return lexeme.flags & (1 << flag_id)

    @staticmethod
    cdef inline bint set_flag(LexemeC* lexeme, attr_id_t flag_id, bint value) nogil:
        cdef flags_t one = 1
        if value:
            lexeme.flags |= one << flag_id
        else:
            lexeme.flags &= ~(one << flag_id)

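In plain Python terms, the check_flag/set_flag helpers above are single-bit reads and writes against the 64-bit LexemeC.flags field. A minimal pure-Python sketch of the same logic (the flag IDs below are invented for illustration, not spaCy's real values):

# Pure-Python sketch of the bitfield logic used by Lexeme.check_flag/set_flag.
IS_ALPHA, IS_DIGIT, LIKE_URL = 1, 5, 21   # illustrative flag IDs only

def set_flag(flags, flag_id, value):
    one = 1
    if value:
        return flags | (one << flag_id)    # switch the bit on
    return flags & ~(one << flag_id)       # switch the bit off

def check_flag(flags, flag_id):
    return bool(flags & (1 << flag_id))    # read a single bit

flags = 0
flags = set_flag(flags, IS_ALPHA, True)
assert check_flag(flags, IS_ALPHA)
assert not check_flag(flags, IS_DIGIT)
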
@@ -26,12 +26,8 @@ cdef class Lexeme:
    def __init__(self, Vocab vocab, int orth):
        self.vocab = vocab
        self.orth = orth
        self.c = <LexemeC*><void*>vocab.get_by_orth(orth)
        self.c = <LexemeC*><void*>vocab.get_by_orth(vocab.mem, orth)

    property orth:
        def __get__(self):
            return self.c.orth

    property lower:
        def __get__(self): return self.c.lower
        def __set__(self, int x): self.c.lower = x

@@ -78,44 +74,44 @@ cdef class Lexeme:

    property is_oov:
        def __get__(self): return Lexeme.check_flag(self.c, IS_OOV)
        def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_OOV, x)
        def __set__(self, bint x): Lexeme.set_flag(self.c, IS_OOV, x)

    property is_alpha:
        def __get__(self): return Lexeme.check_flag(self.c, IS_ALPHA)
        def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_ALPHA, x)
        def __set__(self, bint x): Lexeme.set_flag(self.c, IS_ALPHA, x)

    property is_ascii:
        def __get__(self): return Lexeme.check_flag(self.c, IS_ASCII)
        def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_ASCII, x)
        def __set__(self, bint x): Lexeme.set_flag(self.c, IS_ASCII, x)

    property is_digit:
        def __get__(self): return Lexeme.check_flag(self.c, IS_DIGIT)
        def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_DIGIT, x)
        def __set__(self, bint x): Lexeme.set_flag(self.c, IS_DIGIT, x)

    property is_lower:
        def __get__(self): return Lexeme.check_flag(self.c, IS_LOWER)
        def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_LOWER, x)
        def __set__(self, bint x): Lexeme.set_flag(self.c, IS_LOWER, x)

    property is_title:
        def __get__(self): return Lexeme.check_flag(self.c, IS_TITLE)
        def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_TITLE, x)
        def __set__(self, bint x): Lexeme.set_flag(self.c, IS_TITLE, x)

    property is_punct:
        def __get__(self): return Lexeme.check_flag(self.c, IS_PUNCT)
        def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_PUNCT, x)
        def __set__(self, bint x): Lexeme.set_flag(self.c, IS_PUNCT, x)

    property is_space:
        def __get__(self): return Lexeme.check_flag(self.c, IS_SPACE)
        def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_SPACE, x)
        def __set__(self, bint x): Lexeme.set_flag(self.c, IS_SPACE, x)

    property like_url:
        def __get__(self): return Lexeme.check_flag(self.c, LIKE_URL)
        def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, LIKE_URL, x)
        def __set__(self, bint x): Lexeme.set_flag(self.c, LIKE_URL, x)

    property like_num:
        def __get__(self): return Lexeme.like_num(self.c, IKE_NUM)
        def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, LIKE_NUM, x)
        def __get__(self): return Lexeme.check_flag(self.c, LIKE_NUM)
        def __set__(self, bint x): Lexeme.set_flag(self.c, LIKE_NUM, x)

    property like_email:
        def __get__(self): return Lexeme.check_flag(self.c, LIKE_EMAIL)
        def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, LIKE_EMAIL, x)
        def __set__(self, bint x): Lexeme.set_flag(self.c, LIKE_EMAIL, x)

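The change running through the properties above is that the flag setters now take a plain truth value (bint) rather than an attribute ID. A hedged usage sketch, assuming the English pipeline of this era and that a Lexeme is fetched through the vocab:

# Usage sketch only; `English` and the vocab lookup are assumptions about
# this era's API, not something shown in the diff.
from spacy.en import English

nlp = English()
lex = nlp.vocab[u'google']
print(lex.like_url)     # flags read back as booleans
lex.like_url = True     # and are now set with a plain bool
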
@@ -102,21 +102,22 @@ cdef class Matcher:
    cdef readonly int n_patterns

    def __init__(self, vocab, patterns):
        self.vocab = vocab
        self.mem = Pool()
        for entity_key, (etype, attrs, specs) in sorted(patterns.items()):
            self.add(entity_key, etype, attrs, specs)

    def add(self, entity_key, etype, attrs, specs):
        if isinstance(entity_key, basestring):
            entity_key = vocab.strings[entity_key]
            entity_key = self.vocab.strings[entity_key]
        if isinstance(etype, basestring):
            etype = vocab.strings[etype]
            etype = self.vocab.strings[etype]
        elif etype is None:
            etype = -1
        # TODO: Do something more clever about multiple patterns for single
        # entity
        for spec in specs:
            spec = _convert_strings(spec, vocab.strings)
            spec = _convert_strings(spec, self.vocab.strings)
            self.patterns.push_back(init_pattern(self.mem, spec, etype))

    @classmethod

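Matcher.__init__ above expects a mapping from entity key to an (etype, attrs, specs) triple, and the fix in add() is to intern strings through self.vocab rather than an undefined module-level vocab. A hedged construction sketch; the per-token spec encoding is not visible in this hunk, so the shape below is only a guess:

# Construction sketch; the token-spec format is assumed, not taken from the diff.
from spacy.matcher import Matcher

patterns = {
    u'GoogleNow': (u'PRODUCT', {}, [[{u'orth': u'Google'}, {u'orth': u'Now'}]]),
}
matcher = Matcher(vocab, patterns)   # `vocab` assumed to be a loaded spacy Vocab
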
@@ -5,6 +5,7 @@ from libc.stdint cimport uint32_t
import numpy
import struct

from ..lexeme cimport Lexeme
from ..lexeme cimport EMPTY_LEXEME
from ..typedefs cimport attr_t, flags_t
from ..attrs cimport attr_id_t

@@ -13,8 +14,6 @@ from ..attrs cimport POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE
from ..parts_of_speech import UNIV_POS_NAMES
from ..parts_of_speech cimport CONJ, PUNCT, NOUN
from ..parts_of_speech cimport univ_pos_t
from ..lexeme cimport check_flag
from ..lexeme cimport get_attr as get_lex_attr
from .spans cimport Span
from .token cimport Token
from ..serialize.bits cimport BitArray

@@ -48,7 +47,7 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil:
    elif feat_name == ENT_TYPE:
        return token.ent_type
    else:
        return get_lex_attr(token.lex, feat_name)
        return Lexeme.get_struct_attr(token.lex, feat_name)


cdef class Doc:

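get_token_attr above is a dispatch: attributes owned by the token are answered from the TokenC struct, and anything else now falls through to the shared lexeme via Lexeme.get_struct_attr instead of the old module-level get_lex_attr. A pure-Python sketch of the same fall-through pattern (the names and dict stand-ins are invented for illustration):

# Illustrative dispatch only; dicts stand in for the TokenC/LexemeC structs.
LEMMA, ENT_TYPE, SHAPE = range(3)

def get_token_attr(token, feat_name):
    if feat_name == LEMMA:
        return token['lemma']
    elif feat_name == ENT_TYPE:
        return token['ent_type']
    else:
        # everything lexical is delegated to the vocabulary-shared entry
        return token['lex'][feat_name]

token = {'lemma': 'be', 'ent_type': 0, 'lex': {SHAPE: 'xx'}}
assert get_token_attr(token, SHAPE) == 'xx'
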
@@ -1,6 +1,5 @@
from libc.string cimport memcpy
from cpython.mem cimport PyMem_Malloc, PyMem_Free
from ..lexeme cimport check_flag
# Compiler crashes on memory view coercion without this. Should report bug.
from cython.view cimport array as cvarray
cimport numpy as np

@@ -9,6 +8,7 @@ np.import_array()
import numpy


from ..lexeme cimport Lexeme
from ..parts_of_speech import UNIV_POS_NAMES

from ..attrs cimport LEMMA

@@ -42,7 +42,7 @@ cdef class Token:
        return self.string

    cpdef bint check_flag(self, attr_id_t flag_id) except -1:
        return check_flag(self.c.lex, flag_id)
        return Lexeme.check_flag(self.c.lex, flag_id)

    def nbor(self, int i=1):
        return self.doc[self.i+i]

@@ -286,37 +286,37 @@ cdef class Token:
        return self.vocab.strings[self.c.dep]

    property is_oov:
        def __get__(self): return check_flag(self.c.lex, IS_OOV)
        def __get__(self): return Lexeme.check_flag(self.c.lex, IS_OOV)

    property is_alpha:
        def __get__(self): return check_flag(self.c.lex, IS_ALPHA)
        def __get__(self): return Lexeme.check_flag(self.c.lex, IS_ALPHA)

    property is_ascii:
        def __get__(self): return check_flag(self.c.lex, IS_ASCII)
        def __get__(self): return Lexeme.check_flag(self.c.lex, IS_ASCII)

    property is_digit:
        def __get__(self): return check_flag(self.c.lex, IS_DIGIT)
        def __get__(self): return Lexeme.check_flag(self.c.lex, IS_DIGIT)

    property is_lower:
        def __get__(self): return check_flag(self.c.lex, IS_LOWER)
        def __get__(self): return Lexeme.check_flag(self.c.lex, IS_LOWER)

    property is_title:
        def __get__(self): return check_flag(self.c.lex, IS_TITLE)
        def __get__(self): return Lexeme.check_flag(self.c.lex, IS_TITLE)

    property is_punct:
        def __get__(self): return check_flag(self.c.lex, IS_PUNCT)
        def __get__(self): return Lexeme.check_flag(self.c.lex, IS_PUNCT)

    property is_space:
        def __get__(self): return check_flag(self.c.lex, IS_SPACE)
        def __get__(self): return Lexeme.check_flag(self.c.lex, IS_SPACE)

    property like_url:
        def __get__(self): return check_flag(self.c.lex, LIKE_URL)
        def __get__(self): return Lexeme.check_flag(self.c.lex, LIKE_URL)

    property like_num:
        def __get__(self): return check_flag(self.c.lex, LIKE_NUM)
        def __get__(self): return Lexeme.check_flag(self.c.lex, LIKE_NUM)

    property like_email:
        def __get__(self): return check_flag(self.c.lex, LIKE_EMAIL)
        def __get__(self): return Lexeme.check_flag(self.c.lex, LIKE_EMAIL)


_pos_id_to_string = {id_: string for string, id_ in UNIV_POS_NAMES.items()}

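On Token, the same flags are exposed read-only and now route through the Lexeme static helper on the underlying lex struct; check_flag() is the explicit form of the same lookup. A hedged usage sketch, assuming the English pipeline of this era and that spacy.attrs exposes the flag IDs at the Python level:

# Usage sketch only; the imports and the tokenization are assumptions.
from spacy.en import English
from spacy.attrs import IS_ALPHA

nlp = English()
doc = nlp(u'Visit example.com now')
print(doc[0].is_alpha)               # property form
print(doc[0].check_flag(IS_ALPHA))   # explicit flag-ID form, same answer
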
@@ -37,6 +37,7 @@ cdef class Vocab:
    cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL

    cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1
    cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL

    cdef PreshMap _by_hash
    cdef PreshMap _by_orth

@@ -12,7 +12,6 @@ import math
import json

from .lexeme cimport EMPTY_LEXEME
from .lexeme cimport set_lex_struct_props
from .lexeme cimport Lexeme
from .strings cimport hash_string
from .orth cimport word_shape

@@ -36,12 +35,13 @@ EMPTY_LEXEME.repvec = EMPTY_VEC
cdef class Vocab:
    '''A map container for a language's LexemeC structs.
    '''
    def __init__(self, data_dir=None, get_lex_attr=None):
    def __init__(self, data_dir=None, get_lex_attr=None, load_vectors=False):
        self.mem = Pool()
        self._by_hash = PreshMap()
        self._by_orth = PreshMap()
        self.strings = StringStore()
        self.pos_tags = pos_tags if pos_tags is not None else {}
        #self.pos_tags = pos_tags if pos_tags is not None else {}
        self.pos_tags = {}

        self.get_lex_attr = get_lex_attr
        self.repvec_length = 0

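The constructor change above adds a load_vectors switch (so repvecs become opt-in) and keeps the get_lex_attr hook, which is assumed here to map attribute IDs to functions of the lexeme's string used when a new LexemeC is filled in. A hedged sketch under that assumption:

# Sketch only; the shape of get_lex_attr is an assumption, not shown in this hunk.
from spacy.vocab import Vocab
from spacy.attrs import IS_ALPHA, LIKE_NUM   # assumed to be importable flag IDs

get_lex_attr = {
    IS_ALPHA: lambda string: string.isalpha(),
    LIKE_NUM: lambda string: string.replace(u',', u'').isdigit(),
}
vocab = Vocab(data_dir=None, get_lex_attr=get_lex_attr, load_vectors=False)
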
@@ -112,7 +112,7 @@ cdef class Vocab:
        if is_oov:
            lex.id = 0
        else:
            self._add_lex_to_vocab(key, lex)
            self._add_lex_to_vocab(hash_string(string), lex)
        assert lex != NULL, string
        return lex

@@ -125,7 +125,7 @@ cdef class Vocab:
        cdef attr_t orth
        cdef size_t addr
        for orth, addr in self._by_orth.items():
            yield Lexeme.from_ptr(<LexemeC*>addr, self.strings, self.repvec_length)
            yield Lexeme.from_ptr(<LexemeC*>addr, self, self.repvec_length)

    def __getitem__(self, id_or_string):
        '''Retrieve a lexeme, given an int ID or a unicode string. If a previously

@@ -157,7 +157,7 @@ cdef class Vocab:
            raise ValueError("Vocab unable to map type: "
                             "%s. Maps unicode --> Lexeme or "
                             "int --> Lexeme" % str(type(id_or_string)))
        return Lexeme.from_ptr(lexeme, self.strings, self.repvec_length)
        return Lexeme.from_ptr(<LexemeC*><void*>lexeme, self, self.repvec_length)

    def dump(self, loc):
        if path.exists(loc):
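Both from_ptr call sites now hand the Lexeme a reference to the Vocab itself rather than just its StringStore, matching the new Lexeme.from_ptr signature in the first hunk. A hedged usage sketch of __getitem__, whose docstring above promises that a string and its integer ID resolve to the same entry:

# Usage sketch only; `vocab` assumed to be a constructed Vocab as above.
lex = vocab[u'hello']       # unicode -> Lexeme
same = vocab[lex.orth]      # integer orth ID -> the same Lexeme
assert same.orth == lex.orth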