mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 18:26:30 +03:00
* Tmp commit. Refactoring to create a Python Lexeme class.
This commit is contained in:
parent
61904e590f
commit
ce2edd6312
|
@ -1,6 +1,7 @@
|
|||
from __future__ import unicode_literals
|
||||
from os import path
|
||||
|
||||
from .. import orth
|
||||
from ..vocab import Vocab
|
||||
from ..tokenizer import Tokenizer
|
||||
from ..syntax.parser import GreedyParser
|
||||
|
@ -10,12 +11,10 @@ from .pos import POS_TAGS
|
|||
from .attrs import get_flags
|
||||
|
||||
|
||||
DATA_DIR = path.join(path.dirname(__file__), 'data')
|
||||
|
||||
|
||||
def get_lex_props(string):
    """Return the property dict used to initialise a lexeme for `string`.

    Args:
        string (unicode): The word whose lexical properties are wanted.

    Returns:
        props (dict): A dict with two entries: 'flags', the value computed
            by `get_flags(string)`, and 'dense', which is always 1.
    """
    props = {}
    # Compute the flags first, then record the constant 'dense' value.
    props['flags'] = get_flags(string)
    props['dense'] = 1
    return props
|
||||
|
||||
LOCAL_DATA_DIR = path.join(path.dirname(__file__), 'data')
|
||||
|
||||
class English(object):
|
||||
"""The English NLP pipeline.
|
||||
|
@ -44,16 +43,18 @@ class English(object):
|
|||
parser (spacy.syntax.parser.GreedyParser):
|
||||
A greedy shift-reduce dependency parser.
|
||||
"""
|
||||
def __init__(self, data_dir=None):
|
||||
if data_dir is None:
|
||||
data_dir = path.join(path.dirname(__file__), 'data')
|
||||
def __init__(self, data_dir=LOCAL_DATA_DIR):
|
||||
self._data_dir = data_dir
|
||||
self.vocab = Vocab(data_dir=path.join(data_dir, 'vocab'),
|
||||
get_lex_props=get_lex_props)
|
||||
tag_names = list(POS_TAGS.keys())
|
||||
tag_names.sort()
|
||||
self.tokenizer = Tokenizer.from_dir(self.vocab, path.join(data_dir, 'tokenizer'),
|
||||
POS_TAGS, tag_names)
|
||||
if data_dir is None:
|
||||
self.tokenizer = Tokenizer(self.vocab, {}, None, None, None,
|
||||
POS_TAGS, tag_names)
|
||||
else:
|
||||
self.tokenizer = Tokenizer.from_dir(self.vocab, path.join(data_dir, 'tokenizer'),
|
||||
POS_TAGS, tag_names)
|
||||
self.strings = self.vocab.strings
|
||||
self._tagger = None
|
||||
self._parser = None
|
||||
|
|
|
@ -4,7 +4,7 @@ import tarfile
|
|||
import shutil
|
||||
import requests
|
||||
|
||||
URL = 'https://s3-us-west-1.amazonaws.com/media.spacynlp.com/en.tgz'
|
||||
PARSER_URL = 'https://s3-us-west-1.amazonaws.com/media.spacynlp.com/en.tgz'
|
||||
|
||||
DEST_DIR = path.join(path.dirname(__file__), 'data', 'deps')
|
||||
|
||||
|
|
|
@ -3,7 +3,7 @@ from cymem.cymem cimport Pool
|
|||
|
||||
from .._ml cimport Model
|
||||
from ..strings cimport StringStore
|
||||
from ..structs cimport TokenC, Lexeme, Morphology, PosTag
|
||||
from ..structs cimport TokenC, LexemeC, Morphology, PosTag
|
||||
from ..typedefs cimport univ_tag_t
|
||||
from .lemmatizer import Lemmatizer
|
||||
|
||||
|
@ -21,5 +21,5 @@ cdef class EnPosTagger:
|
|||
cdef readonly int n_tags
|
||||
|
||||
cdef int set_morph(self, const int i, TokenC* tokens) except -1
|
||||
cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1
|
||||
cdef int lemmatize(self, const univ_tag_t pos, const LexemeC* lex) except -1
|
||||
|
||||
|
|
|
@ -12,7 +12,7 @@ from ..typedefs cimport univ_tag_t
|
|||
from ..typedefs cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT, VERB
|
||||
from ..typedefs cimport X, PUNCT, EOL
|
||||
from ..typedefs cimport id_t
|
||||
from ..structs cimport TokenC, Morphology, Lexeme
|
||||
from ..structs cimport TokenC, Morphology, LexemeC
|
||||
from ..tokens cimport Tokens
|
||||
from ..morphology cimport set_morph_from_dict
|
||||
from .._ml cimport arg_max
|
||||
|
@ -290,7 +290,7 @@ cdef class EnPosTagger:
|
|||
tokens[i].lemma = cached.lemma
|
||||
tokens[i].morph = cached.morph
|
||||
|
||||
cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1:
|
||||
cdef int lemmatize(self, const univ_tag_t pos, const LexemeC* lex) except -1:
|
||||
if self.lemmatizer is None:
|
||||
return lex.sic
|
||||
cdef bytes py_string = self.strings[lex.sic]
|
||||
|
|
|
@ -1,21 +1,21 @@
|
|||
from .typedefs cimport hash_t, flags_t, id_t, len_t, tag_t, attr_t, attr_id_t
|
||||
from .typedefs cimport ID, SIC, DENSE, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER, POS_TYPE
|
||||
from .structs cimport Lexeme
|
||||
from .structs cimport LexemeC
|
||||
from .strings cimport StringStore
|
||||
|
||||
|
||||
cdef Lexeme EMPTY_LEXEME
|
||||
cdef LexemeC EMPTY_LEXEME
|
||||
|
||||
|
||||
cpdef Lexeme init(id_t i, unicode string, hash_t hashed, StringStore store,
|
||||
cdef LexemeC init(id_t i, unicode string, hash_t hashed, StringStore store,
|
||||
dict props) except *
|
||||
|
||||
|
||||
cdef inline bint check_flag(const Lexeme* lexeme, attr_id_t flag_id) nogil:
|
||||
cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil:
|
||||
return lexeme.flags & (1 << flag_id)
|
||||
|
||||
|
||||
cdef inline attr_t get_attr(const Lexeme* lex, attr_id_t feat_name) nogil:
|
||||
cdef inline attr_t get_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
|
||||
if feat_name < (sizeof(flags_t) * 8):
|
||||
return check_flag(lex, feat_name)
|
||||
elif feat_name == ID:
|
||||
|
|
|
@ -7,12 +7,12 @@ from libc.string cimport memset
|
|||
from .orth cimport word_shape
|
||||
|
||||
|
||||
memset(&EMPTY_LEXEME, 0, sizeof(Lexeme))
|
||||
memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
|
||||
|
||||
|
||||
cpdef Lexeme init(id_t i, unicode string, hash_t hashed,
|
||||
cdef LexemeC init(id_t i, unicode string, hash_t hashed,
|
||||
StringStore string_store, dict props) except *:
|
||||
cdef Lexeme lex
|
||||
cdef LexemeC lex
|
||||
lex.id = i
|
||||
lex.length = len(string)
|
||||
lex.sic = string_store[string]
|
||||
|
@ -27,3 +27,6 @@ cpdef Lexeme init(id_t i, unicode string, hash_t hashed,
|
|||
|
||||
lex.flags = props.get('flags', 0)
|
||||
return lex
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
from .structs cimport TokenC, Lexeme, Morphology, PosTag
|
||||
from .structs cimport TokenC, Morphology, PosTag
|
||||
|
||||
|
||||
cdef int set_morph_from_dict(Morphology* morph, dict props) except -1
|
||||
|
|
|
@ -3,6 +3,9 @@ from preshed.maps cimport PreshMap
|
|||
from murmurhash.mrmr cimport hash64
|
||||
|
||||
from .structs cimport Utf8Str, UniStr
|
||||
from .typedefs cimport hash_t
|
||||
|
||||
cpdef hash_t hash_string(unicode string) except 0
|
||||
|
||||
|
||||
cdef inline void slice_unicode(UniStr* s, Py_UNICODE* chars, int start, int end) nogil:
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
import codecs
|
||||
|
||||
from libc.string cimport memcpy
|
||||
from murmurhash.mrmr cimport hash64
|
||||
|
||||
|
||||
from .typedefs cimport hash_t
|
||||
|
@ -9,6 +10,11 @@ from .typedefs cimport hash_t
|
|||
SEPARATOR = '\n|-SEP-|\n'
|
||||
|
||||
|
||||
cpdef hash_t hash_string(unicode string) except 0:
    """Hash a unicode string by passing its Py_UNICODE buffer to `hash64`.

    Args:
        string (unicode): The string to hash.

    Returns:
        key (hash_t): The result of `hash64` over the string's buffer,
            using a seed of 0.
    """
    cdef Py_UNICODE* buf = <Py_UNICODE*>string
    # hash64 takes a length in bytes, so scale the code-unit count by the
    # size of one Py_UNICODE.
    n_bytes = len(string) * sizeof(Py_UNICODE)
    return hash64(buf, n_bytes, 0)
|
||||
|
||||
|
||||
"""
|
||||
cdef class SymbolMap:
|
||||
def __init__(self):
|
||||
|
|
|
@ -3,7 +3,9 @@ from libc.stdint cimport uint8_t, uint32_t
|
|||
from .typedefs cimport flags_t, attr_t, id_t, hash_t, univ_tag_t
|
||||
|
||||
|
||||
cdef struct Lexeme:
|
||||
cdef struct LexemeC:
|
||||
const float* vec
|
||||
|
||||
flags_t flags
|
||||
|
||||
attr_t id
|
||||
|
@ -38,7 +40,7 @@ cdef struct PosTag:
|
|||
|
||||
|
||||
cdef struct TokenC:
|
||||
const Lexeme* lex
|
||||
const LexemeC* lex
|
||||
Morphology morph
|
||||
univ_tag_t pos
|
||||
int fine_pos
|
||||
|
|
|
@ -6,14 +6,14 @@ from preshed.maps cimport PreshMap
|
|||
from cymem.cymem cimport Pool
|
||||
|
||||
from .typedefs cimport hash_t
|
||||
from .structs cimport Lexeme, TokenC, Morphology, UniStr
|
||||
from .structs cimport LexemeC, TokenC, Morphology, UniStr
|
||||
from .strings cimport StringStore
|
||||
from .tokens cimport Tokens
|
||||
from .vocab cimport Vocab, _Cached
|
||||
|
||||
|
||||
cdef union LexemesOrTokens:
|
||||
const Lexeme* const* lexemes
|
||||
const LexemeC* const* lexemes
|
||||
TokenC* tokens
|
||||
|
||||
|
||||
|
@ -33,10 +33,10 @@ cdef class Tokenizer:
|
|||
|
||||
cdef int _try_cache(self, int idx, hash_t key, Tokens tokens) except -1
|
||||
cdef int _tokenize(self, Tokens tokens, UniStr* span, int start, int end) except -1
|
||||
cdef UniStr* _split_affixes(self, UniStr* string, vector[Lexeme*] *prefixes,
|
||||
vector[Lexeme*] *suffixes) except NULL
|
||||
cdef UniStr* _split_affixes(self, UniStr* string, vector[LexemeC*] *prefixes,
|
||||
vector[LexemeC*] *suffixes) except NULL
|
||||
cdef int _attach_tokens(self, Tokens tokens, int idx, UniStr* string,
|
||||
vector[Lexeme*] *prefixes, vector[Lexeme*] *suffixes) except -1
|
||||
vector[LexemeC*] *prefixes, vector[LexemeC*] *suffixes) except -1
|
||||
cdef int _find_prefix(self, Py_UNICODE* characters, size_t length) except -1
|
||||
cdef int _find_suffix(self, Py_UNICODE* characters, size_t length) except -1
|
||||
cdef int _find_infix(self, Py_UNICODE* characters, size_t length) except -1
|
||||
|
|
|
@ -53,7 +53,7 @@ cdef class Tokenizer:
|
|||
cdef int idx = 0
|
||||
for i, py_string in enumerate(strings):
|
||||
slice_unicode(&string_struct, py_string, 0, len(py_string))
|
||||
tokens.push_back(idx, <const Lexeme*>self.vocab.get(tokens.mem, &string_struct))
|
||||
tokens.push_back(idx, <const LexemeC*>self.vocab.get(tokens.mem, &string_struct))
|
||||
idx += len(py_string) + 1
|
||||
return tokens
|
||||
|
||||
|
@ -75,7 +75,7 @@ cdef class Tokenizer:
|
|||
string (unicode): The string to be tokenized.
|
||||
|
||||
Returns:
|
||||
tokens (Tokens): A Tokens object, giving access to a sequence of Lexemes.
|
||||
tokens (Tokens): A Tokens object, giving access to a sequence of LexemeCs.
|
||||
"""
|
||||
cdef int length = len(string)
|
||||
cdef Tokens tokens = Tokens(self.vocab, length)
|
||||
|
@ -121,8 +121,8 @@ cdef class Tokenizer:
|
|||
return True
|
||||
|
||||
cdef int _tokenize(self, Tokens tokens, UniStr* span, int start, int end) except -1:
|
||||
cdef vector[Lexeme*] prefixes
|
||||
cdef vector[Lexeme*] suffixes
|
||||
cdef vector[LexemeC*] prefixes
|
||||
cdef vector[LexemeC*] suffixes
|
||||
cdef hash_t orig_key
|
||||
cdef int orig_size
|
||||
orig_key = span.key
|
||||
|
@ -131,8 +131,8 @@ cdef class Tokenizer:
|
|||
self._attach_tokens(tokens, start, span, &prefixes, &suffixes)
|
||||
self._save_cached(&tokens.data[orig_size], orig_key, tokens.length - orig_size)
|
||||
|
||||
cdef UniStr* _split_affixes(self, UniStr* string, vector[const Lexeme*] *prefixes,
|
||||
vector[const Lexeme*] *suffixes) except NULL:
|
||||
cdef UniStr* _split_affixes(self, UniStr* string, vector[const LexemeC*] *prefixes,
|
||||
vector[const LexemeC*] *suffixes) except NULL:
|
||||
cdef size_t i
|
||||
cdef UniStr prefix
|
||||
cdef UniStr suffix
|
||||
|
@ -174,12 +174,12 @@ cdef class Tokenizer:
|
|||
return string
|
||||
|
||||
cdef int _attach_tokens(self, Tokens tokens, int idx, UniStr* string,
|
||||
vector[const Lexeme*] *prefixes,
|
||||
vector[const Lexeme*] *suffixes) except -1:
|
||||
vector[const LexemeC*] *prefixes,
|
||||
vector[const LexemeC*] *suffixes) except -1:
|
||||
cdef bint cache_hit
|
||||
cdef int split
|
||||
cdef const Lexeme* const* lexemes
|
||||
cdef Lexeme* lexeme
|
||||
cdef const LexemeC* const* lexemes
|
||||
cdef LexemeC* lexeme
|
||||
cdef UniStr span
|
||||
cdef int i
|
||||
if prefixes.size():
|
||||
|
@ -200,7 +200,7 @@ cdef class Tokenizer:
|
|||
idx = tokens.push_back(idx, self.vocab.get(tokens.mem, &span))
|
||||
slice_unicode(&span, string.chars, split + 1, string.n)
|
||||
idx = tokens.push_back(idx, self.vocab.get(tokens.mem, &span))
|
||||
cdef vector[const Lexeme*].reverse_iterator it = suffixes.rbegin()
|
||||
cdef vector[const LexemeC*].reverse_iterator it = suffixes.rbegin()
|
||||
while it != suffixes.rend():
|
||||
idx = tokens.push_back(idx, deref(it))
|
||||
preinc(it)
|
||||
|
@ -213,10 +213,10 @@ cdef class Tokenizer:
|
|||
cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
|
||||
cached.length = n
|
||||
cached.is_lex = True
|
||||
lexemes = <const Lexeme**>self.mem.alloc(n, sizeof(Lexeme**))
|
||||
lexemes = <const LexemeC**>self.mem.alloc(n, sizeof(LexemeC**))
|
||||
for i in range(n):
|
||||
lexemes[i] = tokens[i].lex
|
||||
cached.data.lexemes = <const Lexeme* const*>lexemes
|
||||
cached.data.lexemes = <const LexemeC* const*>lexemes
|
||||
self._cache.set(key, cached)
|
||||
|
||||
cdef int _find_infix(self, Py_UNICODE* chars, size_t length) except -1:
|
||||
|
@ -243,7 +243,7 @@ cdef class Tokenizer:
|
|||
cdef unicode form
|
||||
cdef unicode lemma
|
||||
cdef dict props
|
||||
cdef Lexeme** lexemes
|
||||
cdef LexemeC** lexemes
|
||||
cdef hash_t hashed
|
||||
cdef UniStr string
|
||||
for chunk, substrings in sorted(rules.items()):
|
||||
|
@ -252,7 +252,7 @@ cdef class Tokenizer:
|
|||
form = props['F']
|
||||
lemma = props.get("L", None)
|
||||
slice_unicode(&string, form, 0, len(form))
|
||||
tokens[i].lex = <Lexeme*>self.vocab.get(self.vocab.mem, &string)
|
||||
tokens[i].lex = <LexemeC*>self.vocab.get(self.vocab.mem, &string)
|
||||
if lemma:
|
||||
tokens[i].lemma = self.vocab.strings[lemma]
|
||||
if 'pos' in props:
|
||||
|
|
|
@ -5,13 +5,13 @@ from cython.view cimport array as cvarray
|
|||
from cymem.cymem cimport Pool
|
||||
from thinc.typedefs cimport atom_t
|
||||
|
||||
from .typedefs cimport flags_t, attr_id_t, attr_t
|
||||
from .structs cimport Morphology, TokenC, Lexeme
|
||||
from .typedefs cimport flags_t, attr_id_t, attr_t, univ_tag_t
|
||||
from .structs cimport Morphology, TokenC, LexemeC
|
||||
from .vocab cimport Vocab
|
||||
from .strings cimport StringStore
|
||||
|
||||
|
||||
ctypedef const Lexeme* const_Lexeme_ptr
|
||||
ctypedef const LexemeC* const_Lexeme_ptr
|
||||
ctypedef TokenC* TokenC_ptr
|
||||
|
||||
ctypedef fused LexemeOrToken:
|
||||
|
@ -19,10 +19,10 @@ ctypedef fused LexemeOrToken:
|
|||
TokenC_ptr
|
||||
|
||||
|
||||
cdef attr_t get_lex_attr(const Lexeme* lex, attr_id_t feat_name) nogil
|
||||
cdef attr_t get_lex_attr(const LexemeC* lex, attr_id_t feat_name) nogil
|
||||
cdef attr_t get_token_attr(const TokenC* lex, attr_id_t feat_name) nogil
|
||||
|
||||
cdef inline bint check_flag(const Lexeme* lexeme, attr_id_t flag_id) nogil:
|
||||
cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil:
|
||||
return lexeme.flags & (1 << flag_id)
|
||||
|
||||
|
||||
|
@ -42,5 +42,32 @@ cdef class Tokens:
|
|||
|
||||
|
||||
cdef class Token:
|
||||
cdef Tokens _seq
|
||||
cdef readonly int i
|
||||
cdef cvarray vec
|
||||
|
||||
cdef readonly flags_t flags
|
||||
|
||||
cdef readonly attr_t id
|
||||
cdef readonly attr_t sic
|
||||
cdef readonly attr_t dense
|
||||
cdef readonly attr_t shape
|
||||
cdef readonly attr_t prefix
|
||||
cdef readonly attr_t suffix
|
||||
|
||||
cdef readonly attr_t length
|
||||
cdef readonly attr_t cluster
|
||||
cdef readonly attr_t pos_type
|
||||
|
||||
cdef readonly float prob
|
||||
cdef readonly float sentiment
|
||||
|
||||
cdef readonly Morphology morph
|
||||
cdef readonly univ_tag_t pos
|
||||
cdef readonly int fine_pos
|
||||
cdef readonly int idx
|
||||
cdef readonly int lemma
|
||||
cdef readonly int sense
|
||||
cdef readonly int dep_tag
|
||||
|
||||
cdef readonly int head_offset
|
||||
cdef readonly uint32_t l_kids
|
||||
cdef readonly uint32_t r_kids
|
||||
|
|
207
spacy/tokens.pyx
207
spacy/tokens.pyx
|
@ -32,7 +32,7 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil:
|
|||
return get_lex_attr(token.lex, feat_name)
|
||||
|
||||
|
||||
cdef attr_t get_lex_attr(const Lexeme* lex, attr_id_t feat_name) nogil:
|
||||
cdef attr_t get_lex_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
|
||||
if feat_name < (sizeof(flags_t) * 8):
|
||||
return check_flag(lex, feat_name)
|
||||
elif feat_name == ID:
|
||||
|
@ -85,7 +85,7 @@ cdef class Tokens:
|
|||
token (Token):
|
||||
"""
|
||||
bounds_check(i, self.length, PADDING)
|
||||
return Token(self, i)
|
||||
return cinit_token(&self.data[i])
|
||||
|
||||
def __iter__(self):
|
||||
"""Iterate over the tokens.
|
||||
|
@ -174,26 +174,57 @@ cdef class Tokens:
|
|||
self.data[i].lex = &EMPTY_LEXEME
|
||||
|
||||
|
||||
cdef Token cinit_token(const TokenC* c_tok):
    """Build a Python-level Token by copying the fields of a C TokenC struct.

    The lexical attributes are read from `c_tok.lex`; the remaining
    attributes are read from `c_tok` itself. The values are copied, so the
    returned Token does not point back at `c_tok`.

    Args:
        c_tok (const TokenC*): The token struct to copy from. Its `lex`
            pointer is dereferenced, so it must not be NULL.

    Returns:
        py_tok (Token): A new Token carrying the copied attribute values.
            Its `_seq` and `i` fields are not assigned here.
    """
    # NOTE(review): the original carried `@cython.freelist(64)` on this
    # function. That directive only applies to cdef classes, so it has been
    # dropped; presumably it was intended for the `Token` class -- confirm.
    cdef const LexemeC* lex = c_tok.lex
    # Token.__new__ allocates the object without running Token.__init__.
    cdef Token py_tok = Token.__new__(Token)

    # The buffer holds C floats, so the struct format code must be "f"
    # (the original "i" described C ints while sizing items as floats).
    # NOTE(review): the array is not filled from lex.vec here -- confirm
    # whether the vector values are meant to be copied in.
    py_tok.vec = cvarray(shape=(300,), itemsize=sizeof(float), format="f")

    # Attributes stored on the lexeme.
    py_tok.flags = lex.flags
    py_tok.id = lex.id
    py_tok.sic = lex.sic
    py_tok.dense = lex.dense
    py_tok.shape = lex.shape
    py_tok.prefix = lex.prefix
    py_tok.suffix = lex.suffix
    py_tok.length = lex.length
    py_tok.cluster = lex.cluster
    py_tok.pos_type = lex.pos_type

    py_tok.prob = lex.prob
    py_tok.sentiment = lex.sentiment

    # Attributes stored on the token itself.
    py_tok.morph = c_tok.morph
    py_tok.pos = c_tok.pos
    py_tok.fine_pos = c_tok.fine_pos
    py_tok.idx = c_tok.idx
    py_tok.lemma = c_tok.lemma
    py_tok.sense = c_tok.sense
    py_tok.dep_tag = c_tok.dep_tag
    # TokenC calls this field `head`; Token exposes it as `head_offset`.
    py_tok.head_offset = c_tok.head
    py_tok.l_kids = c_tok.l_kids
    py_tok.r_kids = c_tok.r_kids
    return py_tok
|
||||
|
||||
|
||||
cdef class Token:
|
||||
"""An individual token.
|
||||
|
||||
Internally, the Token is a tuple (i, tokens) --- it delegates to the Tokens
|
||||
object.
|
||||
"""
|
||||
def __init__(self, Tokens tokens, int i):
|
||||
self._seq = tokens
|
||||
self.i = i
|
||||
def __init__(self):
|
||||
pass
|
||||
#self._seq = tokens
|
||||
#self.i = i
|
||||
|
||||
def __unicode__(self):
|
||||
cdef const TokenC* t = &self._seq.data[self.i]
|
||||
cdef int end_idx = t.idx + t.lex.length
|
||||
if self.i + 1 == self._seq.length:
|
||||
return self.string
|
||||
if end_idx == t[1].idx:
|
||||
return self.string
|
||||
else:
|
||||
return self.string + ' '
|
||||
#def __unicode__(self):
|
||||
# cdef const TokenC* t = &self._seq.data[self.i]
|
||||
# cdef int end_idx = t.idx + t.lex.length
|
||||
# if self.i + 1 == self._seq.length:
|
||||
# return self.string
|
||||
# if end_idx == t[1].idx:
|
||||
# return self.string
|
||||
# else:
|
||||
# return self.string + ' '
|
||||
|
||||
def __len__(self):
|
||||
"""The number of unicode code-points in the original string.
|
||||
|
@ -201,87 +232,87 @@ cdef class Token:
|
|||
Returns:
|
||||
length (int):
|
||||
"""
|
||||
return self._seq.data[self.i].lex.length
|
||||
return self.length
|
||||
|
||||
property idx:
|
||||
"""The index into the original string at which the token starts.
|
||||
#property idx:
|
||||
# """The index into the original string at which the token starts.
|
||||
|
||||
The following is supposed to always be true:
|
||||
|
||||
>>> original_string[token.idx:token.idx + len(token)] == token.string
|
||||
"""
|
||||
def __get__(self):
|
||||
return self._seq.data[self.i].idx
|
||||
# The following is supposed to always be true:
|
||||
#
|
||||
# >>> original_string[token.idx:token.idx + len(token)] == token.string
|
||||
# """
|
||||
# def __get__(self):
|
||||
# return self._seq.data[self.i].idx
|
||||
|
||||
property cluster:
|
||||
"""The Brown cluster ID of the word: en.wikipedia.org/wiki/Brown_clustering
|
||||
|
||||
Similar words have better-than-chance likelihood of having similar cluster
|
||||
IDs, although the clustering is quite noisy. Cluster IDs make good features,
|
||||
and help to make models slightly more robust to domain variation.
|
||||
#property cluster:
|
||||
# """The Brown cluster ID of the word: en.wikipedia.org/wiki/Brown_clustering
|
||||
#
|
||||
# Similar words have better-than-chance likelihood of having similar cluster
|
||||
# IDs, although the clustering is quite noisy. Cluster IDs make good features,
|
||||
# and help to make models slightly more robust to domain variation.
|
||||
|
||||
A common trick is to use only the first N bits of a cluster ID in a feature,
|
||||
as the more general part of the hierarchical clustering is often more accurate
|
||||
than the lower categories.
|
||||
# A common trick is to use only the first N bits of a cluster ID in a feature,
|
||||
# as the more general part of the hierarchical clustering is often more accurate
|
||||
# than the lower categories.
|
||||
|
||||
To assist in this, I encode the cluster IDs little-endian, to allow a simple
|
||||
bit-mask:
|
||||
# To assist in this, I encode the cluster IDs little-endian, to allow a simple
|
||||
# bit-mask:
|
||||
|
||||
>>> six_bits = cluster & (2**6 - 1)
|
||||
"""
|
||||
def __get__(self):
|
||||
return self._seq.data[self.i].lex.cluster
|
||||
# >>> six_bits = cluster & (2**6 - 1)
|
||||
# """
|
||||
# def __get__(self):
|
||||
# return self._seq.data[self.i].lex.cluster
|
||||
|
||||
property string:
|
||||
"""The unicode string of the word, with no whitespace padding."""
|
||||
def __get__(self):
|
||||
cdef const TokenC* t = &self._seq.data[self.i]
|
||||
if t.lex.sic == 0:
|
||||
return ''
|
||||
cdef bytes utf8string = self._seq.vocab.strings[t.lex.sic]
|
||||
return utf8string.decode('utf8')
|
||||
#property string:
|
||||
# """The unicode string of the word, with no whitespace padding."""
|
||||
# def __get__(self):
|
||||
# cdef const TokenC* t = &self._seq.data[self.i]
|
||||
# if t.lex.sic == 0:
|
||||
# return ''
|
||||
# cdef bytes utf8string = self._seq.vocab.strings[t.lex.sic]
|
||||
# return utf8string.decode('utf8')
|
||||
|
||||
property lemma:
|
||||
"""The unicode string of the word's lemma. If no part-of-speech tag is
|
||||
assigned, the most common part-of-speech tag of the word is used.
|
||||
"""
|
||||
def __get__(self):
|
||||
cdef const TokenC* t = &self._seq.data[self.i]
|
||||
if t.lemma == 0:
|
||||
return self.string
|
||||
cdef bytes utf8string = self._seq.vocab.strings[t.lemma]
|
||||
return utf8string.decode('utf8')
|
||||
#property lemma:
|
||||
# """The unicode string of the word's lemma. If no part-of-speech tag is
|
||||
# assigned, the most common part-of-speech tag of the word is used.
|
||||
# """
|
||||
# def __get__(self):
|
||||
# cdef const TokenC* t = &self._seq.data[self.i]
|
||||
# if t.lemma == 0:
|
||||
# return self.string
|
||||
# cdef bytes utf8string = self._seq.vocab.strings[t.lemma]
|
||||
# return utf8string.decode('utf8')
|
||||
|
||||
property dep_tag:
|
||||
"""The ID integer of the word's dependency label. If no parse has been
|
||||
assigned, defaults to 0.
|
||||
"""
|
||||
def __get__(self):
|
||||
return self._seq.data[self.i].dep_tag
|
||||
#property dep_tag:
|
||||
# """The ID integer of the word's dependency label. If no parse has been
|
||||
# assigned, defaults to 0.
|
||||
# """
|
||||
# def __get__(self):
|
||||
# return self._seq.data[self.i].dep_tag
|
||||
|
||||
property pos:
|
||||
"""The ID integer of the word's part-of-speech tag, from the 13-tag
|
||||
Google Universal Tag Set. Constants for this tag set are available in
|
||||
spacy.typedefs.
|
||||
"""
|
||||
def __get__(self):
|
||||
return self._seq.data[self.i].pos
|
||||
#property pos:
|
||||
# """The ID integer of the word's part-of-speech tag, from the 13-tag
|
||||
# Google Universal Tag Set. Constants for this tag set are available in
|
||||
# spacy.typedefs.
|
||||
# """
|
||||
# def __get__(self):
|
||||
# return self._seq.data[self.i].pos
|
||||
|
||||
property fine_pos:
|
||||
"""The ID integer of the word's fine-grained part-of-speech tag, as assigned
|
||||
by the tagger model. Fine-grained tags include morphological information,
|
||||
and other distinctions, and allow a more accurate tagger to be trained.
|
||||
"""
|
||||
#property fine_pos:
|
||||
# """The ID integer of the word's fine-grained part-of-speech tag, as assigned
|
||||
# by the tagger model. Fine-grained tags include morphological information,
|
||||
# and other distinctions, and allow a more accurate tagger to be trained.
|
||||
# """
|
||||
|
||||
def __get__(self):
|
||||
return self._seq.data[self.i].fine_pos
|
||||
# def __get__(self):
|
||||
# return self._seq.data[self.i].fine_pos
|
||||
|
||||
property sic:
|
||||
def __get__(self):
|
||||
return self._seq.data[self.i].lex.sic
|
||||
#property sic:
|
||||
# def __get__(self):
|
||||
# return self._seq.data[self.i].lex.sic
|
||||
|
||||
property head:
|
||||
"""The token predicted by the parser to be the head of the current token."""
|
||||
def __get__(self):
|
||||
cdef const TokenC* t = &self._seq.data[self.i]
|
||||
return Token(self._seq, self.i + t.head)
|
||||
#property head:
|
||||
# """The token predicted by the parser to be the head of the current token."""
|
||||
# def __get__(self):
|
||||
# cdef const TokenC* t = &self._seq.data[self.i]
|
||||
# return Token(self._seq, self.i + t.head)
|
||||
|
|
|
@ -4,16 +4,16 @@ from preshed.maps cimport PreshMap
|
|||
from cymem.cymem cimport Pool
|
||||
from murmurhash.mrmr cimport hash64
|
||||
|
||||
from .structs cimport Lexeme, TokenC, UniStr
|
||||
from .structs cimport LexemeC, TokenC, UniStr
|
||||
from .typedefs cimport utf8_t, id_t, hash_t
|
||||
from .strings cimport StringStore
|
||||
|
||||
|
||||
cdef Lexeme EMPTY_LEXEME
|
||||
cdef LexemeC EMPTY_LEXEME
|
||||
|
||||
|
||||
cdef union LexemesOrTokens:
|
||||
const Lexeme* const* lexemes
|
||||
const LexemeC* const* lexemes
|
||||
TokenC* tokens
|
||||
|
||||
|
||||
|
@ -27,9 +27,9 @@ cdef class Vocab:
|
|||
cpdef public get_lex_props
|
||||
cdef Pool mem
|
||||
cpdef readonly StringStore strings
|
||||
cdef vector[Lexeme*] lexemes
|
||||
cdef vector[LexemeC*] lexemes
|
||||
|
||||
cdef const Lexeme* get(self, Pool mem, UniStr* s) except NULL
|
||||
cdef const LexemeC* get(self, Pool mem, UniStr* s) except NULL
|
||||
|
||||
cdef PreshMap _map
|
||||
|
||||
|
|
|
@ -2,20 +2,27 @@ from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
|
|||
from libc.string cimport memset
|
||||
|
||||
from os import path
|
||||
import codecs
|
||||
|
||||
from .lexeme cimport EMPTY_LEXEME
|
||||
from .lexeme cimport init as lexeme_init
|
||||
from .strings cimport slice_unicode
|
||||
from .typedefs cimport flags_t
|
||||
from .strings cimport hash_string
|
||||
from .orth cimport word_shape
|
||||
|
||||
|
||||
memset(&EMPTY_LEXEME, 0, sizeof(Lexeme))
|
||||
DEF MAX_VEC_SIZE = 100000
|
||||
|
||||
|
||||
cpdef Lexeme init_lexeme(id_t i, unicode string, hash_t hashed,
|
||||
cdef float[MAX_VEC_SIZE] EMPTY_VEC
|
||||
memset(EMPTY_VEC, 0, sizeof(EMPTY_VEC))
|
||||
memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
|
||||
EMPTY_LEXEME.vec = EMPTY_VEC
|
||||
|
||||
|
||||
cdef LexemeC init_lexeme(id_t i, unicode string, hash_t hashed,
|
||||
StringStore string_store, dict props) except *:
|
||||
cdef Lexeme lex
|
||||
cdef LexemeC lex
|
||||
lex.id = i
|
||||
lex.length = len(string)
|
||||
lex.sic = string_store[string]
|
||||
|
@ -28,13 +35,12 @@ cpdef Lexeme init_lexeme(id_t i, unicode string, hash_t hashed,
|
|||
lex.suffix = string_store[string[-3:]]
|
||||
lex.shape = string_store[word_shape(string)]
|
||||
|
||||
cdef object flags_val = props.get('flags', 0)
|
||||
lex.flags = <flags_t>flags_val
|
||||
lex.flags = props.get('flags', 0)
|
||||
return lex
|
||||
|
||||
|
||||
cdef class Vocab:
|
||||
'''A map container for a language's Lexeme structs.
|
||||
'''A map container for a language's LexemeC structs.
|
||||
'''
|
||||
def __init__(self, data_dir=None, get_lex_props=None):
|
||||
self.mem = Pool()
|
||||
|
@ -50,24 +56,25 @@ cdef class Vocab:
|
|||
if not path.isdir(data_dir):
|
||||
raise IOError("Path %s is a file, not a dir -- cannot load Vocab." % data_dir)
|
||||
self.strings.load(path.join(data_dir, 'strings.txt'))
|
||||
self.load(path.join(data_dir, 'lexemes.bin'))
|
||||
self.load_lexemes(path.join(data_dir, 'lexemes.bin'))
|
||||
#self.load_vectors(path.join(data_dir, 'deps.words'))
|
||||
|
||||
def __len__(self):
|
||||
"""The current number of lexemes stored."""
|
||||
return self.lexemes.size()
|
||||
|
||||
cdef const Lexeme* get(self, Pool mem, UniStr* string) except NULL:
|
||||
'''Get a pointer to a Lexeme from the lexicon, creating a new Lexeme
|
||||
cdef const LexemeC* get(self, Pool mem, UniStr* string) except NULL:
|
||||
'''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme
|
||||
if necessary, using memory acquired from the given pool. If the pool
|
||||
is the lexicon's own memory, the lexeme is saved in the lexicon.'''
|
||||
cdef Lexeme* lex
|
||||
lex = <Lexeme*>self._map.get(string.key)
|
||||
cdef LexemeC* lex
|
||||
lex = <LexemeC*>self._map.get(string.key)
|
||||
if lex != NULL:
|
||||
return lex
|
||||
if string.n < 3:
|
||||
mem = self.mem
|
||||
cdef unicode py_string = string.chars[:string.n]
|
||||
lex = <Lexeme*>mem.alloc(sizeof(Lexeme), 1)
|
||||
lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1)
|
||||
lex[0] = init_lexeme(self.lexemes.size(), py_string, string.key, self.strings,
|
||||
self.get_lex_props(py_string))
|
||||
if mem is self.mem:
|
||||
|
@ -81,13 +88,13 @@ cdef class Vocab:
|
|||
|
||||
def __getitem__(self, id_or_string):
|
||||
'''Retrieve a lexeme, given an int ID or a unicode string. If a previously
|
||||
unseen unicode string is given, a new Lexeme is created and stored.
|
||||
unseen unicode string is given, a new LexemeC is created and stored.
|
||||
|
||||
This function relies on Cython's struct-to-dict conversion. Python clients
|
||||
receive a dict keyed by strings (byte or unicode, depending on Python 2/3),
|
||||
with int values. Cython clients can instead receive a Lexeme struct value.
|
||||
with int values. Cython clients can instead receive a LexemeC struct value.
|
||||
More efficient Cython access is provided by Vocab.get, which returns
|
||||
a Lexeme*.
|
||||
a LexemeC*.
|
||||
|
||||
Args:
|
||||
id_or_string (int or unicode): The integer ID of a word, or its unicode
|
||||
|
@ -96,24 +103,26 @@ cdef class Vocab:
|
|||
is raised.
|
||||
|
||||
Returns:
|
||||
lexeme (dict): A Lexeme struct instance, which Cython translates into
|
||||
lexeme (dict): A LexemeC struct instance, which Cython translates into
|
||||
a dict if the operator is called from Python.
|
||||
'''
|
||||
if type(id_or_string) == int:
|
||||
if id_or_string >= self.lexemes.size():
|
||||
raise IndexError
|
||||
return self.lexemes.at(id_or_string)[0]
|
||||
return {}
|
||||
#return self.lexemes.at(id_or_string)[0]
|
||||
cdef UniStr string
|
||||
slice_unicode(&string, id_or_string, 0, len(id_or_string))
|
||||
cdef const Lexeme* lexeme = self.get(self.mem, &string)
|
||||
return lexeme[0]
|
||||
cdef const LexemeC* lexeme = self.get(self.mem, &string)
|
||||
return {}
|
||||
#return lexeme[0]
|
||||
|
||||
def __setitem__(self, unicode uni_string, dict props):
|
||||
cdef UniStr s
|
||||
slice_unicode(&s, uni_string, 0, len(uni_string))
|
||||
# Cast through the const here, since we're allowed to change our own
|
||||
# Lexemes.
|
||||
lex = <Lexeme*><void*>self.get(self.mem, &s)
|
||||
# LexemeCs.
|
||||
lex = <LexemeC*><void*>self.get(self.mem, &s)
|
||||
lex[0] = lexeme_init(lex.id, s.chars[:s.n], s.key, self.strings, props)
|
||||
|
||||
def dump(self, loc):
|
||||
|
@ -128,30 +137,30 @@ cdef class Vocab:
|
|||
key = self._map.c_map.cells[i].key
|
||||
if key == 0:
|
||||
continue
|
||||
lexeme = <Lexeme*>self._map.c_map.cells[i].value
|
||||
lexeme = <LexemeC*>self._map.c_map.cells[i].value
|
||||
st = fwrite(&key, sizeof(key), 1, fp)
|
||||
assert st == 1
|
||||
st = fwrite(lexeme, sizeof(Lexeme), 1, fp)
|
||||
st = fwrite(lexeme, sizeof(LexemeC), 1, fp)
|
||||
assert st == 1
|
||||
st = fclose(fp)
|
||||
assert st == 0
|
||||
|
||||
def load(self, loc):
|
||||
def load_lexemes(self, loc):
|
||||
if not path.exists(loc):
|
||||
raise IOError('Lexemes file not found at %s' % loc)
|
||||
raise IOError('LexemeCs file not found at %s' % loc)
|
||||
cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
|
||||
cdef FILE* fp = fopen(<char*>bytes_loc, 'rb')
|
||||
assert fp != NULL
|
||||
cdef size_t st
|
||||
cdef Lexeme* lexeme
|
||||
cdef LexemeC* lexeme
|
||||
cdef hash_t key
|
||||
i = 0
|
||||
while True:
|
||||
st = fread(&key, sizeof(key), 1, fp)
|
||||
if st != 1:
|
||||
break
|
||||
lexeme = <Lexeme*>self.mem.alloc(sizeof(Lexeme), 1)
|
||||
st = fread(lexeme, sizeof(Lexeme), 1, fp)
|
||||
lexeme = <LexemeC*>self.mem.alloc(sizeof(LexemeC), 1)
|
||||
st = fread(lexeme, sizeof(LexemeC), 1, fp)
|
||||
if st != 1:
|
||||
break
|
||||
self._map.set(key, lexeme)
|
||||
|
@ -160,3 +169,29 @@ cdef class Vocab:
|
|||
self.lexemes[lexeme.id] = lexeme
|
||||
i += 1
|
||||
fclose(fp)
|
||||
|
||||
def load_vectors(self, loc):
    """Read word vectors from a text file and attach them to known lexemes.

    Each non-blank line is split on whitespace: the first piece is the
    word and the remaining pieces are parsed as the vector's float values.
    Words that are not already in the vocabulary's map are ignored. For
    words that are present, a float buffer is allocated from `self.mem`,
    filled with the parsed values, and stored in the lexeme's `vec` field.

    Args:
        loc (unicode or bytes): Path of the vectors file, opened as UTF-8.

    Raises:
        ValueError: If a line carries MAX_VEC_SIZE or more values, or if a
            value cannot be parsed by `float`.
    """
    cdef int i
    cdef unicode line
    cdef unicode word
    cdef unicode val_str
    cdef hash_t key
    cdef LexemeC* lex
    cdef float* vec

    with codecs.open(loc, 'r', 'utf8') as file_:
        for line in file_:
            pieces = line.split()
            # A blank line (e.g. at the end of the file) has no word to
            # pop; skip it rather than failing with an IndexError.
            if not pieces:
                continue
            word = pieces.pop(0)
            if len(pieces) >= MAX_VEC_SIZE:
                sizes = (len(pieces), MAX_VEC_SIZE)
                msg = ("Your vector is %d elements. "
                       "The compile-time limit is %d elements." % sizes)
                raise ValueError(msg)
            key = hash_string(word)
            lex = <LexemeC*>self._map.get(key)
            # Only words that already have a lexeme receive a vector.
            if lex == NULL:
                continue
            vec = <float*>self.mem.alloc(len(pieces), sizeof(float))
            for i, val_str in enumerate(pieces):
                vec[i] = float(val_str)
            lex.vec = vec
|
||||
|
|
Loading…
Reference in New Issue
Block a user