Merge branch 'develop' of https://github.com/explosion/spaCy into develop

@@ -1,9 +1,6 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-import importlib
-
-from .compat import basestring_
 from .cli.info import info as cli_info
 from .glossary import explain
 from .deprecated import resolve_load_name

@@ -12,14 +9,7 @@ from . import util
 
 def load(name, **overrides):
     name = resolve_load_name(name, **overrides)
-    model_path = util.resolve_model_path(name)
-    meta = util.parse_package_meta(model_path)
-    if 'lang' not in meta:
-        raise IOError('No language setting found in model meta.')
-    cls = util.get_lang_class(meta['lang'])
-    overrides['meta'] = meta
-    overrides['path'] = model_path
-    return cls(**overrides)
+    return util.load_model(name)
 
 
 def info(model=None, markdown=False):
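With this change `spacy.load()` no longer resolves paths and package metadata itself; it hands the name straight to `util.load_model()`. A minimal usage sketch (assuming an 'en' model has been downloaded or linked):

```python
import spacy

# load() now delegates shortcut links, package names and paths to util.load_model()
nlp = spacy.load('en')
doc = nlp(u'This is a sentence.')
print([token.text for token in doc])
```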
@@ -150,6 +150,9 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
         else:
             int_key = IDS[name.upper()]
         if strings_map is not None and isinstance(value, basestring):
+            if hasattr(strings_map, 'add'):
+                value = strings_map.add(value)
+            else:
                 value = strings_map[value]
         inty_attrs[int_key] = value
     return inty_attrs
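The `hasattr(strings_map, 'add')` check lets `intify_attrs()` accept either the new hash-based `StringStore` (which interns values via `.add()`) or any plain dict-like mapping. A small sketch of the same duck-typing pattern (the `DummyMap` class is purely illustrative, not part of spaCy):

```python
class DummyMap(dict):
    """Stand-in for a plain mapping without an add() method."""

def intern_value(strings_map, value):
    # Prefer .add() when available (StringStore); fall back to item lookup.
    if hasattr(strings_map, 'add'):
        return strings_map.add(value)
    return strings_map[value]

print(intern_value(DummyMap(hello=42), 'hello'))  # -> 42
```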
@@ -20,8 +20,14 @@ def info(cmd, model=None, markdown=False):
     prints details in Markdown for easy copy-pasting to GitHub issues.
     """
     if model:
-        model_path = util.resolve_model_path(model)
-        meta = util.parse_package_meta(model_path)
+        if util.is_package(model):
+            model_path = util.get_package_path(model)
+        else:
+            model_path = util.get_data_path() / model
+        meta_path = model_path / 'meta.json'
+        if not meta_path.is_file():
+            prints(meta_path, title="Can't find model meta.json", exits=1)
+        meta = read_json(meta_path)
         if model_path.resolve() != model_path:
             meta['link'] = path2str(model_path)
             meta['source'] = path2str(model_path.resolve())
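The `info` command now locates the model directory itself (installed package vs. data path) and reads `meta.json` directly instead of going through the removed `resolve_model_path` helper. A usage sketch via the Python entry point shown earlier in this diff (assuming the 'en' model is installed):

```python
import spacy

# Prints the model meta (lang, version, pipeline, ...) read from meta.json;
# markdown=True formats it for pasting into a GitHub issue.
spacy.info('en', markdown=True)
```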
@@ -21,7 +21,7 @@ def link(cmd, origin, link_name, force=False):
     directory. Linking models allows loading them via spacy.load(link_name).
     """
     if util.is_package(origin):
-        model_path = util.get_model_package_path(origin)
+        model_path = util.get_package_path(model)
     else:
         model_path = Path(origin)
     if not model_path.exists():
@@ -1,13 +1,14 @@
 from cymem.cymem cimport Pool
 
 from .structs cimport TokenC
+from .typedefs cimport attr_t
 from .syntax.transition_system cimport Transition
 
 
 cdef struct GoldParseC:
     int* tags
     int* heads
-    int* labels
+    attr_t* labels
     int** brackets
     Transition* ner
 
@@ -384,7 +384,7 @@ cdef class GoldParse:
         # These are filled by the tagger/parser/entity recogniser
         self.c.tags = <int*>self.mem.alloc(len(doc), sizeof(int))
         self.c.heads = <int*>self.mem.alloc(len(doc), sizeof(int))
-        self.c.labels = <int*>self.mem.alloc(len(doc), sizeof(int))
+        self.c.labels = <attr_t*>self.mem.alloc(len(doc), sizeof(attr_t))
         self.c.ner = <Transition*>self.mem.alloc(len(doc), sizeof(Transition))
 
         self.words = [None] * len(doc)
@@ -35,4 +35,4 @@ class English(Language):
     Defaults = EnglishDefaults
 
 
-__all__ = ['English', 'EnglishDefaults']
+__all__ = ['English']
spacy/lang/xx/__init__.py (new file)
@@ -0,0 +1,26 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from ...language import Language
+from ...attrs import LANG
+from ...util import update_exc
+
+
+class MultiLanguageDefaults(Language.Defaults):
+    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+    lex_attr_getters[LANG] = lambda text: 'xx'
+
+    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
+
+
+class MultiLanguage(Language):
+    """Language class to be used for models that support multiple languages.
+    This module allows models to specify their language ID as 'xx'.
+    """
+    lang = 'xx'
+    Defaults = MultiLanguageDefaults
+
+
+__all__ = ['MultiLanguage']
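The new `xx` module gives multi-language models a Language subclass with only the base tokenizer exceptions. A minimal usage sketch (the sample sentence is arbitrary):

```python
from spacy.lang.xx import MultiLanguage

# Build a blank multi-language pipeline; models trained on several languages
# can declare 'xx' as their lang code and resolve to this class.
nlp = MultiLanguage()
doc = nlp(u'spaCy est un logiciel open source.')
print([token.text for token in doc])
```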
@@ -215,7 +215,9 @@ class Language(object):
         grads = {}
         def get_grads(W, dW, key=None):
             grads[key] = (W, dW)
-        for proc in self.pipeline[1:]:
+        pipes = list(self.pipeline[1:])
+        random.shuffle(pipes)
+        for proc in pipes:
             if not hasattr(proc, 'update'):
                 continue
             tokvecses, bp_tokvecses = tok2vec.model.begin_update(feats, drop=drop)
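The `get_grads` closure above collects (weights, gradient) pairs keyed by parameter id so the real optimizer can be applied once after every pipeline component has contributed, and the update order of the components is now shuffled. A toy sketch of that accumulate-then-apply pattern (names and values here are illustrative, not spaCy's API):

```python
grads = {}

def get_grads(W, dW, key=None):
    # Accumulate instead of updating immediately; the optimizer runs once later.
    grads[key] = (W, dW)

def sgd(W, dW, key=None, lr=0.001):  # toy stand-in for the real optimizer
    for i in range(len(W)):
        W[i] -= lr * dW[i]

get_grads([1.0, 2.0], [0.5, 0.5], key='tagger')
for key, (W, dW) in grads.items():
    sgd(W, dW, key=key)
```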
@@ -27,7 +27,7 @@ cdef class Lexeme:
     cdef inline SerializedLexemeC c_to_bytes(const LexemeC* lex) nogil:
         cdef SerializedLexemeC lex_data
         buff = <const unsigned char*>&lex.flags
-        end = <const unsigned char*>&lex.l2_norm + sizeof(lex.l2_norm)
+        end = <const unsigned char*>&lex.sentiment + sizeof(lex.sentiment)
         for i in range(sizeof(lex_data.data)):
             lex_data.data[i] = buff[i]
         return lex_data

@@ -35,7 +35,7 @@ cdef class Lexeme:
     @staticmethod
     cdef inline void c_from_bytes(LexemeC* lex, SerializedLexemeC lex_data) nogil:
         buff = <unsigned char*>&lex.flags
-        end = <unsigned char*>&lex.l2_norm + sizeof(lex.l2_norm)
+        end = <unsigned char*>&lex.sentiment + sizeof(lex.sentiment)
         for i in range(sizeof(lex_data.data)):
             buff[i] = lex_data.data[i]
 
@ -35,11 +35,11 @@ cdef class Lexeme:
|
|||
tag, dependency parse, or lemma (lemmatization depends on the part-of-speech
|
||||
tag).
|
||||
"""
|
||||
def __init__(self, Vocab vocab, int orth):
|
||||
def __init__(self, Vocab vocab, attr_t orth):
|
||||
"""Create a Lexeme object.
|
||||
|
||||
vocab (Vocab): The parent vocabulary
|
||||
orth (int): The orth id of the lexeme.
|
||||
orth (uint64): The orth id of the lexeme.
|
||||
Returns (Lexeme): The newly constructd object.
|
||||
"""
|
||||
self.vocab = vocab
|
||||
|
@ -51,7 +51,7 @@ cdef class Lexeme:
|
|||
if isinstance(other, Lexeme):
|
||||
a = self.orth
|
||||
b = other.orth
|
||||
elif isinstance(other, int):
|
||||
elif isinstance(other, long):
|
||||
a = self.orth
|
||||
b = other
|
||||
elif isinstance(other, str):
|
||||
|
@ -109,7 +109,7 @@ cdef class Lexeme:
|
|||
def to_bytes(self):
|
||||
lex_data = Lexeme.c_to_bytes(self.c)
|
||||
start = <const char*>&self.c.flags
|
||||
end = <const char*>&self.c.l2_norm + sizeof(self.c.l2_norm)
|
||||
end = <const char*>&self.c.sentiment + sizeof(self.c.sentiment)
|
||||
assert (end-start) == sizeof(lex_data.data), (end-start, sizeof(lex_data.data))
|
||||
byte_string = b'\0' * sizeof(lex_data.data)
|
||||
byte_chars = <char*>byte_string
|
||||
|
@ -136,12 +136,7 @@ cdef class Lexeme:
|
|||
RETURNS (bool): Whether a word vector is associated with the object.
|
||||
"""
|
||||
def __get__(self):
|
||||
cdef int i
|
||||
for i in range(self.vocab.vectors_length):
|
||||
if self.c.vector[i] != 0:
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
return self.vocab.has_vector(self.c.orth)
|
||||
|
||||
property vector_norm:
|
||||
"""The L2 norm of the lexeme's vector representation.
|
||||
|
@ -149,10 +144,8 @@ cdef class Lexeme:
|
|||
RETURNS (float): The L2 norm of the vector representation.
|
||||
"""
|
||||
def __get__(self):
|
||||
return self.c.l2_norm
|
||||
|
||||
def __set__(self, float value):
|
||||
self.c.l2_norm = value
|
||||
vector = self.vector
|
||||
return numpy.sqrt((vector**2).sum())
|
||||
|
||||
property vector:
|
||||
"""A real-valued meaning representation.
|
||||
|
@ -169,27 +162,16 @@ cdef class Lexeme:
|
|||
"model doesn't include word vectors. For more info, see "
|
||||
"the documentation: \n%s\n" % about.__docs_models__
|
||||
)
|
||||
|
||||
vector_view = <float[:length,]>self.c.vector
|
||||
return numpy.asarray(vector_view)
|
||||
return self.vocab.get_vector(self.c.orth)
|
||||
|
||||
def __set__(self, vector):
|
||||
assert len(vector) == self.vocab.vectors_length
|
||||
cdef float value
|
||||
cdef double norm = 0.0
|
||||
for i, value in enumerate(vector):
|
||||
self.c.vector[i] = value
|
||||
norm += value * value
|
||||
self.c.l2_norm = sqrt(norm)
|
||||
self.vocab.set_vector(self.c.orth, vector)
|
||||
|
||||
property rank:
|
||||
def __get__(self):
|
||||
return self.c.id
|
||||
|
||||
property repvec:
|
||||
def __get__(self):
|
||||
raise AttributeError("lex.repvec has been renamed to lex.vector")
|
||||
|
||||
property sentiment:
|
||||
def __get__(self):
|
||||
return self.c.sentiment
|
||||
|
@ -210,31 +192,31 @@ cdef class Lexeme:
|
|||
|
||||
property lower:
|
||||
def __get__(self): return self.c.lower
|
||||
def __set__(self, int x): self.c.lower = x
|
||||
def __set__(self, attr_t x): self.c.lower = x
|
||||
|
||||
property norm:
|
||||
def __get__(self): return self.c.norm
|
||||
def __set__(self, int x): self.c.norm = x
|
||||
def __set__(self, attr_t x): self.c.norm = x
|
||||
|
||||
property shape:
|
||||
def __get__(self): return self.c.shape
|
||||
def __set__(self, int x): self.c.shape = x
|
||||
def __set__(self, attr_t x): self.c.shape = x
|
||||
|
||||
property prefix:
|
||||
def __get__(self): return self.c.prefix
|
||||
def __set__(self, int x): self.c.prefix = x
|
||||
def __set__(self, attr_t x): self.c.prefix = x
|
||||
|
||||
property suffix:
|
||||
def __get__(self): return self.c.suffix
|
||||
def __set__(self, int x): self.c.suffix = x
|
||||
def __set__(self, attr_t x): self.c.suffix = x
|
||||
|
||||
property cluster:
|
||||
def __get__(self): return self.c.cluster
|
||||
def __set__(self, int x): self.c.cluster = x
|
||||
def __set__(self, attr_t x): self.c.cluster = x
|
||||
|
||||
property lang:
|
||||
def __get__(self): return self.c.lang
|
||||
def __set__(self, int x): self.c.lang = x
|
||||
def __set__(self, attr_t x): self.c.lang = x
|
||||
|
||||
property prob:
|
||||
def __get__(self): return self.c.prob
|
||||
|
@ -270,7 +252,7 @@ cdef class Lexeme:
|
|||
|
||||
property is_oov:
|
||||
def __get__(self): return Lexeme.c_check_flag(self.c, IS_OOV)
|
||||
def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_OOV, x)
|
||||
def __set__(self, attr_t x): Lexeme.c_set_flag(self.c, IS_OOV, x)
|
||||
|
||||
property is_stop:
|
||||
def __get__(self): return Lexeme.c_check_flag(self.c, IS_STOP)
|
||||
|
@ -320,7 +302,6 @@ cdef class Lexeme:
|
|||
def __get__(self): return Lexeme.c_check_flag(self.c, IS_RIGHT_PUNCT)
|
||||
def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_RIGHT_PUNCT, x)
|
||||
|
||||
|
||||
property like_url:
|
||||
def __get__(self): return Lexeme.c_check_flag(self.c, LIKE_URL)
|
||||
def __set__(self, bint x): Lexeme.c_set_flag(self.c, LIKE_URL, x)
|
||||
|
|
|
@@ -154,7 +154,7 @@ def _convert_strings(token_specs, string_store):
             if isinstance(attr, basestring):
                 attr = attrs.IDS.get(attr.upper())
             if isinstance(value, basestring):
-                value = string_store[value]
+                value = string_store.add(value)
             if isinstance(value, bool):
                 value = int(value)
             if attr is not None:

@@ -381,7 +381,7 @@ cdef class Matcher:
 
     def _normalize_key(self, key):
         if isinstance(key, basestring):
-            return self.vocab.strings[key]
+            return self.vocab.strings.add(key)
         else:
             return key
 

@@ -469,7 +469,7 @@ cdef class PhraseMatcher:
             self(doc)
             yield doc
 
-    def accept_match(self, Doc doc, int ent_id, int label, int start, int end):
+    def accept_match(self, Doc doc, attr_t ent_id, attr_t label, int start, int end):
         assert (end - start) < self.max_length
         cdef int i, j
         for i in range(self.max_length):
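Because `_normalize_key()` now interns unseen keys via `strings.add()`, a match rule name no longer has to exist in the vocabulary beforehand. A small usage sketch (pattern and text are illustrative):

```python
from spacy.lang.en import English
from spacy.matcher import Matcher

nlp = English()
matcher = Matcher(nlp.vocab)
# 'GolfClub' is interned on the fly; no need to add it to the StringStore first.
matcher.add('GolfClub', None, [{'ORTH': 'golf'}, {'ORTH': 'club'}])

doc = nlp(u'The golf club is broken')
for match_id, start, end in matcher(doc):
    print(nlp.vocab.strings[match_id], doc[start:end].text)
```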
@ -48,7 +48,7 @@ cdef class Morphology:
|
|||
self.tag_map[tag_str] = dict(attrs)
|
||||
attrs = intify_attrs(attrs, self.strings, _do_deprecated=True)
|
||||
self.rich_tags[i].id = i
|
||||
self.rich_tags[i].name = self.strings[tag_str]
|
||||
self.rich_tags[i].name = self.strings.add(tag_str)
|
||||
self.rich_tags[i].morph = 0
|
||||
self.rich_tags[i].pos = attrs[POS]
|
||||
self.reverse_index[self.rich_tags[i].name] = i
|
||||
|
@ -59,10 +59,12 @@ cdef class Morphology:
|
|||
|
||||
cdef int assign_tag(self, TokenC* token, tag) except -1:
|
||||
if isinstance(tag, basestring):
|
||||
tag_id = self.reverse_index[self.strings[tag]]
|
||||
else:
|
||||
tag = self.strings.add(tag)
|
||||
if tag in self.reverse_index:
|
||||
tag_id = self.reverse_index[tag]
|
||||
self.assign_tag_id(token, tag_id)
|
||||
else:
|
||||
token.tag = tag
|
||||
|
||||
cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1:
|
||||
if tag_id >= self.n_tags:
|
||||
|
@ -73,7 +75,7 @@ cdef class Morphology:
|
|||
# the statistical model fails.
|
||||
# Related to Issue #220
|
||||
if Lexeme.c_check_flag(token.lex, IS_SPACE):
|
||||
tag_id = self.reverse_index[self.strings['SP']]
|
||||
tag_id = self.reverse_index[self.strings.add('SP')]
|
||||
rich_tag = self.rich_tags[tag_id]
|
||||
analysis = <MorphAnalysisC*>self._cache.get(tag_id, token.lex.orth)
|
||||
if analysis is NULL:
|
||||
|
@ -104,7 +106,7 @@ cdef class Morphology:
|
|||
tag (unicode): The part-of-speech tag to key the exception.
|
||||
orth (unicode): The word-form to key the exception.
|
||||
"""
|
||||
tag = self.strings[tag_str]
|
||||
tag = self.strings.add(tag_str)
|
||||
tag_id = self.reverse_index[tag]
|
||||
orth = self.strings[orth_str]
|
||||
cdef RichTagC rich_tag = self.rich_tags[tag_id]
|
||||
|
@ -140,14 +142,14 @@ cdef class Morphology:
|
|||
def lemmatize(self, const univ_pos_t univ_pos, attr_t orth, morphology):
|
||||
cdef unicode py_string = self.strings[orth]
|
||||
if self.lemmatizer is None:
|
||||
return self.strings[py_string.lower()]
|
||||
return self.strings.add(py_string.lower())
|
||||
if univ_pos not in (NOUN, VERB, ADJ, PUNCT):
|
||||
return self.strings[py_string.lower()]
|
||||
return self.strings.add(py_string.lower())
|
||||
cdef set lemma_strings
|
||||
cdef unicode lemma_string
|
||||
lemma_strings = self.lemmatizer(py_string, univ_pos, morphology)
|
||||
lemma_string = sorted(lemma_strings)[0]
|
||||
lemma = self.strings[lemma_string]
|
||||
lemma = self.strings.add(lemma_string)
|
||||
return lemma
|
||||
|
||||
|
||||
|
|
|
@@ -228,6 +228,7 @@ class NeuralTagger(object):
                 idx += 1
+        correct = self.model.ops.xp.array(correct, dtype='i')
         d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1])
         d_scores /= d_scores.shape[0]
         loss = (d_scores**2).sum()
         d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
         return float(loss), d_scores

@@ -292,6 +293,7 @@ class NeuralLabeller(NeuralTagger):
                 idx += 1
+        correct = self.model.ops.xp.array(correct, dtype='i')
         d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1])
         d_scores /= d_scores.shape[0]
         loss = (d_scores**2).sum()
         d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
         return float(loss), d_scores
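In both loss functions the gradient is the difference between the model's scores and the one-hot encoding of the gold tag ids, and the reported loss is just the squared sum of that difference. A numpy-only sketch of the computation (shapes and values are illustrative, and `to_categorical` is re-implemented here rather than imported):

```python
import numpy

def to_categorical(ids, nb_classes):
    # One-hot encode integer class ids.
    out = numpy.zeros((len(ids), nb_classes), dtype='f')
    out[numpy.arange(len(ids)), ids] = 1.0
    return out

scores = numpy.array([[0.7, 0.2, 0.1],
                      [0.1, 0.8, 0.1]], dtype='f')  # model output: 2 tokens x 3 tags
correct = numpy.array([0, 2], dtype='i')            # gold tag ids
d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1])
d_scores /= d_scores.shape[0]
loss = (d_scores ** 2).sum()
print(loss, d_scores)
```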
@@ -1,4 +1,5 @@
 from libc.stdint cimport int64_t
+from libcpp.vector cimport vector
 
 from cymem.cymem cimport Pool
 from preshed.maps cimport PreshMap

@@ -8,6 +9,9 @@ from .typedefs cimport attr_t, hash_t
 
 
 cpdef hash_t hash_string(unicode string) except 0
+cdef hash_t hash_utf8(char* utf8_string, int length) nogil
+
+cdef unicode decode_Utf8Str(const Utf8Str* string)
 
 
 ctypedef union Utf8Str:

@@ -17,13 +21,11 @@ ctypedef union Utf8Str:
 
 cdef class StringStore:
     cdef Pool mem
-    cdef Utf8Str* c
-    cdef int64_t size
     cdef bint is_frozen
 
+    cdef vector[hash_t] keys
     cdef public PreshMap _map
     cdef public PreshMap _oov
-    cdef int64_t _resize_at
 
     cdef const Utf8Str* intern_unicode(self, unicode py_string)
     cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length)
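These declarations carry the core of the switch from sequential integer IDs to 64-bit hash keys: strings are interned with `add()`, looked up by their hash, and the `keys` vector only records what has been seen. A small usage sketch (the example string is arbitrary):

```python
from spacy.strings import StringStore, hash_string

stringstore = StringStore()
key = stringstore.add(u'coffee')
# The key is the 64-bit hash of the string, not a store-local index.
assert key == hash_string(u'coffee')
assert stringstore[key] == u'coffee'
assert u'coffee' in stringstore
print(len(stringstore), key)
```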
@ -11,6 +11,9 @@ from libc.stdint cimport uint32_t
|
|||
import ujson
|
||||
import dill
|
||||
|
||||
from .symbols import IDS as SYMBOLS_BY_STR
|
||||
from .symbols import NAMES as SYMBOLS_BY_INT
|
||||
|
||||
from .typedefs cimport hash_t
|
||||
from . import util
|
||||
|
||||
|
@ -28,7 +31,7 @@ cdef uint32_t hash32_utf8(char* utf8_string, int length) nogil:
|
|||
return hash32(utf8_string, length, 1)
|
||||
|
||||
|
||||
cdef unicode _decode(const Utf8Str* string):
|
||||
cdef unicode decode_Utf8Str(const Utf8Str* string):
|
||||
cdef int i, length
|
||||
if string.s[0] < sizeof(string.s) and string.s[0] != 0:
|
||||
return string.s[1:string.s[0]+1].decode('utf8')
|
||||
|
@ -45,10 +48,10 @@ cdef unicode _decode(const Utf8Str* string):
|
|||
return string.p[i:length + i].decode('utf8')
|
||||
|
||||
|
||||
cdef Utf8Str _allocate(Pool mem, const unsigned char* chars, uint32_t length) except *:
|
||||
cdef Utf8Str* _allocate(Pool mem, const unsigned char* chars, uint32_t length) except *:
|
||||
cdef int n_length_bytes
|
||||
cdef int i
|
||||
cdef Utf8Str string
|
||||
cdef Utf8Str* string = <Utf8Str*>mem.alloc(1, sizeof(Utf8Str))
|
||||
cdef uint32_t ulength = length
|
||||
if length < sizeof(string.s):
|
||||
string.s[0] = <unsigned char>length
|
||||
|
@ -73,7 +76,7 @@ cdef Utf8Str _allocate(Pool mem, const unsigned char* chars, uint32_t length) ex
|
|||
|
||||
|
||||
cdef class StringStore:
|
||||
"""Map strings to and from integer IDs."""
|
||||
"""Lookup strings by 64-bit hash"""
|
||||
def __init__(self, strings=None, freeze=False):
|
||||
"""Create the StringStore.
|
||||
|
||||
|
@ -83,70 +86,66 @@ cdef class StringStore:
|
|||
self.mem = Pool()
|
||||
self._map = PreshMap()
|
||||
self._oov = PreshMap()
|
||||
self._resize_at = 10000
|
||||
self.c = <Utf8Str*>self.mem.alloc(self._resize_at, sizeof(Utf8Str))
|
||||
self.size = 1
|
||||
self.is_frozen = freeze
|
||||
if strings is not None:
|
||||
for string in strings:
|
||||
_ = self[string]
|
||||
self.add(string)
|
||||
|
||||
property size:
|
||||
def __get__(self):
|
||||
return self.size -1
|
||||
def __getitem__(self, object string_or_id):
|
||||
"""Retrieve a string from a given hash ID, or vice versa.
|
||||
|
||||
string_or_id (bytes or unicode or uint64): The value to encode.
|
||||
Returns (unicode or uint64): The value to be retrieved.
|
||||
"""
|
||||
if isinstance(string_or_id, basestring) and len(string_or_id) == 0:
|
||||
return 0
|
||||
elif string_or_id == 0:
|
||||
return u''
|
||||
elif string_or_id in SYMBOLS_BY_STR:
|
||||
return SYMBOLS_BY_STR[string_or_id]
|
||||
|
||||
cdef hash_t key
|
||||
|
||||
if isinstance(string_or_id, unicode):
|
||||
key = hash_string(string_or_id)
|
||||
return key
|
||||
elif isinstance(string_or_id, bytes):
|
||||
key = hash_utf8(string_or_id, len(string_or_id))
|
||||
return key
|
||||
else:
|
||||
if string_or_id < len(SYMBOLS_BY_INT):
|
||||
return SYMBOLS_BY_INT[string_or_id]
|
||||
key = string_or_id
|
||||
utf8str = <Utf8Str*>self._map.get(key)
|
||||
if utf8str is NULL:
|
||||
raise KeyError(string_or_id)
|
||||
else:
|
||||
return decode_Utf8Str(utf8str)
|
||||
|
||||
def add(self, string):
|
||||
if isinstance(string, unicode):
|
||||
if string in SYMBOLS_BY_STR:
|
||||
return SYMBOLS_BY_STR[string]
|
||||
key = hash_string(string)
|
||||
self.intern_unicode(string)
|
||||
elif isinstance(string, bytes):
|
||||
if string in SYMBOLS_BY_STR:
|
||||
return SYMBOLS_BY_STR[string]
|
||||
key = hash_utf8(string, len(string))
|
||||
self._intern_utf8(string, len(string))
|
||||
else:
|
||||
raise TypeError(
|
||||
"Can only add unicode or bytes. Got type: %s" % type(string))
|
||||
return key
|
||||
|
||||
def __len__(self):
|
||||
"""The number of strings in the store.
|
||||
|
||||
RETURNS (int): The number of strings in the store.
|
||||
"""
|
||||
return self.size-1
|
||||
return self.keys.size()
|
||||
|
||||
def __getitem__(self, object string_or_id):
|
||||
"""Retrieve a string from a given integer ID, or vice versa.
|
||||
|
||||
string_or_id (bytes or unicode or int): The value to encode.
|
||||
Returns (unicode or int): The value to be retrieved.
|
||||
"""
|
||||
if isinstance(string_or_id, basestring) and len(string_or_id) == 0:
|
||||
return 0
|
||||
elif string_or_id == 0:
|
||||
return u''
|
||||
|
||||
cdef bytes byte_string
|
||||
cdef const Utf8Str* utf8str
|
||||
cdef uint64_t int_id
|
||||
cdef uint32_t oov_id
|
||||
if isinstance(string_or_id, (int, long)):
|
||||
int_id = string_or_id
|
||||
oov_id = string_or_id
|
||||
if int_id < <uint64_t>self.size:
|
||||
return _decode(&self.c[int_id])
|
||||
else:
|
||||
utf8str = <Utf8Str*>self._oov.get(oov_id)
|
||||
if utf8str is not NULL:
|
||||
return _decode(utf8str)
|
||||
else:
|
||||
raise IndexError(string_or_id)
|
||||
else:
|
||||
if isinstance(string_or_id, bytes):
|
||||
byte_string = <bytes>string_or_id
|
||||
elif isinstance(string_or_id, unicode):
|
||||
byte_string = (<unicode>string_or_id).encode('utf8')
|
||||
else:
|
||||
raise TypeError(type(string_or_id))
|
||||
utf8str = self._intern_utf8(byte_string, len(byte_string))
|
||||
if utf8str is NULL:
|
||||
# TODO: We need to use 32 bit here, for compatibility with the
|
||||
# vocabulary values. This makes birthday paradox probabilities
|
||||
# pretty bad.
|
||||
# We could also get unlucky here, and hash into a value that
|
||||
# collides with the 'real' strings.
|
||||
return hash32_utf8(byte_string, len(byte_string))
|
||||
else:
|
||||
return utf8str - self.c
|
||||
|
||||
def __contains__(self, unicode string not None):
|
||||
def __contains__(self, string not None):
|
||||
"""Check whether a string is in the store.
|
||||
|
||||
string (unicode): The string to check.
|
||||
|
@ -154,7 +153,11 @@ cdef class StringStore:
|
|||
"""
|
||||
if len(string) == 0:
|
||||
return True
|
||||
cdef hash_t key = hash_string(string)
|
||||
if string in SYMBOLS_BY_STR:
|
||||
return True
|
||||
if isinstance(string, unicode):
|
||||
string = string.encode('utf8')
|
||||
cdef hash_t key = hash_utf8(string, len(string))
|
||||
return self._map.get(key) is not NULL
|
||||
|
||||
def __iter__(self):
|
||||
|
@ -163,16 +166,15 @@ cdef class StringStore:
|
|||
YIELDS (unicode): A string in the store.
|
||||
"""
|
||||
cdef int i
|
||||
for i in range(self.size):
|
||||
yield _decode(&self.c[i]) if i > 0 else u''
|
||||
cdef hash_t key
|
||||
for i in range(self.keys.size()):
|
||||
key = self.keys[i]
|
||||
utf8str = <Utf8Str*>self._map.get(key)
|
||||
yield decode_Utf8Str(utf8str)
|
||||
# TODO: Iterate OOV here?
|
||||
|
||||
def __reduce__(self):
|
||||
strings = [""]
|
||||
for i in range(1, self.size):
|
||||
string = &self.c[i]
|
||||
py_string = _decode(string)
|
||||
strings.append(py_string)
|
||||
strings = list(self)
|
||||
return (StringStore, (strings,), None, None, None)
|
||||
|
||||
def to_disk(self, path):
|
||||
|
@ -230,11 +232,9 @@ cdef class StringStore:
|
|||
self.mem = Pool()
|
||||
self._map = PreshMap()
|
||||
self._oov = PreshMap()
|
||||
self._resize_at = 10000
|
||||
self.c = <Utf8Str*>self.mem.alloc(self._resize_at, sizeof(Utf8Str))
|
||||
self.size = 1
|
||||
self.keys.clear()
|
||||
for string in strings:
|
||||
_ = self[string]
|
||||
self.add(string)
|
||||
self.is_frozen = freeze
|
||||
|
||||
cdef const Utf8Str* intern_unicode(self, unicode py_string):
|
||||
|
@ -258,39 +258,11 @@ cdef class StringStore:
|
|||
key32 = hash32_utf8(utf8_string, length)
|
||||
# Important: Make the OOV store own the memory. That way it's trivial
|
||||
# to flush them all.
|
||||
value = <Utf8Str*>self._oov.mem.alloc(1, sizeof(Utf8Str))
|
||||
value[0] = _allocate(self._oov.mem, <unsigned char*>utf8_string, length)
|
||||
value = _allocate(self._oov.mem, <unsigned char*>utf8_string, length)
|
||||
self._oov.set(key32, value)
|
||||
return NULL
|
||||
|
||||
if self.size == self._resize_at:
|
||||
self._realloc()
|
||||
self.c[self.size] = _allocate(self.mem, <unsigned char*>utf8_string, length)
|
||||
self._map.set(key, <void*>&self.c[self.size])
|
||||
self.size += 1
|
||||
return &self.c[self.size-1]
|
||||
|
||||
def _realloc(self):
|
||||
# We want to map straight to pointers, but they'll be invalidated if
|
||||
# we resize our array. So, first we remap to indices, then we resize,
|
||||
# then we can acquire the new pointers.
|
||||
cdef Pool tmp_mem = Pool()
|
||||
keys = <key_t*>tmp_mem.alloc(self.size, sizeof(key_t))
|
||||
cdef key_t key
|
||||
cdef void* value
|
||||
cdef const Utf8Str ptr
|
||||
cdef int i = 0
|
||||
cdef size_t offset
|
||||
while map_iter(self._map.c_map, &i, &key, &value):
|
||||
# Find array index with pointer arithmetic
|
||||
offset = ((<Utf8Str*>value) - self.c)
|
||||
keys[offset] = key
|
||||
|
||||
self._resize_at *= 2
|
||||
cdef size_t new_size = self._resize_at * sizeof(Utf8Str)
|
||||
self.c = <Utf8Str*>self.mem.realloc(self.c, new_size)
|
||||
|
||||
self._map = PreshMap(self.size)
|
||||
for i in range(self.size):
|
||||
if keys[i]:
|
||||
self._map.set(keys[i], &self.c[i])
|
||||
value = _allocate(self.mem, <unsigned char*>utf8_string, length)
|
||||
self._map.set(key, value)
|
||||
self.keys.push_back(key)
|
||||
return value
|
||||
|
|
|
@ -5,8 +5,6 @@ from .parts_of_speech cimport univ_pos_t
|
|||
|
||||
|
||||
cdef struct LexemeC:
|
||||
float* vector
|
||||
|
||||
flags_t flags
|
||||
|
||||
attr_t lang
|
||||
|
@ -25,11 +23,10 @@ cdef struct LexemeC:
|
|||
|
||||
float prob
|
||||
float sentiment
|
||||
float l2_norm
|
||||
|
||||
|
||||
cdef struct SerializedLexemeC:
|
||||
unsigned char[4*13 + 8] data
|
||||
unsigned char[8 + 8*10 + 4 + 4] data
|
||||
# sizeof(flags_t) # flags
|
||||
# + sizeof(attr_t) # lang
|
||||
# + sizeof(attr_t) # id
|
||||
|
@ -50,7 +47,7 @@ cdef struct Entity:
|
|||
hash_t id
|
||||
int start
|
||||
int end
|
||||
int label
|
||||
attr_t label
|
||||
|
||||
|
||||
cdef struct TokenC:
|
||||
|
@ -58,12 +55,12 @@ cdef struct TokenC:
|
|||
uint64_t morph
|
||||
univ_pos_t pos
|
||||
bint spacy
|
||||
int tag
|
||||
attr_t tag
|
||||
int idx
|
||||
int lemma
|
||||
int sense
|
||||
attr_t lemma
|
||||
attr_t sense
|
||||
int head
|
||||
int dep
|
||||
attr_t dep
|
||||
bint sent_start
|
||||
|
||||
uint32_t l_kids
|
||||
|
@ -72,5 +69,5 @@ cdef struct TokenC:
|
|||
uint32_t r_edge
|
||||
|
||||
int ent_iob
|
||||
int ent_type # TODO: Is there a better way to do this? Multiple sources of truth..
|
||||
attr_t ent_type # TODO: Is there a better way to do this? Multiple sources of truth..
|
||||
hash_t ent_id
|
||||
|
|
|
@ -3,6 +3,7 @@ from cymem.cymem cimport Pool
|
|||
from thinc.typedefs cimport weight_t
|
||||
|
||||
from .stateclass cimport StateClass
|
||||
from ..typedefs cimport attr_t
|
||||
|
||||
from .transition_system cimport TransitionSystem, Transition
|
||||
from ..gold cimport GoldParseC
|
||||
|
|
|
@ -99,7 +99,7 @@ cdef bint arc_is_gold(const GoldParseC* gold, int head, int child) nogil:
|
|||
return False
|
||||
|
||||
|
||||
cdef bint label_is_gold(const GoldParseC* gold, int head, int child, int label) nogil:
|
||||
cdef bint label_is_gold(const GoldParseC* gold, int head, int child, attr_t label) nogil:
|
||||
if gold.labels[child] == -1:
|
||||
return True
|
||||
elif label == -1:
|
||||
|
@ -116,16 +116,16 @@ cdef bint _is_gold_root(const GoldParseC* gold, int word) nogil:
|
|||
|
||||
cdef class Shift:
|
||||
@staticmethod
|
||||
cdef bint is_valid(const StateC* st, int label) nogil:
|
||||
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
||||
return st.buffer_length() >= 2 and not st.shifted[st.B(0)] and not st.B_(0).sent_start
|
||||
|
||||
@staticmethod
|
||||
cdef int transition(StateC* st, int label) nogil:
|
||||
cdef int transition(StateC* st, attr_t label) nogil:
|
||||
st.push()
|
||||
st.fast_forward()
|
||||
|
||||
@staticmethod
|
||||
cdef weight_t cost(StateClass st, const GoldParseC* gold, int label) nogil:
|
||||
cdef weight_t cost(StateClass st, const GoldParseC* gold, attr_t label) nogil:
|
||||
return Shift.move_cost(st, gold) + Shift.label_cost(st, gold, label)
|
||||
|
||||
@staticmethod
|
||||
|
@ -133,17 +133,17 @@ cdef class Shift:
|
|||
return push_cost(s, gold, s.B(0))
|
||||
|
||||
@staticmethod
|
||||
cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, int label) nogil:
|
||||
cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
|
||||
return 0
|
||||
|
||||
|
||||
cdef class Reduce:
|
||||
@staticmethod
|
||||
cdef bint is_valid(const StateC* st, int label) nogil:
|
||||
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
||||
return st.stack_depth() >= 2
|
||||
|
||||
@staticmethod
|
||||
cdef int transition(StateC* st, int label) nogil:
|
||||
cdef int transition(StateC* st, attr_t label) nogil:
|
||||
if st.has_head(st.S(0)):
|
||||
st.pop()
|
||||
else:
|
||||
|
@ -151,7 +151,7 @@ cdef class Reduce:
|
|||
st.fast_forward()
|
||||
|
||||
@staticmethod
|
||||
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
|
||||
cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
|
||||
return Reduce.move_cost(s, gold) + Reduce.label_cost(s, gold, label)
|
||||
|
||||
@staticmethod
|
||||
|
@ -170,23 +170,23 @@ cdef class Reduce:
|
|||
return cost
|
||||
|
||||
@staticmethod
|
||||
cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, int label) nogil:
|
||||
cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
|
||||
return 0
|
||||
|
||||
|
||||
cdef class LeftArc:
|
||||
@staticmethod
|
||||
cdef bint is_valid(const StateC* st, int label) nogil:
|
||||
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
||||
return not st.B_(0).sent_start
|
||||
|
||||
@staticmethod
|
||||
cdef int transition(StateC* st, int label) nogil:
|
||||
cdef int transition(StateC* st, attr_t label) nogil:
|
||||
st.add_arc(st.B(0), st.S(0), label)
|
||||
st.pop()
|
||||
st.fast_forward()
|
||||
|
||||
@staticmethod
|
||||
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
|
||||
cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
|
||||
return LeftArc.move_cost(s, gold) + LeftArc.label_cost(s, gold, label)
|
||||
|
||||
@staticmethod
|
||||
|
@ -204,23 +204,23 @@ cdef class LeftArc:
|
|||
return cost + pop_cost(s, gold, s.S(0)) + arc_cost(s, gold, s.B(0), s.S(0))
|
||||
|
||||
@staticmethod
|
||||
cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, int label) nogil:
|
||||
cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
|
||||
return arc_is_gold(gold, s.B(0), s.S(0)) and not label_is_gold(gold, s.B(0), s.S(0), label)
|
||||
|
||||
|
||||
cdef class RightArc:
|
||||
@staticmethod
|
||||
cdef bint is_valid(const StateC* st, int label) nogil:
|
||||
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
||||
return not st.B_(0).sent_start
|
||||
|
||||
@staticmethod
|
||||
cdef int transition(StateC* st, int label) nogil:
|
||||
cdef int transition(StateC* st, attr_t label) nogil:
|
||||
st.add_arc(st.S(0), st.B(0), label)
|
||||
st.push()
|
||||
st.fast_forward()
|
||||
|
||||
@staticmethod
|
||||
cdef inline weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
|
||||
cdef inline weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
|
||||
return RightArc.move_cost(s, gold) + RightArc.label_cost(s, gold, label)
|
||||
|
||||
@staticmethod
|
||||
|
@ -233,13 +233,13 @@ cdef class RightArc:
|
|||
return push_cost(s, gold, s.B(0)) + arc_cost(s, gold, s.S(0), s.B(0))
|
||||
|
||||
@staticmethod
|
||||
cdef weight_t label_cost(StateClass s, const GoldParseC* gold, int label) nogil:
|
||||
cdef weight_t label_cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
|
||||
return arc_is_gold(gold, s.S(0), s.B(0)) and not label_is_gold(gold, s.S(0), s.B(0), label)
|
||||
|
||||
|
||||
cdef class Break:
|
||||
@staticmethod
|
||||
cdef bint is_valid(const StateC* st, int label) nogil:
|
||||
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
||||
cdef int i
|
||||
if not USE_BREAK:
|
||||
return False
|
||||
|
@ -251,12 +251,12 @@ cdef class Break:
|
|||
return True
|
||||
|
||||
@staticmethod
|
||||
cdef int transition(StateC* st, int label) nogil:
|
||||
cdef int transition(StateC* st, attr_t label) nogil:
|
||||
st.set_break(st.B_(0).l_edge)
|
||||
st.fast_forward()
|
||||
|
||||
@staticmethod
|
||||
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
|
||||
cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
|
||||
return Break.move_cost(s, gold) + Break.label_cost(s, gold, label)
|
||||
|
||||
@staticmethod
|
||||
|
@ -281,7 +281,7 @@ cdef class Break:
|
|||
return cost + 1
|
||||
|
||||
@staticmethod
|
||||
cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, int label) nogil:
|
||||
cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
|
||||
return 0
|
||||
|
||||
cdef int _get_root(int word, const GoldParseC* gold) nogil:
|
||||
|
@ -295,9 +295,7 @@ cdef int _get_root(int word, const GoldParseC* gold) nogil:
|
|||
|
||||
cdef void* _init_state(Pool mem, int length, void* tokens) except NULL:
|
||||
cdef StateClass st = StateClass.init(<const TokenC*>tokens, length)
|
||||
# Ensure sent_start is set to 0 throughout
|
||||
for i in range(st.c.length):
|
||||
st.c._sent[i].sent_start = False
|
||||
st.c._sent[i].l_edge = i
|
||||
st.c._sent[i].r_edge = i
|
||||
st.fast_forward()
|
||||
|
@ -371,7 +369,7 @@ cdef class ArcEager(TransitionSystem):
|
|||
if label.upper() == 'ROOT':
|
||||
label = 'ROOT'
|
||||
gold.c.heads[i] = gold.heads[i]
|
||||
gold.c.labels[i] = self.strings[label]
|
||||
gold.c.labels[i] = self.strings.add(label)
|
||||
return gold
|
||||
|
||||
cdef Transition lookup_transition(self, object name) except *:
|
||||
|
@ -386,14 +384,14 @@ cdef class ArcEager(TransitionSystem):
|
|||
if self.c[i].move == move and self.c[i].label == label:
|
||||
return self.c[i]
|
||||
|
||||
def move_name(self, int move, int label):
|
||||
def move_name(self, int move, attr_t label):
|
||||
label_str = self.strings[label]
|
||||
if label_str:
|
||||
return MOVE_NAMES[move] + '-' + label_str
|
||||
else:
|
||||
return MOVE_NAMES[move]
|
||||
|
||||
cdef Transition init_transition(self, int clas, int move, int label) except *:
|
||||
cdef Transition init_transition(self, int clas, int move, attr_t label) except *:
|
||||
# TODO: Apparent Cython bug here when we try to use the Transition()
|
||||
# constructor with the function pointers
|
||||
cdef Transition t
|
||||
|
@ -426,9 +424,7 @@ cdef class ArcEager(TransitionSystem):
|
|||
return t
|
||||
|
||||
cdef int initialize_state(self, StateC* st) nogil:
|
||||
# Ensure sent_start is set to 0 throughout
|
||||
for i in range(st.length):
|
||||
st._sent[i].sent_start = False
|
||||
st._sent[i].l_edge = i
|
||||
st._sent[i].r_edge = i
|
||||
st.fast_forward()
|
||||
|
@ -473,7 +469,7 @@ cdef class ArcEager(TransitionSystem):
|
|||
label_cost_funcs[RIGHT] = RightArc.label_cost
|
||||
label_cost_funcs[BREAK] = Break.label_cost
|
||||
|
||||
cdef int* labels = gold.c.labels
|
||||
cdef attr_t* labels = gold.c.labels
|
||||
cdef int* heads = gold.c.heads
|
||||
|
||||
n_gold = 0
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
from .transition_system cimport TransitionSystem
|
||||
from .transition_system cimport Transition
|
||||
from ..gold cimport GoldParseC
|
||||
from ..typedefs cimport attr_t
|
||||
|
||||
|
||||
cdef class BiluoPushDown(TransitionSystem):
|
||||
|
|
|
@ -100,7 +100,7 @@ cdef class BiluoPushDown(TransitionSystem):
|
|||
def __get__(self):
|
||||
return (BEGIN, IN, LAST, UNIT, OUT)
|
||||
|
||||
def move_name(self, int move, int label):
|
||||
def move_name(self, int move, attr_t label):
|
||||
if move == OUT:
|
||||
return 'O'
|
||||
elif move == MISSING:
|
||||
|
@ -132,7 +132,7 @@ cdef class BiluoPushDown(TransitionSystem):
|
|||
if label_str.startswith('!'):
|
||||
label_str = label_str[1:]
|
||||
move_str = 'x'
|
||||
label = self.strings[label_str]
|
||||
label = self.strings.add(label_str)
|
||||
else:
|
||||
move_str = name
|
||||
label = 0
|
||||
|
@ -145,7 +145,7 @@ cdef class BiluoPushDown(TransitionSystem):
|
|||
else:
|
||||
raise KeyError(name)
|
||||
|
||||
cdef Transition init_transition(self, int clas, int move, int label) except *:
|
||||
cdef Transition init_transition(self, int clas, int move, attr_t label) except *:
|
||||
# TODO: Apparent Cython bug here when we try to use the Transition()
|
||||
# constructor with the function pointers
|
||||
cdef Transition t
|
||||
|
@ -194,21 +194,21 @@ cdef class BiluoPushDown(TransitionSystem):
|
|||
|
||||
cdef class Missing:
|
||||
@staticmethod
|
||||
cdef bint is_valid(const StateC* st, int label) nogil:
|
||||
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
||||
return False
|
||||
|
||||
@staticmethod
|
||||
cdef int transition(StateC* s, int label) nogil:
|
||||
cdef int transition(StateC* s, attr_t label) nogil:
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
|
||||
cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
|
||||
return 9000
|
||||
|
||||
|
||||
cdef class Begin:
|
||||
@staticmethod
|
||||
cdef bint is_valid(const StateC* st, int label) nogil:
|
||||
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
||||
# Ensure we don't clobber preset entities. If no entity preset,
|
||||
# ent_iob is 0
|
||||
cdef int preset_ent_iob = st.B_(0).ent_iob
|
||||
|
@ -232,14 +232,14 @@ cdef class Begin:
|
|||
return label != 0 and not st.entity_is_open()
|
||||
|
||||
@staticmethod
|
||||
cdef int transition(StateC* st, int label) nogil:
|
||||
cdef int transition(StateC* st, attr_t label) nogil:
|
||||
st.open_ent(label)
|
||||
st.set_ent_tag(st.B(0), 3, label)
|
||||
st.push()
|
||||
st.pop()
|
||||
|
||||
@staticmethod
|
||||
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
|
||||
cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
|
||||
cdef int g_act = gold.ner[s.B(0)].move
|
||||
cdef int g_tag = gold.ner[s.B(0)].label
|
||||
|
||||
|
@ -261,7 +261,7 @@ cdef class Begin:
|
|||
|
||||
cdef class In:
|
||||
@staticmethod
|
||||
cdef bint is_valid(const StateC* st, int label) nogil:
|
||||
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
||||
cdef int preset_ent_iob = st.B_(0).ent_iob
|
||||
if preset_ent_iob == 2:
|
||||
return False
|
||||
|
@ -277,17 +277,17 @@ cdef class In:
|
|||
return st.entity_is_open() and label != 0 and st.E_(0).ent_type == label
|
||||
|
||||
@staticmethod
|
||||
cdef int transition(StateC* st, int label) nogil:
|
||||
cdef int transition(StateC* st, attr_t label) nogil:
|
||||
st.set_ent_tag(st.B(0), 1, label)
|
||||
st.push()
|
||||
st.pop()
|
||||
|
||||
@staticmethod
|
||||
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
|
||||
cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
|
||||
move = IN
|
||||
cdef int next_act = gold.ner[s.B(1)].move if s.B(0) < s.c.length else OUT
|
||||
cdef int g_act = gold.ner[s.B(0)].move
|
||||
cdef int g_tag = gold.ner[s.B(0)].label
|
||||
cdef attr_t g_tag = gold.ner[s.B(0)].label
|
||||
cdef bint is_sunk = _entity_is_sunk(s, gold.ner)
|
||||
|
||||
if g_act == MISSING:
|
||||
|
@ -313,24 +313,24 @@ cdef class In:
|
|||
|
||||
cdef class Last:
|
||||
@staticmethod
|
||||
cdef bint is_valid(const StateC* st, int label) nogil:
|
||||
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
||||
if st.B_(1).ent_iob == 1:
|
||||
return False
|
||||
return st.entity_is_open() and label != 0 and st.E_(0).ent_type == label
|
||||
|
||||
@staticmethod
|
||||
cdef int transition(StateC* st, int label) nogil:
|
||||
cdef int transition(StateC* st, attr_t label) nogil:
|
||||
st.close_ent()
|
||||
st.set_ent_tag(st.B(0), 1, label)
|
||||
st.push()
|
||||
st.pop()
|
||||
|
||||
@staticmethod
|
||||
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
|
||||
cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
|
||||
move = LAST
|
||||
|
||||
cdef int g_act = gold.ner[s.B(0)].move
|
||||
cdef int g_tag = gold.ner[s.B(0)].label
|
||||
cdef attr_t g_tag = gold.ner[s.B(0)].label
|
||||
|
||||
if g_act == MISSING:
|
||||
return 0
|
||||
|
@ -355,7 +355,7 @@ cdef class Last:
|
|||
|
||||
cdef class Unit:
|
||||
@staticmethod
|
||||
cdef bint is_valid(const StateC* st, int label) nogil:
|
||||
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
||||
cdef int preset_ent_iob = st.B_(0).ent_iob
|
||||
if preset_ent_iob == 2:
|
||||
return False
|
||||
|
@ -368,7 +368,7 @@ cdef class Unit:
|
|||
return label != 0 and not st.entity_is_open()
|
||||
|
||||
@staticmethod
|
||||
cdef int transition(StateC* st, int label) nogil:
|
||||
cdef int transition(StateC* st, attr_t label) nogil:
|
||||
st.open_ent(label)
|
||||
st.close_ent()
|
||||
st.set_ent_tag(st.B(0), 3, label)
|
||||
|
@ -376,9 +376,9 @@ cdef class Unit:
|
|||
st.pop()
|
||||
|
||||
@staticmethod
|
||||
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
|
||||
cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
|
||||
cdef int g_act = gold.ner[s.B(0)].move
|
||||
cdef int g_tag = gold.ner[s.B(0)].label
|
||||
cdef attr_t g_tag = gold.ner[s.B(0)].label
|
||||
|
||||
if g_act == MISSING:
|
||||
return 0
|
||||
|
@ -398,7 +398,7 @@ cdef class Unit:
|
|||
|
||||
cdef class Out:
|
||||
@staticmethod
|
||||
cdef bint is_valid(const StateC* st, int label) nogil:
|
||||
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
||||
cdef int preset_ent_iob = st.B_(0).ent_iob
|
||||
if preset_ent_iob == 3:
|
||||
return False
|
||||
|
@ -407,15 +407,15 @@ cdef class Out:
|
|||
return not st.entity_is_open()
|
||||
|
||||
@staticmethod
|
||||
cdef int transition(StateC* st, int label) nogil:
|
||||
cdef int transition(StateC* st, attr_t label) nogil:
|
||||
st.set_ent_tag(st.B(0), 2, 0)
|
||||
st.push()
|
||||
st.pop()
|
||||
|
||||
@staticmethod
|
||||
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
|
||||
cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
|
||||
cdef int g_act = gold.ner[s.B(0)].move
|
||||
cdef int g_tag = gold.ner[s.B(0)].label
|
||||
cdef attr_t g_tag = gold.ner[s.B(0)].label
|
||||
|
||||
if g_act == MISSING or g_act == ISNT:
|
||||
return 0
|
||||
|
|
|
@ -428,7 +428,7 @@ cdef class Parser:
|
|||
|
||||
cuda_stream = get_cuda_stream()
|
||||
|
||||
states, golds, max_length = self._init_gold_batch(docs, golds)
|
||||
states, golds, max_steps = self._init_gold_batch(docs, golds)
|
||||
state2vec, vec2scores = self.get_batch_model(len(states), tokvecs, cuda_stream,
|
||||
0.0)
|
||||
todo = [(s, g) for (s, g) in zip(states, golds)
|
||||
|
@ -439,6 +439,7 @@ cdef class Parser:
|
|||
backprops = []
|
||||
d_tokvecs = state2vec.ops.allocate(tokvecs.shape)
|
||||
cdef float loss = 0.
|
||||
n_steps = 0
|
||||
while todo:
|
||||
states, golds = zip(*todo)
|
||||
|
||||
|
@ -450,7 +451,7 @@ cdef class Parser:
|
|||
scores, bp_scores = vec2scores.begin_update(vector, drop=drop)
|
||||
|
||||
d_scores = self.get_batch_loss(states, golds, scores)
|
||||
d_vector = bp_scores(d_scores, sgd=sgd)
|
||||
d_vector = bp_scores(d_scores / d_scores.shape[0], sgd=sgd)
|
||||
if drop != 0:
|
||||
d_vector *= mask
|
||||
|
||||
|
@ -468,7 +469,8 @@ cdef class Parser:
|
|||
todo = [st for st in todo if not st[0].is_final()]
|
||||
if losses is not None:
|
||||
losses[self.name] += (d_scores**2).sum()
|
||||
if len(backprops) >= (max_length * 2):
|
||||
n_steps += 1
|
||||
if n_steps >= max_steps:
|
||||
break
|
||||
self._make_updates(d_tokvecs,
|
||||
backprops, sgd, cuda_stream)
|
||||
|
@ -483,7 +485,8 @@ cdef class Parser:
|
|||
StateClass state
|
||||
Transition action
|
||||
whole_states = self.moves.init_batch(whole_docs)
|
||||
max_length = max(5, min(20, min([len(doc) for doc in whole_docs])))
|
||||
max_length = max(5, min(50, min([len(doc) for doc in whole_docs])))
|
||||
max_moves = 0
|
||||
states = []
|
||||
golds = []
|
||||
for doc, state, gold in zip(whole_docs, whole_states, whole_golds):
|
||||
|
@ -494,16 +497,20 @@ cdef class Parser:
|
|||
start = 0
|
||||
while start < len(doc):
|
||||
state = state.copy()
|
||||
n_moves = 0
|
||||
while state.B(0) < start and not state.is_final():
|
||||
action = self.moves.c[oracle_actions.pop(0)]
|
||||
action.do(state.c, action.label)
|
||||
n_moves += 1
|
||||
has_gold = self.moves.has_gold(gold, start=start,
|
||||
end=start+max_length)
|
||||
if not state.is_final() and has_gold:
|
||||
states.append(state)
|
||||
golds.append(gold)
|
||||
max_moves = max(max_moves, n_moves)
|
||||
start += min(max_length, len(doc)-start)
|
||||
return states, golds, max_length
|
||||
max_moves = max(max_moves, len(oracle_actions))
|
||||
return states, golds, max_moves
|
||||
|
||||
def _make_updates(self, d_tokvecs, backprops, sgd, cuda_stream=None):
|
||||
# Tells CUDA to block, so our async copies complete.
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
from cymem.cymem cimport Pool
|
||||
from thinc.typedefs cimport weight_t
|
||||
|
||||
from ..typedefs cimport attr_t
|
||||
from ..structs cimport TokenC
|
||||
from ..gold cimport GoldParse
|
||||
from ..gold cimport GoldParseC
|
||||
|
@ -13,20 +14,22 @@ from ._state cimport StateC
|
|||
cdef struct Transition:
|
||||
int clas
|
||||
int move
|
||||
int label
|
||||
attr_t label
|
||||
|
||||
weight_t score
|
||||
|
||||
bint (*is_valid)(const StateC* state, int label) nogil
|
||||
weight_t (*get_cost)(StateClass state, const GoldParseC* gold, int label) nogil
|
||||
int (*do)(StateC* state, int label) nogil
|
||||
bint (*is_valid)(const StateC* state, attr_t label) nogil
|
||||
weight_t (*get_cost)(StateClass state, const GoldParseC* gold, attr_t label) nogil
|
||||
int (*do)(StateC* state, attr_t label) nogil
|
||||
|
||||
|
||||
ctypedef weight_t (*get_cost_func_t)(StateClass state, const GoldParseC* gold, int label) nogil
|
||||
ctypedef weight_t (*get_cost_func_t)(StateClass state, const GoldParseC* gold,
|
||||
attr_tlabel) nogil
|
||||
ctypedef weight_t (*move_cost_func_t)(StateClass state, const GoldParseC* gold) nogil
|
||||
ctypedef weight_t (*label_cost_func_t)(StateClass state, const GoldParseC* gold, int label) nogil
|
||||
ctypedef weight_t (*label_cost_func_t)(StateClass state, const GoldParseC*
|
||||
gold, attr_t label) nogil
|
||||
|
||||
ctypedef int (*do_func_t)(StateC* state, int label) nogil
|
||||
ctypedef int (*do_func_t)(StateC* state, attr_t label) nogil
|
||||
|
||||
ctypedef void* (*init_state_t)(Pool mem, int length, void* tokens) except NULL
|
||||
|
||||
|
@ -36,7 +39,7 @@ cdef class TransitionSystem:
|
|||
cdef Transition* c
|
||||
cdef readonly int n_moves
|
||||
cdef int _size
|
||||
cdef public int root_label
|
||||
cdef public attr_t root_label
|
||||
cdef public freqs
|
||||
cdef init_state_t init_beam_state
|
||||
|
||||
|
@ -45,7 +48,7 @@ cdef class TransitionSystem:
|
|||
|
||||
cdef Transition lookup_transition(self, object name) except *
|
||||
|
||||
cdef Transition init_transition(self, int clas, int move, int label) except *
|
||||
cdef Transition init_transition(self, int clas, int move, attr_t label) except *
|
||||
|
||||
cdef int set_valid(self, int* output, const StateC* st) nogil
|
||||
|
||||
|
|
|
@ -99,7 +99,7 @@ cdef class TransitionSystem:
|
|||
cdef Transition lookup_transition(self, object name) except *:
|
||||
raise NotImplementedError
|
||||
|
||||
cdef Transition init_transition(self, int clas, int move, int label) except *:
|
||||
cdef Transition init_transition(self, int clas, int move, attr_t label) except *:
|
||||
raise NotImplementedError
|
||||
|
||||
def is_valid(self, StateClass stcls, move_name):
|
||||
|
|
|
@ -204,6 +204,7 @@ def test_doc_api_right_edge(en_tokenizer):
|
|||
assert doc[6].right_edge.text == ','
|
||||
|
||||
|
||||
@pytest.mark.xfail
|
||||
@pytest.mark.parametrize('text,vectors', [
|
||||
("apple orange pear", ["apple -1 -1 -1", "orange -1 -1 0", "pear -1 0 -1"])
|
||||
])
|
||||
|
|
|
@ -20,7 +20,7 @@ def test_doc_noun_chunks_not_nested(en_tokenizer):
|
|||
tokens.from_array(
|
||||
[HEAD, DEP],
|
||||
numpy.asarray([[1, nsubj], [0, root], [4, amod], [3, nmod], [-1, cc],
|
||||
[-2, conj], [-5, dobj]], dtype='int32'))
|
||||
[-2, conj], [-5, dobj]], dtype='uint64'))
|
||||
tokens.noun_chunks_iterator = english_noun_chunks
|
||||
word_occurred = {}
|
||||
for chunk in tokens.noun_chunks:
|
||||
|
|
|
@ -68,6 +68,7 @@ def test_doc_token_api_is_properties(en_vocab):
|
|||
assert doc[5].like_email
|
||||
|
||||
|
||||
@pytest.mark.xfail
|
||||
@pytest.mark.parametrize('text,vectors', [
|
||||
("apples oranges ldskbjls", ["apples -1 -1 -1", "oranges -1 -1 0"])
|
||||
])
|
||||
|
|
|
@ -15,7 +15,9 @@ def test_issue615(en_tokenizer):
|
|||
# Get Span objects
|
||||
spans = [(ent_id, ent_id, doc[start : end]) for ent_id, start, end in matches]
|
||||
for ent_id, label, span in spans:
|
||||
span.merge('NNP' if label else span.root.tag_, span.text, doc.vocab.strings[label])
|
||||
span.merge(tag='NNP' if label else span.root.tag_, lemma=span.text,
|
||||
label=label)
|
||||
doc.ents = doc.ents + ((label, span.start, span.end),)
|
||||
|
||||
text = "The golf club is broken"
|
||||
pattern = [{'ORTH': "golf"}, {'ORTH': "club"}]
|
||||
|
@ -25,6 +27,7 @@ def test_issue615(en_tokenizer):
|
|||
matcher = Matcher(doc.vocab)
|
||||
matcher.add(label, merge_phrases, pattern)
|
||||
match = matcher(doc)
|
||||
print(match)
|
||||
entities = list(doc.ents)
|
||||
|
||||
assert entities != [] #assertion 1
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
import pytest
|
||||
|
||||
|
||||
word2vec_str = """, -0.046107 -0.035951 -0.560418
|
||||
|
@ -8,6 +9,7 @@ de -0.648927 -0.400976 -0.527124
|
|||
\u00A0 -1.499184 -0.184280 -0.598371"""
|
||||
|
||||
|
||||
@pytest.mark.xfail
|
||||
def test_issue834(en_vocab, text_file):
|
||||
"""Test that no-break space (U+00A0) is detected as space by the load_vectors function."""
|
||||
text_file.write(word2vec_str)
|
||||
|
|
|
@ -7,6 +7,7 @@ from __future__ import unicode_literals
|
|||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.xfail
|
||||
@pytest.mark.parametrize('text', [["a", "b", "c"]])
|
||||
def test_stringstore_freeze_oov(stringstore, text):
|
||||
assert stringstore[text[0]] == 1
|
||||
|
|
|
@ -8,69 +8,65 @@ import pytest
|
|||
|
||||
@pytest.mark.parametrize('text1,text2,text3', [(b'Hello', b'goodbye', b'hello')])
|
||||
def test_stringstore_save_bytes(stringstore, text1, text2, text3):
|
||||
i = stringstore[text1]
|
||||
assert i == 1
|
||||
assert stringstore[text1] == 1
|
||||
assert stringstore[text2] != i
|
||||
assert stringstore[text3] != i
|
||||
assert i == 1
|
||||
key = stringstore.add(text1)
|
||||
assert stringstore[text1] == key
|
||||
assert stringstore[text2] != key
|
||||
assert stringstore[text3] != key
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text1,text2,text3', [('Hello', 'goodbye', 'hello')])
|
||||
def test_stringstore_save_unicode(stringstore, text1, text2, text3):
|
||||
i = stringstore[text1]
|
||||
assert i == 1
|
||||
assert stringstore[text1] == 1
|
||||
assert stringstore[text2] != i
|
||||
assert stringstore[text3] != i
|
||||
assert i == 1
|
||||
key = stringstore.add(text1)
|
||||
assert stringstore[text1] == key
|
||||
assert stringstore[text2] != key
|
||||
assert stringstore[text3] != key
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', [b'A'])
|
||||
def test_stringstore_retrieve_id(stringstore, text):
|
||||
i = stringstore[text]
|
||||
assert stringstore.size == 1
|
||||
assert stringstore[1] == text.decode('utf8')
|
||||
with pytest.raises(IndexError):
|
||||
stringstore[2]
|
||||
key = stringstore.add(text)
|
||||
assert len(stringstore) == 1
|
||||
assert stringstore[key] == text.decode('utf8')
|
||||
with pytest.raises(KeyError):
|
||||
stringstore[20000]
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text1,text2', [(b'0123456789', b'A')])
|
||||
def test_stringstore_med_string(stringstore, text1, text2):
|
||||
store = stringstore[text1]
|
||||
store = stringstore.add(text1)
|
||||
assert stringstore[store] == text1.decode('utf8')
|
||||
dummy = stringstore[text2]
|
||||
dummy = stringstore.add(text2)
|
||||
assert stringstore[text1] == store
|
||||
|
||||
|
||||
def test_stringstore_long_string(stringstore):
|
||||
text = "INFORMATIVE](http://www.google.com/search?as_q=RedditMonkey&hl=en&num=50&btnG=Google+Search&as_epq=&as_oq=&as_eq=&lr=&as_ft=i&as_filetype=&as_qdr=all&as_nlo=&as_nhi=&as_occt=any&as_dt=i&as_sitesearch=&as_rights=&safe=off"
|
||||
store = stringstore[text]
|
||||
store = stringstore.add(text)
|
||||
assert stringstore[store] == text
|
||||
|
||||
|
||||
@pytest.mark.parametrize('factor', [254, 255, 256])
|
||||
def test_stringstore_multiply(stringstore, factor):
|
||||
text = 'a' * factor
|
||||
store = stringstore[text]
|
||||
store = stringstore.add(text)
|
||||
assert stringstore[store] == text
|
||||
|
||||
|
||||
def test_stringstore_massive_strings(stringstore):
|
||||
text = 'a' * 511
|
||||
store = stringstore[text]
|
||||
store = stringstore.add(text)
|
||||
assert stringstore[store] == text
|
||||
text2 = 'z' * 512
|
||||
store = stringstore[text2]
|
||||
store = stringstore.add(text2)
|
||||
assert stringstore[store] == text2
|
||||
text3 = '1' * 513
|
||||
store = stringstore[text3]
|
||||
store = stringstore.add(text3)
|
||||
assert stringstore[store] == text3
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["qqqqq"])
|
||||
def test_stringstore_to_bytes(stringstore, text):
|
||||
store = stringstore[text]
|
||||
store = stringstore.add(text)
|
||||
serialized = stringstore.to_bytes()
|
||||
new_stringstore = StringStore().from_bytes(serialized)
|
||||
assert new_stringstore[store] == text
|
||||
|
|
|
@ -10,8 +10,11 @@ import numpy
|
|||
def get_doc(vocab, words=[], pos=None, heads=None, deps=None, tags=None, ents=None):
|
||||
"""Create Doc object from given vocab, words and annotations."""
|
||||
pos = pos or [''] * len(words)
|
||||
tags = tags or [''] * len(words)
|
||||
heads = heads or [0] * len(words)
|
||||
deps = deps or [''] * len(words)
|
||||
for value in (deps+tags+pos):
|
||||
vocab.strings.add(value)
|
||||
|
||||
doc = Doc(vocab, words=words)
|
||||
attrs = doc.to_array([POS, HEAD, DEP])
|
||||
|
|
|
@ -16,7 +16,7 @@ def vectors():
|
|||
def vocab(en_vocab, vectors):
|
||||
return add_vecs_to_vocab(en_vocab, vectors)
|
||||
|
||||
|
||||
@pytest.mark.xfail
|
||||
def test_vectors_similarity_LL(vocab, vectors):
|
||||
[(word1, vec1), (word2, vec2)] = vectors
|
||||
lex1 = vocab[word1]
|
||||
|
@ -30,6 +30,7 @@ def test_vectors_similarity_LL(vocab, vectors):
|
|||
assert numpy.isclose(lex2.similarity(lex2), lex1.similarity(lex1))
|
||||
|
||||
|
||||
@pytest.mark.xfail
|
||||
def test_vectors_similarity_TT(vocab, vectors):
|
||||
[(word1, vec1), (word2, vec2)] = vectors
|
||||
doc = get_doc(vocab, words=[word1, word2])
|
||||
|
@ -42,18 +43,21 @@ def test_vectors_similarity_TT(vocab, vectors):
|
|||
assert numpy.isclose(doc[1].similarity(doc[0]), doc[0].similarity(doc[1]))
|
||||
|
||||
|
||||
@pytest.mark.xfail
|
||||
def test_vectors_similarity_TD(vocab, vectors):
|
||||
[(word1, vec1), (word2, vec2)] = vectors
|
||||
doc = get_doc(vocab, words=[word1, word2])
|
||||
assert doc.similarity(doc[0]) == doc[0].similarity(doc)
|
||||
|
||||
|
||||
@pytest.mark.xfail
|
||||
def test_vectors_similarity_DS(vocab, vectors):
|
||||
[(word1, vec1), (word2, vec2)] = vectors
|
||||
doc = get_doc(vocab, words=[word1, word2])
|
||||
assert doc.similarity(doc[:2]) == doc[:2].similarity(doc)
|
||||
|
||||
|
||||
@pytest.mark.xfail
|
||||
def test_vectors_similarity_TS(vocab, vectors):
|
||||
[(word1, vec1), (word2, vec2)] = vectors
|
||||
doc = get_doc(vocab, words=[word1, word2])
|
||||
|
|
|
@ -22,6 +22,7 @@ def tokenizer_v(vocab):
|
|||
return Tokenizer(vocab, {}, None, None, None)
|
||||
|
||||
|
||||
@pytest.mark.xfail
|
||||
@pytest.mark.parametrize('text', ["apple and orange"])
|
||||
def test_vectors_token_vector(tokenizer_v, vectors, text):
|
||||
doc = tokenizer_v(text)
|
||||
|
@ -29,6 +30,7 @@ def test_vectors_token_vector(tokenizer_v, vectors, text):
|
|||
assert vectors[1] == (doc[2].text, list(doc[2].vector))
|
||||
|
||||
|
||||
@pytest.mark.xfail
|
||||
@pytest.mark.parametrize('text', ["apple", "orange"])
|
||||
def test_vectors_lexeme_vector(vocab, text):
|
||||
lex = vocab[text]
|
||||
|
@ -36,6 +38,7 @@ def test_vectors_lexeme_vector(vocab, text):
|
|||
assert lex.vector_norm
|
||||
|
||||
|
||||
@pytest.mark.xfail
|
||||
@pytest.mark.parametrize('text', [["apple", "and", "orange"]])
|
||||
def test_vectors_doc_vector(vocab, text):
|
||||
doc = get_doc(vocab, text)
|
||||
|
@ -43,6 +46,7 @@ def test_vectors_doc_vector(vocab, text):
|
|||
assert doc.vector_norm
|
||||
|
||||
|
||||
@pytest.mark.xfail
|
||||
@pytest.mark.parametrize('text', [["apple", "and", "orange"]])
|
||||
def test_vectors_span_vector(vocab, text):
|
||||
span = get_doc(vocab, text)[0:2]
|
||||
|
@ -50,6 +54,7 @@ def test_vectors_span_vector(vocab, text):
|
|||
assert span.vector_norm
|
||||
|
||||
|
||||
@pytest.mark.xfail
|
||||
@pytest.mark.parametrize('text', ["apple orange"])
|
||||
def test_vectors_token_token_similarity(tokenizer_v, text):
|
||||
doc = tokenizer_v(text)
|
||||
|
@ -57,6 +62,7 @@ def test_vectors_token_token_similarity(tokenizer_v, text):
|
|||
assert 0.0 < doc[0].similarity(doc[1]) < 1.0
|
||||
|
||||
|
||||
@pytest.mark.xfail
|
||||
@pytest.mark.parametrize('text1,text2', [("apple", "orange")])
|
||||
def test_vectors_token_lexeme_similarity(tokenizer_v, vocab, text1, text2):
|
||||
token = tokenizer_v(text1)
|
||||
|
@ -65,6 +71,7 @@ def test_vectors_token_lexeme_similarity(tokenizer_v, vocab, text1, text2):
|
|||
assert 0.0 < token.similarity(lex) < 1.0
|
||||
|
||||
|
||||
@pytest.mark.xfail
|
||||
@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
|
||||
def test_vectors_token_span_similarity(vocab, text):
|
||||
doc = get_doc(vocab, text)
|
||||
|
@ -72,6 +79,7 @@ def test_vectors_token_span_similarity(vocab, text):
|
|||
assert 0.0 < doc[0].similarity(doc[1:3]) < 1.0
|
||||
|
||||
|
||||
@pytest.mark.xfail
|
||||
@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
|
||||
def test_vectors_token_doc_similarity(vocab, text):
|
||||
doc = get_doc(vocab, text)
|
||||
|
@ -79,6 +87,7 @@ def test_vectors_token_doc_similarity(vocab, text):
|
|||
assert 0.0 < doc[0].similarity(doc) < 1.0
|
||||
|
||||
|
||||
@pytest.mark.xfail
|
||||
@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
|
||||
def test_vectors_lexeme_span_similarity(vocab, text):
|
||||
doc = get_doc(vocab, text)
|
||||
|
@ -87,6 +96,7 @@ def test_vectors_lexeme_span_similarity(vocab, text):
|
|||
assert 0.0 < doc.similarity(doc[1:3]) < 1.0
|
||||
|
||||
|
||||
@pytest.mark.xfail
|
||||
@pytest.mark.parametrize('text1,text2', [("apple", "orange")])
|
||||
def test_vectors_lexeme_lexeme_similarity(vocab, text1, text2):
|
||||
lex1 = vocab[text1]
|
||||
|
@ -95,6 +105,7 @@ def test_vectors_lexeme_lexeme_similarity(vocab, text1, text2):
|
|||
assert 0.0 < lex1.similarity(lex2) < 1.0
|
||||
|
||||
|
||||
@pytest.mark.xfail
|
||||
@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
|
||||
def test_vectors_lexeme_doc_similarity(vocab, text):
|
||||
doc = get_doc(vocab, text)
|
||||
|
@ -103,6 +114,7 @@ def test_vectors_lexeme_doc_similarity(vocab, text):
|
|||
assert 0.0 < lex.similarity(doc) < 1.0
|
||||
|
||||
|
||||
@pytest.mark.xfail
|
||||
@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
|
||||
def test_vectors_span_span_similarity(vocab, text):
|
||||
doc = get_doc(vocab, text)
|
||||
|
@ -110,6 +122,7 @@ def test_vectors_span_span_similarity(vocab, text):
|
|||
assert 0.0 < doc[0:2].similarity(doc[1:3]) < 1.0
|
||||
|
||||
|
||||
@pytest.mark.xfail
|
||||
@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
|
||||
def test_vectors_span_doc_similarity(vocab, text):
|
||||
doc = get_doc(vocab, text)
|
||||
|
@ -117,6 +130,7 @@ def test_vectors_span_doc_similarity(vocab, text):
|
|||
assert 0.0 < doc[0:2].similarity(doc) < 1.0
|
||||
|
||||
|
||||
@pytest.mark.xfail
|
||||
@pytest.mark.parametrize('text1,text2', [
|
||||
(["apple", "and", "apple", "pie"], ["orange", "juice"])])
|
||||
def test_vectors_doc_doc_similarity(vocab, text1, text2):
|
||||
|
|
|
@ -5,6 +5,7 @@ import numpy
|
|||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.xfail
|
||||
@pytest.mark.parametrize('text', ["Hello"])
|
||||
def test_vocab_add_vector(en_vocab, text):
|
||||
en_vocab.resize_vectors(10)
|
||||
|
|
|
@ -11,7 +11,6 @@ import struct
|
|||
import dill
|
||||
|
||||
from libc.string cimport memcpy, memset
|
||||
from libc.stdint cimport uint32_t
|
||||
from libc.math cimport sqrt
|
||||
|
||||
from .span cimport Span
|
||||
|
@ -21,6 +20,7 @@ from .token cimport Token
|
|||
from .printers import parse_tree
|
||||
from ..lexeme cimport Lexeme, EMPTY_LEXEME
|
||||
from ..typedefs cimport attr_t, flags_t
|
||||
from ..attrs import intify_attrs
|
||||
from ..attrs cimport attr_id_t
|
||||
from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
|
||||
from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE
|
||||
|
@ -494,8 +494,8 @@ cdef class Doc:
|
|||
cdef np.ndarray[attr_t, ndim=2] output
|
||||
# Make an array from the attributes --- otherwise our inner loop is Python
|
||||
# dict iteration.
|
||||
cdef np.ndarray[attr_t, ndim=1] attr_ids = numpy.asarray(py_attr_ids, dtype=numpy.int32)
|
||||
output = numpy.ndarray(shape=(self.length, len(attr_ids)), dtype=numpy.int32)
|
||||
cdef np.ndarray[attr_t, ndim=1] attr_ids = numpy.asarray(py_attr_ids, dtype=numpy.uint64)
|
||||
output = numpy.ndarray(shape=(self.length, len(attr_ids)), dtype=numpy.uint64)
|
||||
for i in range(self.length):
|
||||
for j, feature in enumerate(attr_ids):
|
||||
output[i, j] = get_token_attr(&self.c[i], feature)
|
||||
|
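With `attr_t` widened to an unsigned 64-bit integer, `Doc.to_array()` now fills and returns `uint64` arrays of hash IDs rather than `int32` values. A small usage sketch (assuming an `nlp` object and the hash-based string store):

    import numpy
    from spacy.attrs import ORTH, POS

    doc = nlp(u'Give it back')
    arr = doc.to_array([ORTH, POS])
    assert arr.dtype == numpy.uint64                     # hash IDs, not int32
    assert doc.vocab.strings[int(arr[0, 0])] == u'Give'  # IDs resolve via the store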
@ -640,7 +640,7 @@ cdef class Doc:
|
|||
"""
|
||||
if self.length != 0:
|
||||
raise ValueError("Cannot load into non-empty Doc")
|
||||
cdef int[:, :] attrs
|
||||
cdef attr_t[:, :] attrs
|
||||
cdef int i, start, end, has_space
|
||||
fields = dill.loads(data)
|
||||
text, attrs = fields[:2]
|
||||
|
@ -679,17 +679,15 @@ cdef class Doc:
|
|||
if len(args) == 3:
|
||||
# TODO: Warn deprecation
|
||||
tag, lemma, ent_type = args
|
||||
attributes[TAG] = self.vocab.strings[tag]
|
||||
attributes[LEMMA] = self.vocab.strings[lemma]
|
||||
attributes[ENT_TYPE] = self.vocab.strings[ent_type]
|
||||
attributes[TAG] = tag
|
||||
attributes[LEMMA] = lemma
|
||||
attributes[ENT_TYPE] = ent_type
|
||||
elif not args:
|
||||
# TODO: This code makes little sense overall. We're still
|
||||
# ignoring most of the attributes?
|
||||
if "label" in attributes and 'ent_type' not in attributes:
|
||||
if type(attributes["label"]) == int:
|
||||
attributes[ENT_TYPE] = attributes["label"]
|
||||
else:
|
||||
attributes[ENT_TYPE] = self.vocab.strings[attributes["label"]]
|
||||
attributes[ENT_TYPE] = self.vocab.strings.add(attributes["label"])
|
||||
if 'ent_type' in attributes:
|
||||
attributes[ENT_TYPE] = attributes['ent_type']
|
||||
elif args:
|
||||
|
@ -699,6 +697,12 @@ cdef class Doc:
|
|||
"Arguments supplied:\n%s\n"
|
||||
"Keyword arguments:%s\n" % (len(args), repr(args), repr(attributes)))
|
||||
|
||||
# More deprecated attribute handling =/
|
||||
if 'label' in attributes:
|
||||
attributes['ent_type'] = attributes.pop('label')
|
||||
|
||||
attributes = intify_attrs(attributes, strings_map=self.vocab.strings)
|
||||
|
||||
cdef int start = token_by_start(self.c, self.length, start_idx)
|
||||
if start == -1:
|
||||
return None
|
||||
|
@ -708,13 +712,6 @@ cdef class Doc:
|
|||
# Currently we have the token index, we want the range-end index
|
||||
end += 1
|
||||
cdef Span span = self[start:end]
|
||||
tag = self.vocab.strings[attributes.get(TAG, span.root.tag)]
|
||||
lemma = self.vocab.strings[attributes.get(LEMMA, span.root.lemma)]
|
||||
ent_type = self.vocab.strings[attributes.get(ENT_TYPE, span.root.ent_type)]
|
||||
ent_id = attributes.get('ent_id', span.root.ent_id)
|
||||
if isinstance(ent_id, basestring):
|
||||
ent_id = self.vocab.strings[ent_id]
|
||||
|
||||
# Get LexemeC for newly merged token
|
||||
new_orth = ''.join([t.text_with_ws for t in span])
|
||||
if span[-1].whitespace_:
|
||||
|
@ -723,18 +720,11 @@ cdef class Doc:
|
|||
# House the new merged token where it starts
|
||||
cdef TokenC* token = &self.c[start]
|
||||
token.spacy = self.c[end-1].spacy
|
||||
if tag in self.vocab.morphology.tag_map:
|
||||
self.vocab.morphology.assign_tag(token, tag)
|
||||
for attr_name, attr_value in attributes.items():
|
||||
if attr_name == TAG:
|
||||
self.vocab.morphology.assign_tag(token, attr_value)
|
||||
else:
|
||||
token.tag = self.vocab.strings[tag]
|
||||
token.lemma = self.vocab.strings[lemma]
|
||||
if ent_type == 'O':
|
||||
token.ent_iob = 2
|
||||
token.ent_type = 0
|
||||
else:
|
||||
token.ent_iob = 3
|
||||
token.ent_type = self.vocab.strings[ent_type]
|
||||
token.ent_id = ent_id
|
||||
Token.set_struct_attr(token, attr_name, attr_value)
|
||||
# Begin by setting all the head indices to absolute token positions
|
||||
# This is easier to work with for now than the offsets
|
||||
# Before thinking of something simpler, beware the case where a dependency
|
||||
|
|
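After this change, `merge()` funnels every keyword attribute through `intify_attrs()` (which interns string values) and writes it with `Token.set_struct_attr()`, instead of handling tag, lemma and entity type by hand. A usage sketch with character offsets, assuming an `nlp` object:

    doc = nlp(u'I like New York in Autumn.')
    # "New York" covers characters 7-15; merge it into a single token
    doc.merge(7, 15, tag=u'NNP', lemma=u'New York', ent_type=u'GPE')
    assert doc[2].text == u'New York'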
|
@ -21,14 +21,14 @@ from .. import about
|
|||
|
||||
cdef class Span:
|
||||
"""A slice from a Doc object."""
|
||||
def __cinit__(self, Doc doc, int start, int end, int label=0, vector=None,
|
||||
def __cinit__(self, Doc doc, int start, int end, attr_t label=0, vector=None,
|
||||
vector_norm=None):
|
||||
"""Create a `Span` object from the slice `doc[start : end]`.
|
||||
|
||||
doc (Doc): The parent document.
|
||||
start (int): The index of the first token of the span.
|
||||
end (int): The index of the first token after the span.
|
||||
label (int): A label to attach to the Span, e.g. for named entities.
|
||||
label (uint64): A label to attach to the Span, e.g. for named entities.
|
||||
vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span.
|
||||
RETURNS (Span): The newly constructed object.
|
||||
"""
|
||||
|
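Because the label is now a 64-bit hash, a label string should be interned before constructing a `Span` directly. A sketch, assuming an `nlp` object:

    from spacy.tokens import Span

    doc = nlp(u'The final match was great')
    label_id = doc.vocab.strings.add(u'EVENT')
    span = Span(doc, 0, 2, label=label_id)
    assert span.label_ == u'EVENT'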
@ -377,7 +377,7 @@ cdef class Span:
|
|||
property ent_id:
|
||||
"""An (integer) entity ID. Usually assigned by patterns in the `Matcher`.
|
||||
|
||||
RETURNS (int): The entity ID.
|
||||
RETURNS (uint64): The entity ID.
|
||||
"""
|
||||
def __get__(self):
|
||||
return self.root.ent_id
|
||||
|
|
|
@ -202,11 +202,11 @@ cdef class Token:
|
|||
property lemma:
|
||||
"""Base form of the word, with no inflectional suffixes.
|
||||
|
||||
RETURNS (int): Token lemma.
|
||||
RETURNS (uint64): Token lemma.
|
||||
"""
|
||||
def __get__(self):
|
||||
return self.c.lemma
|
||||
def __set__(self, int lemma):
|
||||
def __set__(self, attr_t lemma):
|
||||
self.c.lemma = lemma
|
||||
|
||||
property pos:
|
||||
|
@ -216,13 +216,13 @@ cdef class Token:
|
|||
property tag:
|
||||
def __get__(self):
|
||||
return self.c.tag
|
||||
def __set__(self, int tag):
|
||||
def __set__(self, attr_t tag):
|
||||
self.vocab.morphology.assign_tag(self.c, tag)
|
||||
|
||||
property dep:
|
||||
def __get__(self):
|
||||
return self.c.dep
|
||||
def __set__(self, int label):
|
||||
def __set__(self, attr_t label):
|
||||
self.c.dep = label
|
||||
|
||||
property has_vector:
|
||||
|
@ -234,12 +234,7 @@ cdef class Token:
|
|||
def __get__(self):
|
||||
if 'has_vector' in self.doc.user_token_hooks:
|
||||
return self.doc.user_token_hooks['has_vector'](self)
|
||||
cdef int i
|
||||
for i in range(self.vocab.vectors_length):
|
||||
if self.c.lex.vector[i] != 0:
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
return self.vocab.has_vector(self.c.lex.orth)
|
||||
|
||||
property vector:
|
||||
"""A real-valued meaning representation.
|
||||
|
@ -250,16 +245,7 @@ cdef class Token:
|
|||
def __get__(self):
|
||||
if 'vector' in self.doc.user_token_hooks:
|
||||
return self.doc.user_token_hooks['vector'](self)
|
||||
cdef int length = self.vocab.vectors_length
|
||||
if length == 0:
|
||||
raise ValueError(
|
||||
"Word vectors set to length 0. This may be because you "
|
||||
"don't have a model installed or loaded, or because your "
|
||||
"model doesn't include word vectors. For more info, see "
|
||||
"the documentation: \n%s\n" % about.__docs_models__
|
||||
)
|
||||
vector_view = <float[:length,]>self.c.lex.vector
|
||||
return numpy.asarray(vector_view)
|
||||
return self.vocab.get_vector(self.c.lex.orth)
|
||||
|
||||
property vector_norm:
|
||||
"""The L2 norm of the token's vector representation.
|
||||
|
@ -269,7 +255,8 @@ cdef class Token:
|
|||
def __get__(self):
|
||||
if 'vector_norm' in self.doc.user_token_hooks:
|
||||
return self.doc.user_token_hooks['vector_norm'](self)
|
||||
return self.c.lex.l2_norm
|
||||
vector = self.vector
|
||||
return numpy.sqrt((vector ** 2).sum())
|
||||
|
||||
property n_lefts:
|
||||
def __get__(self):
|
||||
|
@ -516,16 +503,18 @@ cdef class Token:
|
|||
property ent_type:
|
||||
"""Named entity type.
|
||||
|
||||
RETURNS (int): Named entity type.
|
||||
RETURNS (uint64): Named entity type.
|
||||
"""
|
||||
def __get__(self):
|
||||
return self.c.ent_type
|
||||
def __set__(self, ent_type):
|
||||
self.c.ent_type = ent_type
|
||||
|
||||
property ent_iob:
|
||||
"""IOB code of named entity tag. `1="I", 2="O", 3="B"`. 0 means no tag
|
||||
is assigned.
|
||||
|
||||
RETURNS (int): IOB code of named entity tag.
|
||||
RETURNS (uint64): IOB code of named entity tag.
|
||||
"""
|
||||
def __get__(self):
|
||||
return self.c.ent_iob
|
||||
|
@ -537,6 +526,8 @@ cdef class Token:
|
|||
"""
|
||||
def __get__(self):
|
||||
return self.vocab.strings[self.c.ent_type]
|
||||
def __set__(self, ent_type):
|
||||
self.c.ent_type = self.vocab.strings.add(ent_type)
|
||||
|
||||
property ent_iob_:
|
||||
"""IOB code of named entity tag. "B" means the token begins an entity,
|
||||
|
@ -553,7 +544,7 @@ cdef class Token:
|
|||
"""ID of the entity the token is an instance of, if any. Usually
|
||||
assigned by patterns in the Matcher.
|
||||
|
||||
RETURNS (int): ID of the entity.
|
||||
RETURNS (uint64): ID of the entity.
|
||||
"""
|
||||
def __get__(self):
|
||||
return self.c.ent_id
|
||||
|
@ -571,7 +562,7 @@ cdef class Token:
|
|||
return self.vocab.strings[self.c.ent_id]
|
||||
|
||||
def __set__(self, name):
|
||||
self.c.ent_id = self.vocab.strings[name]
|
||||
self.c.ent_id = self.vocab.strings.add(name)
|
||||
|
||||
property whitespace_:
|
||||
def __get__(self):
|
||||
|
@ -613,7 +604,7 @@ cdef class Token:
|
|||
def __get__(self):
|
||||
return self.vocab.strings[self.c.lemma]
|
||||
def __set__(self, unicode lemma_):
|
||||
self.c.lemma = self.vocab.strings[lemma_]
|
||||
self.c.lemma = self.vocab.strings.add(lemma_)
|
||||
|
||||
property pos_:
|
||||
def __get__(self):
|
||||
|
@ -623,13 +614,13 @@ cdef class Token:
|
|||
def __get__(self):
|
||||
return self.vocab.strings[self.c.tag]
|
||||
def __set__(self, tag):
|
||||
self.tag = self.vocab.strings[tag]
|
||||
self.tag = self.vocab.strings.add(tag)
|
||||
|
||||
property dep_:
|
||||
def __get__(self):
|
||||
return self.vocab.strings[self.c.dep]
|
||||
def __set__(self, unicode label):
|
||||
self.c.dep = self.vocab.strings[label]
|
||||
self.c.dep = self.vocab.strings.add(label)
|
||||
|
||||
property is_oov:
|
||||
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_OOV)
|
||||
|
|
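The pattern across these `Token` properties is consistent: string-valued setters now intern through `StringStore.add()` so previously unseen labels no longer fail the lookup, and `vector_norm` is computed from the vector on the fly rather than read from a stored `l2_norm`. A short sketch of the resulting behaviour (assuming an `nlp` object):

    doc = nlp(u'coffee')
    token = doc[0]
    token.lemma_ = u'totally new lemma'        # interned via strings.add()
    assert doc.vocab.strings[token.lemma] == u'totally new lemma'
    token.ent_type_ = u'BEVERAGE'              # likewise for entity labels
    assert token.ent_type_ == u'BEVERAGE'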
|
@ -4,7 +4,7 @@ from libc.stdint cimport uint8_t
|
|||
|
||||
ctypedef uint64_t hash_t
|
||||
ctypedef char* utf8_t
|
||||
ctypedef int32_t attr_t
|
||||
ctypedef uint64_t attr_t
|
||||
ctypedef uint64_t flags_t
|
||||
ctypedef uint16_t len_t
|
||||
ctypedef uint16_t tag_t
|
||||
|
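Widening `attr_t` from `int32_t` to `uint64_t` is what lets attribute values (ORTH, TAG, DEP, ENT_TYPE and so on) hold the 64-bit hashes produced by the string store instead of small sequential IDs. A quick illustration of the scale involved (a sketch; the exact hash value is only indicative):

    from spacy.strings import StringStore

    key = StringStore().add(u'coffee')   # e.g. 3197928453018144401
    assert key > 2 ** 32                 # would overflow the old int32 attr_t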
|
133
spacy/util.py
|
@ -78,27 +78,86 @@ def ensure_path(path):
|
|||
return path
|
||||
|
||||
|
||||
def resolve_model_path(name):
|
||||
"""Resolve a model name or string to a model path.
|
||||
def load_model(name):
|
||||
"""Load a model from a shortcut link, package or data path.
|
||||
|
||||
name (unicode): Package name, shortcut link or model path.
|
||||
RETURNS (Path): Path to model data directory.
|
||||
RETURNS (Language): `Language` class with the loaded model.
|
||||
"""
|
||||
data_path = get_data_path()
|
||||
if not data_path or not data_path.exists():
|
||||
raise IOError("Can't find spaCy data path: %s" % path2str(data_path))
|
||||
if isinstance(name, basestring_):
|
||||
if (data_path / name).exists(): # in data dir or shortcut link
|
||||
return (data_path / name)
|
||||
if is_package(name): # installed as a package
|
||||
return get_model_package_path(name)
|
||||
if Path(name).exists(): # path to model
|
||||
return Path(name)
|
||||
elif hasattr(name, 'exists'): # Path or Path-like object
|
||||
return name
|
||||
if (data_path / name).exists(): # in data dir or shortcut
|
||||
return load_model_from_path(data_path / name)
|
||||
if is_package(name): # installed as package
|
||||
return load_model_from_pkg(name)
|
||||
if Path(name).exists(): # path to model data directory
|
||||
return load_data_from_path(Path(name))
|
||||
elif hasattr(name, 'exists'): # Path or Path-like to model data
|
||||
return load_data_from_path(name)
|
||||
raise IOError("Can't find model '%s'" % name)
|
||||
|
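The resolution order above is: an entry in the data directory (shortcut link) first, then an installed package, then a filesystem path or Path-like object. A usage sketch mirroring that order (model names are illustrative):

    from spacy import util

    nlp = util.load_model('en')              # shortcut link in the data dir
    nlp = util.load_model('en_core_web_sm')  # installed model package
    nlp = util.load_model('/path/to/model')  # plain model data directory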
||||
|
||||
def load_model_from_init_py(init_file):
|
||||
"""Helper function to use in the `load()` method of a model package's
|
||||
__init__.py.
|
||||
|
||||
init_file (unicode): Path to model's __init__.py, i.e. `__file__`.
|
||||
RETURNS (Language): `Language` class with loaded model.
|
||||
"""
|
||||
model_path = Path(init_file).parent
|
||||
return load_data_from_path(model_path, package=True)
|
||||
|
||||
|
||||
def load_model_from_path(model_path):
|
||||
"""Import and load a model package from its file path.
|
||||
|
||||
model_path (unicode or Path): Path to the model package directory.
|
||||
RETURNS (Language): `Language` class with loaded model.
|
||||
"""
|
||||
model_path = ensure_path(model_path)
|
||||
spec = importlib.util.spec_from_file_location('model', model_path)
|
||||
module = importlib.util.module_from_spec(spec)
|
||||
spec.loader.exec_module(module)
|
||||
return module.load()
|
||||
|
||||
|
||||
def load_model_from_pkg(name):
|
||||
"""Import and load a model package.
|
||||
|
||||
name (unicode): Name of model package installed via pip.
|
||||
RETURNS (Language): `Language` class with loaded model.
|
||||
"""
|
||||
module = importlib.import_module(name)
|
||||
return module.load()
|
||||
|
||||
|
||||
def load_data_from_path(model_path, package=False):
|
||||
"""Initialie a `Language` class with a loaded model from a model data path.
|
||||
|
||||
model_path (unicode or Path): Path to model data directory.
|
||||
package (bool): Does the path point to the parent package directory?
|
||||
RETURNS (Language): `Language` class with loaded model.
|
||||
"""
|
||||
model_path = ensure_path(model_path)
|
||||
meta_path = model_path / 'meta.json'
|
||||
if not meta_path.is_file():
|
||||
raise IOError("Could not read meta.json from %s" % location)
|
||||
meta = read_json(location)
|
||||
for setting in ['lang', 'name', 'version']:
|
||||
if setting not in meta:
|
||||
raise IOError('No %s setting found in model meta.json' % setting)
|
||||
if package:
|
||||
model_data_path = '%s_%s-%s' % (meta['lang'], meta['name'], meta['version'])
|
||||
model_path = model_path / model_data_path
|
||||
if not model_path.exists():
|
||||
raise ValueError("Can't find model directory: %s" % path2str(model_path))
|
||||
cls = get_lang_class(meta['lang'])
|
||||
nlp = cls(pipeline=meta.get('pipeline', True))
|
||||
return nlp.from_disk(model_path)
|
||||
|
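`load_data_from_path()` requires a meta.json with at least `lang`, `name` and `version`, and reads an optional `pipeline` setting. A minimal example of the expected contents (field values are illustrative):

    meta = {
        "lang": "en",
        "name": "example_model",
        "version": "1.0.0",
        "pipeline": ["tagger", "parser", "ner"]
    }
    # serialised as JSON to <model_path>/meta.json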
||||
|
||||
def is_package(name):
|
||||
"""Check if string maps to a package installed via pip.
|
||||
|
||||
|
@ -112,36 +171,16 @@ def is_package(name):
|
|||
return False
|
||||
|
||||
|
||||
def get_model_package_path(package_name):
|
||||
"""Get path to a model package installed via pip.
|
||||
def get_package_path(name):
|
||||
"""Get the path to an installed package.
|
||||
|
||||
package_name (unicode): Name of installed package.
|
||||
RETURNS (Path): Path to model data directory.
|
||||
name (unicode): Package name.
|
||||
RETURNS (Path): Path to installed package.
|
||||
"""
|
||||
# Here we're importing the module just to find it. This is worryingly
|
||||
# indirect, but it's otherwise very difficult to find the package.
|
||||
# Python's installation and import rules are very complicated.
|
||||
pkg = importlib.import_module(package_name)
|
||||
package_path = Path(pkg.__file__).parent.parent
|
||||
meta = parse_package_meta(package_path / package_name)
|
||||
model_name = '%s-%s' % (package_name, meta['version'])
|
||||
return package_path / package_name / model_name
|
||||
|
||||
|
||||
def parse_package_meta(package_path, require=True):
|
||||
"""Check if a meta.json exists in a package and return its contents.
|
||||
|
||||
package_path (Path): Path to model package directory.
|
||||
require (bool): If True, raise error if no meta.json is found.
|
||||
RETURNS (dict or None): Model meta.json data or None.
|
||||
"""
|
||||
location = package_path / 'meta.json'
|
||||
if location.is_file():
|
||||
return read_json(location)
|
||||
elif require:
|
||||
raise IOError("Could not read meta.json from %s" % location)
|
||||
else:
|
||||
return None
|
||||
return Path(pkg.__file__).parent
|
||||
|
||||
|
||||
def is_in_jupyter():
|
||||
|
@ -177,10 +216,13 @@ def get_async(stream, numpy_array):
|
|||
|
||||
def itershuffle(iterable, bufsize=1000):
|
||||
"""Shuffle an iterator. This works by holding `bufsize` items back
|
||||
and yielding them sometime later. Obviously, this is not unbiased --
|
||||
and yielding them sometime later. Obviously, this is not unbiased –
|
||||
but should be good enough for batching. Larger bufsize means less bias.
|
||||
|
||||
From https://gist.github.com/andres-erbsen/1307752
|
||||
|
||||
iterable (iterable): Iterator to shuffle.
|
||||
bufsize (int): Items to hold back.
|
||||
YIELDS (iterable): The shuffled iterator.
|
||||
"""
|
||||
iterable = iter(iterable)
|
||||
buf = []
|
||||
|
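Because the buffer never holds more than `bufsize` items, the shuffle is only approximate but stays memory-bounded, and every item is still yielded exactly once. A quick sketch:

    from spacy import util

    items = list(range(10))
    shuffled = list(util.itershuffle(items, bufsize=5))
    assert sorted(shuffled) == items   # same elements, locally shuffled order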
@ -315,17 +357,16 @@ def normalize_slice(length, start, stop, step=None):
|
|||
|
||||
|
||||
def compounding(start, stop, compound):
|
||||
'''Yield an infinite series of compounding values. Each time the
|
||||
"""Yield an infinite series of compounding values. Each time the
|
||||
generator is called, a value is produced by multiplying the previous
|
||||
value by the compound rate.
|
||||
|
||||
EXAMPLE
|
||||
|
||||
EXAMPLE:
|
||||
>>> sizes = compounding(1., 10., 1.5)
|
||||
>>> assert next(sizes) == 1.
|
||||
>>> assert next(sizes) == 1 * 1.5
|
||||
>>> assert next(sizes) == 1.5 * 1.5
|
||||
'''
|
||||
"""
|
||||
def clip(value):
|
||||
return max(value, stop) if (start>stop) else min(value, stop)
|
||||
curr = float(start)
|
||||
|
@ -335,7 +376,7 @@ def compounding(start, stop, compound):
|
|||
|
||||
|
||||
def decaying(start, stop, decay):
|
||||
'''Yield an infinite series of linearly decaying values.'''
|
||||
"""Yield an infinite series of linearly decaying values."""
|
||||
def clip(value):
|
||||
return max(value, stop) if (start>stop) else min(value, stop)
|
||||
nr_upd = 1.
|
||||
|
@ -344,12 +385,6 @@ def decaying(start, stop, decay):
|
|||
nr_upd += 1
|
||||
|
||||
|
||||
def check_renamed_kwargs(renamed, kwargs):
|
||||
for old, new in renamed.items():
|
||||
if old in kwargs:
|
||||
raise TypeError("Keyword argument %s now renamed to %s" % (old, new))
|
||||
|
||||
|
||||
def read_json(location):
|
||||
"""Open and load JSON from file.
|
||||
|
||||
|
|
232
spacy/vocab.pyx
|
@ -26,15 +26,6 @@ from . import attrs
|
|||
from . import symbols
|
||||
|
||||
|
||||
DEF MAX_VEC_SIZE = 100000
|
||||
|
||||
|
||||
cdef float[MAX_VEC_SIZE] EMPTY_VEC
|
||||
memset(EMPTY_VEC, 0, sizeof(EMPTY_VEC))
|
||||
memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
|
||||
EMPTY_LEXEME.vector = EMPTY_VEC
|
||||
|
||||
|
||||
cdef class Vocab:
|
||||
"""A look-up table that allows you to access `Lexeme` objects. The `Vocab`
|
||||
instance also provides access to the `StringStore`, and owns underlying
|
||||
|
@ -53,8 +44,6 @@ cdef class Vocab:
|
|||
vice versa.
|
||||
RETURNS (Vocab): The newly constructed vocab object.
|
||||
"""
|
||||
util.check_renamed_kwargs({'get_lex_attr': 'lex_attr_getters'}, deprecated_kwargs)
|
||||
|
||||
lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {}
|
||||
tag_map = tag_map if tag_map is not None else {}
|
||||
if lemmatizer in (None, True, False):
|
||||
|
@ -66,7 +55,7 @@ cdef class Vocab:
|
|||
self.strings = StringStore()
|
||||
if strings:
|
||||
for string in strings:
|
||||
self.strings[string]
|
||||
self.strings.add(string)
|
||||
# Load strings in a special order, so that we have an onset number for
|
||||
# the vocabulary. This way, when words are added in order, the orth ID
|
||||
# is the frequency rank of the word, plus a certain offset. The structural
|
||||
|
@ -77,7 +66,7 @@ cdef class Vocab:
|
|||
# Need to rethink this.
|
||||
for name in symbols.NAMES + list(sorted(tag_map.keys())):
|
||||
if name:
|
||||
_ = self.strings[name]
|
||||
self.strings.add(name)
|
||||
self.lex_attr_getters = lex_attr_getters
|
||||
self.morphology = Morphology(self.strings, tag_map, lemmatizer)
|
||||
|
||||
|
@ -176,15 +165,14 @@ cdef class Vocab:
|
|||
mem = self.mem
|
||||
cdef bint is_oov = mem is not self.mem
|
||||
lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1)
|
||||
lex.orth = self.strings[string]
|
||||
lex.orth = self.strings.add(string)
|
||||
lex.length = len(string)
|
||||
lex.id = self.length
|
||||
lex.vector = <float*>mem.alloc(self.vectors_length, sizeof(float))
|
||||
if self.lex_attr_getters is not None:
|
||||
for attr, func in self.lex_attr_getters.items():
|
||||
value = func(string)
|
||||
if isinstance(value, unicode):
|
||||
value = self.strings[value]
|
||||
value = self.strings.add(value)
|
||||
if attr == PROB:
|
||||
lex.prob = value
|
||||
elif value is not None:
|
||||
|
@ -239,7 +227,7 @@ cdef class Vocab:
|
|||
"""
|
||||
cdef attr_t orth
|
||||
if type(id_or_string) == unicode:
|
||||
orth = self.strings[id_or_string]
|
||||
orth = self.strings.add(id_or_string)
|
||||
else:
|
||||
orth = id_or_string
|
||||
return Lexeme(self, orth)
|
||||
|
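With `__getitem__` now calling `strings.add()`, looking up an unseen word both interns the string and creates the lexeme, so the round trip below works even for brand-new vocabulary items (a sketch, assuming an `nlp` object):

    lexeme = nlp.vocab[u'quixotry']      # unseen word, interned on the fly
    assert lexeme.orth_ == u'quixotry'
    assert lexeme.orth == nlp.vocab.strings[u'quixotry']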
@ -258,6 +246,26 @@ cdef class Vocab:
|
|||
Token.set_struct_attr(token, attr_id, value)
|
||||
return tokens
|
||||
|
||||
def get_vector(self, orth):
|
||||
"""Retrieve a vector for a word in the vocabulary.
|
||||
|
||||
Words can be looked up by string or int ID.
|
||||
|
||||
RETURNS:
|
||||
A word vector. Size and shape determined by the
|
||||
vocab.vectors instance. Usually, a numpy ndarray
|
||||
of shape (300,) and dtype float32.
|
||||
|
||||
RAISES: If no vectors data is loaded, ValueError is raised.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
def has_vector(self, orth):
|
||||
"""Check whether a word has a vector. Returns False if no
|
||||
vectors have been loaded. Words can be looked up by string
|
||||
or int ID."""
|
||||
raise NotImplementedError
|
||||
|
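Both methods are left as `NotImplementedError` stubs here; the docstrings only pin down the intended contract (lookup by string or int ID, `ValueError` when no vector data is loaded). Purely to illustrate that contract, and not the eventual implementation, a stand-in could look like this:

    import numpy

    _vectors = {}   # illustrative stand-in: orth ID -> float32 vector

    def get_vector(strings, orth):
        # words can be looked up by string or by int ID
        key = orth if isinstance(orth, int) else strings.add(orth)
        if not _vectors:
            raise ValueError("No vector data is loaded.")
        return _vectors.get(key, numpy.zeros((300,), dtype='float32'))

    def has_vector(strings, orth):
        key = orth if isinstance(orth, int) else strings.add(orth)
        return key in _vectors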
||||
def to_disk(self, path):
|
||||
"""Save the current state to a directory.
|
||||
|
||||
|
@ -271,9 +279,6 @@ cdef class Vocab:
|
|||
with strings_loc.open('w', encoding='utf8') as file_:
|
||||
self.strings.dump(file_)
|
||||
|
||||
# TODO: pickle
|
||||
# self.dump(path / 'lexemes.bin')
|
||||
|
||||
def from_disk(self, path):
|
||||
"""Loads state from a directory. Modifies the object in place and
|
||||
returns it.
|
||||
|
@ -286,7 +291,7 @@ cdef class Vocab:
|
|||
with (path / 'vocab' / 'strings.json').open('r', encoding='utf8') as file_:
|
||||
strings_list = ujson.load(file_)
|
||||
for string in strings_list:
|
||||
self.strings[string]
|
||||
self.strings.add(string)
|
||||
self.load_lexemes(path / 'lexemes.bin')
|
||||
|
||||
def to_bytes(self, **exclude):
|
||||
|
@ -346,7 +351,6 @@ cdef class Vocab:
|
|||
lex_data.data[j] = bytes_ptr[i+j]
|
||||
Lexeme.c_from_bytes(lexeme, lex_data)
|
||||
|
||||
lexeme.vector = EMPTY_VEC
|
||||
py_str = self.strings[lexeme.orth]
|
||||
assert self.strings[py_str] == lexeme.orth, (py_str, lexeme.orth)
|
||||
key = hash_string(py_str)
|
||||
|
@ -354,172 +358,6 @@ cdef class Vocab:
|
|||
self._by_orth.set(lexeme.orth, lexeme)
|
||||
self.length += 1
|
||||
|
||||
# Deprecated --- delete these once stable
|
||||
|
||||
def dump_vectors(self, out_loc):
|
||||
"""Save the word vectors to a binary file.
|
||||
|
||||
loc (Path): The path to save to.
|
||||
"""
|
||||
cdef int32_t vec_len = self.vectors_length
|
||||
cdef int32_t word_len
|
||||
cdef bytes word_str
|
||||
cdef char* chars
|
||||
|
||||
cdef Lexeme lexeme
|
||||
cdef CFile out_file = CFile(out_loc, 'wb')
|
||||
for lexeme in self:
|
||||
word_str = lexeme.orth_.encode('utf8')
|
||||
vec = lexeme.c.vector
|
||||
word_len = len(word_str)
|
||||
|
||||
out_file.write_from(&word_len, 1, sizeof(word_len))
|
||||
out_file.write_from(&vec_len, 1, sizeof(vec_len))
|
||||
|
||||
chars = <char*>word_str
|
||||
out_file.write_from(chars, word_len, sizeof(char))
|
||||
out_file.write_from(vec, vec_len, sizeof(float))
|
||||
out_file.close()
|
||||
|
||||
|
||||
|
||||
def load_vectors(self, file_):
|
||||
"""Load vectors from a text-based file.
|
||||
|
||||
file_ (buffer): The file to read from. Entries should be separated by
|
||||
newlines, and each entry should be whitespace delimited. The first value of the entry
|
||||
should be the word string, and subsequent entries should be the values of the
|
||||
vector.
|
||||
|
||||
RETURNS (int): The length of the vectors loaded.
|
||||
"""
|
||||
cdef LexemeC* lexeme
|
||||
cdef attr_t orth
|
||||
cdef int32_t vec_len = -1
|
||||
cdef double norm = 0.0
|
||||
|
||||
whitespace_pattern = re.compile(r'\s', re.UNICODE)
|
||||
|
||||
for line_num, line in enumerate(file_):
|
||||
pieces = line.split()
|
||||
word_str = " " if whitespace_pattern.match(line) else pieces.pop(0)
|
||||
if vec_len == -1:
|
||||
vec_len = len(pieces)
|
||||
elif vec_len != len(pieces):
|
||||
raise VectorReadError.mismatched_sizes(file_, line_num,
|
||||
vec_len, len(pieces))
|
||||
orth = self.strings[word_str]
|
||||
lexeme = <LexemeC*><void*>self.get_by_orth(self.mem, orth)
|
||||
lexeme.vector = <float*>self.mem.alloc(vec_len, sizeof(float))
|
||||
for i, val_str in enumerate(pieces):
|
||||
lexeme.vector[i] = float(val_str)
|
||||
norm = 0.0
|
||||
for i in range(vec_len):
|
||||
norm += lexeme.vector[i] * lexeme.vector[i]
|
||||
lexeme.l2_norm = sqrt(norm)
|
||||
self.vectors_length = vec_len
|
||||
return vec_len
|
||||
|
||||
def load_vectors_from_bin_loc(self, loc):
|
||||
"""Load vectors from the location of a binary file.
|
||||
|
||||
loc (unicode): The path of the binary file to load from.
|
||||
|
||||
RETURNS (int): The length of the vectors loaded.
|
||||
"""
|
||||
cdef CFile file_ = CFile(loc, b'rb')
|
||||
cdef int32_t word_len
|
||||
cdef int32_t vec_len = 0
|
||||
cdef int32_t prev_vec_len = 0
|
||||
cdef float* vec
|
||||
cdef Address mem
|
||||
cdef attr_t string_id
|
||||
cdef bytes py_word
|
||||
cdef vector[float*] vectors
|
||||
cdef int line_num = 0
|
||||
cdef Pool tmp_mem = Pool()
|
||||
while True:
|
||||
try:
|
||||
file_.read_into(&word_len, sizeof(word_len), 1)
|
||||
except IOError:
|
||||
break
|
||||
file_.read_into(&vec_len, sizeof(vec_len), 1)
|
||||
if prev_vec_len != 0 and vec_len != prev_vec_len:
|
||||
raise VectorReadError.mismatched_sizes(loc, line_num,
|
||||
vec_len, prev_vec_len)
|
||||
if 0 >= vec_len >= MAX_VEC_SIZE:
|
||||
raise VectorReadError.bad_size(loc, vec_len)
|
||||
|
||||
chars = <char*>file_.alloc_read(tmp_mem, word_len, sizeof(char))
|
||||
vec = <float*>file_.alloc_read(self.mem, vec_len, sizeof(float))
|
||||
|
||||
string_id = self.strings[chars[:word_len]]
|
||||
# Insert words into vocab to add vector.
|
||||
self.get_by_orth(self.mem, string_id)
|
||||
while string_id >= vectors.size():
|
||||
vectors.push_back(EMPTY_VEC)
|
||||
assert vec != NULL
|
||||
vectors[string_id] = vec
|
||||
line_num += 1
|
||||
cdef LexemeC* lex
|
||||
cdef size_t lex_addr
|
||||
cdef double norm = 0.0
|
||||
cdef int i
|
||||
for orth, lex_addr in self._by_orth.items():
|
||||
lex = <LexemeC*>lex_addr
|
||||
if lex.lower < vectors.size():
|
||||
lex.vector = vectors[lex.lower]
|
||||
norm = 0.0
|
||||
for i in range(vec_len):
|
||||
norm += lex.vector[i] * lex.vector[i]
|
||||
lex.l2_norm = sqrt(norm)
|
||||
else:
|
||||
lex.vector = EMPTY_VEC
|
||||
self.vectors_length = vec_len
|
||||
return vec_len
|
||||
|
||||
|
||||
def resize_vectors(self, int new_size):
|
||||
"""Set vectors_length to a new size, and allocate more memory for the
|
||||
`Lexeme` vectors if necessary. The memory will be zeroed.
|
||||
|
||||
new_size (int): The new size of the vectors.
|
||||
"""
|
||||
cdef hash_t key
|
||||
cdef size_t addr
|
||||
if new_size > self.vectors_length:
|
||||
for key, addr in self._by_hash.items():
|
||||
lex = <LexemeC*>addr
|
||||
lex.vector = <float*>self.mem.realloc(lex.vector,
|
||||
new_size * sizeof(lex.vector[0]))
|
||||
self.vectors_length = new_size
|
||||
|
||||
|
||||
def write_binary_vectors(in_loc, out_loc):
|
||||
cdef CFile out_file = CFile(out_loc, 'wb')
|
||||
cdef Address mem
|
||||
cdef int32_t word_len
|
||||
cdef int32_t vec_len
|
||||
cdef char* chars
|
||||
with bz2.BZ2File(in_loc, 'r') as file_:
|
||||
for line in file_:
|
||||
pieces = line.split()
|
||||
word = pieces.pop(0)
|
||||
mem = Address(len(pieces), sizeof(float))
|
||||
vec = <float*>mem.ptr
|
||||
for i, val_str in enumerate(pieces):
|
||||
vec[i] = float(val_str)
|
||||
|
||||
word_len = len(word)
|
||||
vec_len = len(pieces)
|
||||
|
||||
out_file.write_from(&word_len, 1, sizeof(word_len))
|
||||
out_file.write_from(&vec_len, 1, sizeof(vec_len))
|
||||
|
||||
chars = <char*>word
|
||||
out_file.write_from(chars, len(word), sizeof(char))
|
||||
out_file.write_from(vec, vec_len, sizeof(float))
|
||||
|
||||
|
||||
def pickle_vocab(vocab):
|
||||
sstore = vocab.strings
|
||||
|
@ -567,21 +405,3 @@ class LookupError(Exception):
|
|||
"ID of orth: {orth_id}".format(
|
||||
query=repr(original_string), orth_str=repr(id_string), orth_id=id_)
|
||||
)
|
||||
|
||||
|
||||
class VectorReadError(Exception):
|
||||
@classmethod
|
||||
def mismatched_sizes(cls, loc, line_num, prev_size, curr_size):
|
||||
return cls(
|
||||
"Error reading word vectors from %s on line %d.\n"
|
||||
"All vectors must be the same size.\n"
|
||||
"Prev size: %d\n"
|
||||
"Curr size: %d" % (loc, line_num, prev_size, curr_size))
|
||||
|
||||
@classmethod
|
||||
def bad_size(cls, loc, size):
|
||||
return cls(
|
||||
"Error reading word vectors from %s.\n"
|
||||
"Vector size: %d\n"
|
||||
"Max size: %d\n"
|
||||
"Min size: 1\n" % (loc, size, MAX_VEC_SIZE))
|
||||
|
|
|
@ -1,9 +1,9 @@
|
|||
<svg class="o-svg" xmlns="http://www.w3.org/2000/svg" viewBox="-1 -1 907 737" width="906" height="746">
|
||||
<style>
|
||||
.svg__architecture__text-large { fill: #1a1e23; font: 20px "Source Sans Pro" }
|
||||
.svg__architecture__text-medium { fill: #1a1e23; font: 17px "Source Sans Pro" }
|
||||
.svg__architecture__text-small { fill: #1a1e23; font: bold 14px "Source Sans Pro" }
|
||||
.svg__architecture__text-code { fill: #1a1e23; font: 600 12px "Source Code Pro" }
|
||||
.svg__architecture__text-large { fill: #1a1e23; font: 20px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
|
||||
.svg__architecture__text-medium { fill: #1a1e23; font: 17px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
|
||||
.svg__architecture__text-small { fill: #1a1e23; font: bold 14px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
|
||||
.svg__architecture__text-code { fill: #1a1e23; font: 600 12px "Source Code Pro", Monaco, "Courier New", monospace }
|
||||
</style>
|
||||
<ellipse cx="404" cy="203" fill="#dae8fc" stroke="#09a3d5" stroke-width="2" rx="74.8" ry="49.8"/>
|
||||
<text class="svg__architecture__text-large" transform="translate(362.5 206.5)" width="81" height="40">Language</text>
|
||||
|
|
Before Width: | Height: | Size: 14 KiB After Width: | Height: | Size: 14 KiB |
|
@ -1,8 +1,8 @@
|
|||
<svg class="o-svg" xmlns="http://www.w3.org/2000/svg" width="931" height="456" viewBox="-1 -1 932 480" preserveAspectRatio="xMinYMin meet">
|
||||
<style>
|
||||
.svg__langdata__text-large { fill: #1a1e23; font: 20px "Source Sans Pro" }
|
||||
.svg__langdata__text-small { fill: #1a1e23; font: bold 16px "Source Sans Pro" }
|
||||
.svg__langdata__text-tiny { fill: #1a1e23; font: bold 16px "Source Sans Pro" }
|
||||
.svg__langdata__text-large { fill: #1a1e23; font: 20px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
|
||||
.svg__langdata__text-small { fill: #1a1e23; font: bold 16px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
|
||||
.svg__langdata__text-tiny { fill: #1a1e23; font: bold 16px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
|
||||
</style>
|
||||
<path fill="none" stroke="#b85450" stroke-width="3" stroke-miterlimit="10" d="M610 404h-69.8" stroke-dasharray="1 6" stroke-linecap="round"/>
|
||||
<path fill="#b85450" stroke="#b85450" stroke-width="2" stroke-miterlimit="10" d="M534.2 404l8-4-2 4 2 4z"/>
|
||||
|
|
Before Width: | Height: | Size: 9.0 KiB After Width: | Height: | Size: 9.1 KiB |
|
@ -1,8 +1,8 @@
|
|||
<svg class="o-svg" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 923 200" width="923" height="200">
|
||||
<style>
|
||||
.svg__pipeline__text { fill: #1a1e23; font: 20px "Source Sans Pro" }
|
||||
.svg__pipeline__text-small { fill: #1a1e23; font: bold 18px "Source Sans Pro" }
|
||||
.svg__pipeline__text-code { fill: #1a1e23; font: 600 16px "Source Code Pro" }
|
||||
.svg__pipeline__text { fill: #1a1e23; font: 20px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
|
||||
.svg__pipeline__text-small { fill: #1a1e23; font: bold 18px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
|
||||
.svg__pipeline__text-code { fill: #1a1e23; font: 600 16px "Source Code Pro", Monaco, "Courier New", monospace }
|
||||
</style>
|
||||
<rect width="601" height="127" x="159" y="21" fill="none" stroke="#09a3d5" stroke-width="3" rx="19.1" stroke-dasharray="3 6" ry="19.1"/>
|
||||
<path fill="#e1d5e7" stroke="#9673a6" stroke-width="2" d="M801 55h120v60H801z"/>
|
||||
|
|
Before Width: | Height: | Size: 3.1 KiB After Width: | Height: | Size: 3.2 KiB |
123
website/assets/img/docs/tokenization.svg
Normal file
|
@ -0,0 +1,123 @@
|
|||
<svg class="o-svg" xmlns="http://www.w3.org/2000/svg" width="600" height="380" viewBox="-20 -10 550 400">
|
||||
<style>
|
||||
.svg__tokenization__text { fill: #1a1e23; font: 18px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
|
||||
.svg__tokenization__text-small { fill: #fff; font: 600 13px "Source Code Pro", Monaco, "Courier New", monospace }
|
||||
</style>
|
||||
<path fill="none" stroke="#c00" stroke-width="2" stroke-miterlimit="10" d="M71 39v12H16v11M71 39v12h20v11"/>
|
||||
<path fill="#f8cecc" stroke="#c00" stroke-width="2" d="M1 1h140v38.2H1z"/>
|
||||
<text class="svg__tokenization__text" dy="1em" width="43" height="19" transform="translate(48.5 9.5)">“Let’s</text>
|
||||
<path fill="none" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M175 39v23"/>
|
||||
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M150 1h50v38.2h-50z"/>
|
||||
<text class="svg__tokenization__text" dy="1em" transform="translate(164.5 9.5)" width="19" height="19">go</text>
|
||||
<path fill="none" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M235 39v23"/>
|
||||
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M210 1h50v38.2h-50z"/>
|
||||
<text class="svg__tokenization__text" dy="1em" transform="translate(226.5 9.5)" width="15" height="19">to</text>
|
||||
<path fill="none" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M341 39v23"/>
|
||||
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M270 1h141v38.2H270z"/>
|
||||
<text class="svg__tokenization__text" dy="1em" transform="translate(320.5 9.5)" width="38" height="19">N.Y.!”</text>
|
||||
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M16 100v20"/>
|
||||
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M1 62h30v38.2H1z"/>
|
||||
<text class="svg__tokenization__text" dy="1em" transform="translate(11.5 70.5)" width="7" height="19">“</text>
|
||||
<path fill="none" stroke="#c00" stroke-width="2" stroke-miterlimit="10" d="M91 100v11H66v9M91 100v11h29v9"/>
|
||||
<path fill="#f8cecc" stroke="#c00" stroke-width="2" d="M41 62h100v38.2H41z"/>
|
||||
<text class="svg__tokenization__text" dy="1em" transform="translate(72.5 70.5)" width="35" height="19">Let’s</text>
|
||||
<path fill="none" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M175 100v20"/>
|
||||
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M150 62h50v38.2h-50z"/>
|
||||
<text class="svg__tokenization__text" dy="1em" transform="translate(164.5 70.5)" width="19" height="19">go</text>
|
||||
<path fill="none" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M235 100v20"/>
|
||||
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M210 62h50v38.2h-50z"/>
|
||||
<text class="svg__tokenization__text" dy="1em" transform="translate(226.5 70.5)" width="15" height="19">to</text>
|
||||
<path fill="none" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M341 100v20"/>
|
||||
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M270 62h141v38.2H270z"/>
|
||||
<text class="svg__tokenization__text" dy="1em" transform="translate(320.5 70.5)" width="38" height="19">N.Y.!”</text>
|
||||
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M1 120h30v38H1z"/>
|
||||
<text class="svg__tokenization__text" dy="1em" transform="translate(11.5 128.5)" width="7" height="19">“</text>
|
||||
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M66 158v24"/>
|
||||
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M41 120h50v38H41z"/>
|
||||
<text class="svg__tokenization__text" dy="1em" transform="translate(53.5 128.5)" width="23" height="19">Let</text>
|
||||
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M175 158v24"/>
|
||||
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M150 120h50v38h-50z"/>
|
||||
<text class="svg__tokenization__text" dy="1em" transform="translate(164.5 128.5)" width="19" height="19">go</text>
|
||||
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M235 158v24"/>
|
||||
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M210 120h50v38h-50z"/>
|
||||
<text class="svg__tokenization__text" dy="1em" transform="translate(226.5 128.5)" width="15" height="19">to</text>
|
||||
<path fill="none" stroke="#c00" stroke-width="2" stroke-miterlimit="10" d="M341 158v13h-20v11M341 158v13h55v11"/>
|
||||
<path fill="#f8cecc" stroke="#c00" stroke-width="2" d="M270 120h141v38H270z"/>
|
||||
<text class="svg__tokenization__text" dy="1em" transform="translate(320.5 128.5)" width="38" height="19">N.Y.!”</text>
|
||||
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M120 158v24"/>
|
||||
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M100 120h40v38h-40z"/>
|
||||
<text class="svg__tokenization__text" dy="1em" transform="translate(113.5 128.5)" width="11" height="19">’s</text>
|
||||
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M16 220v23"/>
|
||||
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M1 181.8h30V220H1z"/>
|
||||
<text class="svg__tokenization__text" dy="1em" transform="translate(11.5 190.5)" width="7" height="19">“</text>
|
||||
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M66 220v23"/>
|
||||
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M41 181.8h50V220H41z"/>
|
||||
<text class="svg__tokenization__text" dy="1em" transform="translate(53.5 190.5)" width="23" height="19">Let</text>
|
||||
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M175 220v23"/>
|
||||
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M150 181.8h50V220h-50z"/>
|
||||
<text class="svg__tokenization__text" dy="1em" transform="translate(164.5 190.5)" width="19" height="19">go</text>
|
||||
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M235 220v23"/>
|
||||
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M210 181.8h50V220h-50z"/>
|
||||
<text class="svg__tokenization__text" dy="1em" transform="translate(226.5 190.5)" width="15" height="19">to</text>
|
||||
<path fill="none" stroke="#c00" stroke-width="2" stroke-miterlimit="10" d="M321 220v11h-20v12M321 220v11h34v12"/>
|
||||
<path fill="#f8cecc" stroke="#c00" stroke-width="2" d="M270 181.8h101V220H270z"/>
|
||||
<text class="svg__tokenization__text" dy="1em" transform="translate(304.5 190.5)" width="30" height="19">N.Y.!</text>
|
||||
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M120 220v23"/>
|
||||
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M100 181.8h40V220h-40z"/>
|
||||
<text class="svg__tokenization__text" dy="1em" transform="translate(113.5 190.5)" width="11" height="19">’s</text>
|
||||
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M396 220v23"/>
|
||||
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M381 181.8h30V220h-30z"/>
|
||||
<text class="svg__tokenization__text" dy="1em" transform="translate(391.5 190.5)" width="7" height="19">”</text>
|
||||
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M16 281v23"/>
|
||||
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M1 242.7h30V281H1z"/>
|
||||
<text class="svg__tokenization__text" dy="1em" transform="translate(11.5 251.5)" width="7" height="19">“</text>
|
||||
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M66 281v23"/>
|
||||
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M41 242.7h50V281H41z"/>
|
||||
<text class="svg__tokenization__text" dy="1em" transform="translate(53.5 251.5)" width="23" height="19">Let</text>
|
||||
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M175 281v20-17 20"/>
|
||||
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M150 242.7h50V281h-50z"/>
|
||||
<text class="svg__tokenization__text" dy="1em" transform="translate(164.5 251.5)" width="19" height="19">go</text>
|
||||
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M235 281v23"/>
|
||||
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M210 242.7h50V281h-50z"/>
|
||||
<text class="svg__tokenization__text" dy="1em" transform="translate(226.5 251.5)" width="15" height="19">to</text>
|
||||
<path fill="none" stroke="#c00" stroke-width="2" stroke-miterlimit="10" d="M301 281v23"/>
|
||||
<path fill="#f8cecc" stroke="#b85450" stroke-width="2" d="M270 242.7h61V281h-61z"/>
|
||||
<text class="svg__tokenization__text" dy="1em" transform="translate(286.5 251.5)" width="26" height="19">N.Y.</text>
|
||||
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M120 281v23"/>
|
||||
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M100 242.7h40V281h-40z"/>
|
||||
<text class="svg__tokenization__text" dy="1em" transform="translate(113.5 251.5)" width="11" height="19">’s</text>
|
||||
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M396 281v23"/>
|
||||
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M381 242.7h30V281h-30z"/>
|
||||
<text class="svg__tokenization__text" dy="1em" transform="translate(391.5 251.5)" width="7" height="19">”</text>
|
||||
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M355 281v23"/>
|
||||
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M340 242.7h30V281h-30z"/>
|
||||
<text class="svg__tokenization__text" dy="1em" transform="translate(351.5 251.5)" width="5" height="19">!</text>
|
||||
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M1 304h30v38H1z"/>
|
||||
<text class="svg__tokenization__text" dy="1em" transform="translate(11.5 312.5)" width="7" height="19">“</text>
|
||||
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M41 304h50v38H41z"/>
|
||||
<text class="svg__tokenization__text" dy="1em" transform="translate(53.5 312.5)" width="23" height="19">Let</text>
|
||||
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M150 304h50v38h-50z"/>
|
||||
<text class="svg__tokenization__text" dy="1em" transform="translate(164.5 312.5)" width="19" height="19">go</text>
|
||||
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M210 304h50v38h-50z"/>
|
||||
<text class="svg__tokenization__text" dy="1em" transform="translate(226.5 312.5)" width="15" height="19">to</text>
|
||||
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M270 304h61v38h-61z"/>
|
||||
<text class="svg__tokenization__text" dy="1em" transform="translate(286.5 312.5)" width="26" height="19">N.Y.</text>
|
||||
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M100 304h40v38h-40z"/>
|
||||
<text class="svg__tokenization__text" dy="1em" transform="translate(113.5 312.5)" width="11" height="19">’s</text>
|
||||
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M381 304h30v38h-30z"/>
|
||||
<text class="svg__tokenization__text" dy="1em" transform="translate(391.5 312.5)" width="7" height="19">”</text>
|
||||
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M340 304h30v38h-30z"/>
|
||||
<text class="svg__tokenization__text" dy="1em" transform="translate(351.5 312.5)" width="5" height="19">!</text>
|
||||
<rect width="104" height="19" x="437" y="72" fill="#c00" stroke="#c00" stroke-width="2" rx="2.9" ry="2.9"/>
|
||||
<text class="svg__tokenization__text-small" dy="0.9em" transform="translate(455.5 74.5)" width="65" height="12">EXCEPTION</text>
|
||||
<rect width="104" height="19" x="437" y="11" fill="#c00" stroke="#c00" stroke-width="2" rx="2.9" ry="2.9"/>
|
||||
<text class="svg__tokenization__text-small" dy="0.9em" transform="translate(466.5 13.5)" width="43" height="12">PREFIX</text>
|
||||
<rect width="104" height="19" x="437" y="130" fill="#c00" stroke="#c00" stroke-width="2" rx="2.9" ry="2.9"/>
|
||||
<text class="svg__tokenization__text-small" dy="0.9em" transform="translate(466.5 132.5)" width="43" height="12">SUFFIX</text>
|
||||
<rect width="104" height="19" x="437" y="191" fill="#c00" stroke="#c00" stroke-width="2" rx="2.9" ry="2.9"/>
|
||||
<text class="svg__tokenization__text-small" dy="0.9em" transform="translate(466.5 193.5)" width="43" height="12">SUFFIX</text>
|
||||
<rect width="104" height="19" x="437" y="252" fill="#c00" stroke="#c00" stroke-width="2" rx="2.9" ry="2.9"/>
|
||||
<text class="svg__tokenization__text-small" dy="0.9em" transform="translate(455.5 254.5)" width="65" height="12">EXCEPTION</text>
|
||||
<rect width="104" height="19" x="437" y="313" fill="#82b366" stroke="#82b366" stroke-width="2" rx="2.9" ry="2.9"/>
|
||||
<text class="svg__tokenization__text-small" dy="0.9em" transform="translate(473.5 315.5)" width="29" height="12">DONE</text>
|
||||
</svg>
|
After Width: | Height: | Size: 12 KiB |
|
@ -1,9 +1,9 @@
|
|||
<svg class="o-svg" xmlns="http://www.w3.org/2000/svg" viewBox="-10 -10 582 365" width="572" height="355">
|
||||
<style>
|
||||
.svg__vocab__text { fill: #1a1e23; font: 18px "Source Sans Pro" }
|
||||
.svg__vocab__text-large { fill: #fff; font: bold 18px "Source Sans Pro"; text-transform: uppercase }
|
||||
.svg__vocab__text-box { fill: #fff; font: bold 12px "Source Code Pro" }
|
||||
.svg__vocab__text-code { fill: #1a1e23; font: bold 12px "Source Code Pro" }
|
||||
.svg__vocab__text { fill: #1a1e23; font: 18px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
|
||||
.svg__vocab__text-large { fill: #fff; font: bold 18px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif; text-transform: uppercase }
|
||||
.svg__vocab__text-box { fill: #fff; font: bold 12px "Source Code Pro", Monaco, "Courier New", monospace }
|
||||
.svg__vocab__text-code { fill: #1a1e23; font: bold 12px "Source Code Pro", Monaco, "Courier New", monospace }
|
||||
</style>
|
||||
<rect width="570" height="88" x="1" y="135" fill="#d5e8d4" stroke="#82b366" stroke-width="2" rx="13.2" ry="13.2"/>
|
||||
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M444 164h100v40H444z"/>
|
||||
|
|
Before Width: | Height: | Size: 7.6 KiB After Width: | Height: | Size: 7.8 KiB |
|
@ -158,7 +158,8 @@
|
|||
|
||||
"binder": {
|
||||
"title": "Binder",
|
||||
"tag": "class"
|
||||
"tag": "class",
|
||||
"source": "spacy/tokens/binder.pyx"
|
||||
},
|
||||
|
||||
"annotation": {
|
||||
|
|
|
@ -2,7 +2,10 @@
|
|||
|
||||
include ../../_includes/_mixins
|
||||
|
||||
p spaCy currently supports the following languages and capabilities:
|
||||
p
|
||||
| spaCy currently provides models for the following languages and
|
||||
| capabilities:
|
||||
|
||||
|
||||
+aside-code("Download language models", "bash").
|
||||
python -m spacy download en
|
||||
|
@ -22,12 +25,16 @@ p spaCy currently supports the following languages and capabilities:
|
|||
|
||||
+row
|
||||
+cell French #[code fr]
|
||||
each icon in [ "pro", "pro", "con", "pro", "con", "pro", "pro", "con" ]
|
||||
each icon in [ "pro", "con", "con", "pro", "con", "pro", "pro", "con" ]
|
||||
+cell.u-text-center #[+procon(icon)]
|
||||
|
||||
+h(2, "available") Available models
|
||||
+row
|
||||
+cell Spanish #[code es]
|
||||
each icon in [ "pro", "pro", "con", "pro", "pro", "pro", "pro", "con" ]
|
||||
+cell.u-text-center #[+procon(icon)]
|
||||
|
||||
include ../usage/_models-list
|
||||
p
|
||||
+button("/docs/usage/models", true, "primary") See available models
|
||||
|
||||
+h(2, "alpha-support") Alpha tokenization support
|
||||
|
||||
|
@ -52,9 +59,35 @@ p
|
|||
| #[+a("https://github.com/mocobeta/janome") Janome].
|
||||
|
||||
+table([ "Language", "Code", "Source" ])
|
||||
each language, code in { es: "Spanish", it: "Italian", pt: "Portuguese", nl: "Dutch", sv: "Swedish", fi: "Finnish", nb: "Norwegian Bokmål", da: "Danish", hu: "Hungarian", pl: "Polish", bn: "Bengali", he: "Hebrew", zh: "Chinese", ja: "Japanese" }
|
||||
each language, code in { it: "Italian", pt: "Portuguese", nl: "Dutch", sv: "Swedish", fi: "Finnish", nb: "Norwegian Bokmål", da: "Danish", hu: "Hungarian", pl: "Polish", bn: "Bengali", he: "Hebrew", zh: "Chinese", ja: "Japanese" }
|
||||
+row
|
||||
+cell #{language}
|
||||
+cell #[code=code]
|
||||
+cell
|
||||
+src(gh("spaCy", "spacy/lang/" + code)) lang/#{code}
|
||||
|
||||
+h(2, "multi-language") Multi-language support
|
||||
+tag-new(2)
|
||||
|
||||
p
|
||||
| As of v2.0, spaCy supports models trained on more than one language. This
|
||||
| is especially useful for named entity recognition. The language ID used
|
||||
| for multi-language or language-neutral models is #[code xx]. The
|
||||
| language class, a generic subclass containing only the base language data,
|
||||
| can be found in #[+src(gh("spaCy", "spacy/lang/xx")) lang/xx].
|
||||
|
||||
p
|
||||
| To load your model with the neutral, multi-language class, simply set
|
||||
| #[code "language": "xx"] in your
|
||||
| #[+a("/docs/usage/saving-loading#models-generating") model package]'s
|
||||
| meta.json. You can also import the class directly, or call
|
||||
| #[+api("util#get_lang_class") #[code util.get_lang_class()]] for
|
||||
| lazy-loading.
|
||||
|
||||
+code("Standard import").
|
||||
from spacy.lang.xx import MultiLanguage
|
||||
nlp = MultiLanguage()
|
||||
|
||||
+code("With lazy-loading").
|
||||
from spacy.util import get_lang_class
|
||||
nlp = get_lang_class('xx')
|
||||
|
|
|
@ -11,8 +11,13 @@ p
|
|||
| the name of an installed
|
||||
| #[+a("/docs/usage/saving-loading#generating") model package], a unicode
|
||||
| path or a #[code Path]-like object. spaCy will try resolving the load
|
||||
| argument in this order. The #[code Language] class to initialise will be
|
||||
| determined based on the model's settings.
|
||||
| argument in this order. If a model is loaded from a shortcut link or
|
||||
| package name, spaCy will assume it's a Python package and import it and
|
||||
| call the model's own #[code load()] method. If a model is loaded from a
|
||||
| path, spaCy will assume it's a data directory, read the language and
|
||||
| pipeline settings off the meta.json and initialise the #[code Language]
|
||||
| class. The data will be loaded in via
|
||||
| #[+api("language#from_disk") #[code Language.from_disk()]].
|
||||
|
||||
+aside-code("Example").
|
||||
nlp = spacy.load('en') # shortcut link
|
||||
|
@ -20,7 +25,7 @@ p
|
|||
nlp = spacy.load('/path/to/en') # unicode path
|
||||
nlp = spacy.load(Path('/path/to/en')) # pathlib Path
|
||||
|
||||
nlp = spacy.load('en', disable['parser', 'tagger'])
|
||||
nlp = spacy.load('en', disable=['parser', 'tagger'])
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
|
|
|
@ -1,12 +1,10 @@
|
|||
//- 💫 DOCS > API > ANNOTATION SPECS
|
||||
//- 💫 DOCS > API > UTIL
|
||||
|
||||
include ../../_includes/_mixins
|
||||
|
||||
p
|
||||
| spaCy comes with a small collection of utility functions located in
|
||||
| #[+src(gh("spaCy", "spacy/util.py")) spacy/util.py].
|
||||
|
||||
+infobox("Important note")
|
||||
| Because utility functions are mostly intended for
|
||||
| #[strong internal use within spaCy], their behaviour may change with
|
||||
| future releases. The functions documented on this page should be safe
|
||||
|
@ -74,15 +72,23 @@ p
|
|||
+cell #[code Language]
|
||||
+cell Language class.
|
||||
|
||||
+h(2, "resolve_model_path") util.resolve_model_path
|
||||
+h(2, "load_model") util.load_model
|
||||
+tag function
|
||||
+tag-new(2)
|
||||
|
||||
p Resolve a model name or string to a model path.
|
||||
p
|
||||
| Load a model from a shortcut link, package or data path. If called with a
|
||||
| shortcut link or package name, spaCy will assume the model is a Python
|
||||
| package and import and call its #[code load()] method. If called with a
|
||||
| path, spaCy will assume it's a data directory, read the language and
|
||||
| pipeline settings from the meta.json and initialise a #[code Language]
|
||||
| class. The model data will then be loaded in via
|
||||
| #[+api("language#from_disk") #[code Language.from_disk()]].
|
||||
|
||||
+aside-code("Example").
|
||||
model_path = util.resolve_model_path('en')
|
||||
model_path = util.resolve_model_path('/path/to/en')
|
||||
nlp = util.load_model('en')
|
||||
nlp = util.load_model('en_core_web_sm')
|
||||
nlp = util.load_model('/path/to/data')
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
|
@ -92,8 +98,33 @@ p Resolve a model name or string to a model path.
|
|||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell #[code Path]
|
||||
+cell Path to model data directory.
|
||||
+cell #[code Language]
|
||||
+cell #[code Language] class with the loaded model.
|
||||
|
||||
+h(2, "load_model_from_init_py") util.load_model_from_init_py
|
||||
+tag function
|
||||
+tag-new(2)
|
||||
|
||||
p
|
||||
| A helper function to use in the #[code load()] method of a model package's
|
||||
| #[+src(gh("spacy-dev-resources", "templates/model/en_model_name/__init__.py")) __init__.py].
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.util import load_model_from_init_py
|
||||
|
||||
def load():
|
||||
return load_model_from_init_py(__file__)
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code init_file]
|
||||
+cell unicode
|
||||
+cell Path to model's __init__.py, i.e. #[code __file__].
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell #[code Language]
|
||||
+cell #[code Language] class with the loaded model.
|
||||
|
||||
+h(2, "is_package") util.is_package
|
||||
+tag function
|
||||
|
@ -117,16 +148,18 @@ p
|
|||
+cell #[code bool]
|
||||
+cell #[code True] if installed package, #[code False] if not.
|
||||
|
||||
+h(2, "get_model_package_path") util.get_model_package_path
|
||||
+h(2, "get_package_path") util.get_package_path
|
||||
+tag function
|
||||
+tag-new(2)
|
||||
|
||||
p
|
||||
| Get path to a #[+a("/docs/usage/models") model package] installed via pip.
|
||||
| Currently imports the package to find it and parse its meta data.
|
||||
| Get path to an installed package. Mainly used to resolve the location of
|
||||
| #[+a("/docs/usage/models") model packages]. Currently imports the package
|
||||
| to find its path.
|
||||
|
||||
+aside-code("Example").
|
||||
util.get_model_package_path('en_core_web_sm')
|
||||
# /usr/lib/python3.6/site-packages/en_core_web_sm/en_core_web_sm-1.2.0
|
||||
util.get_package_path('en_core_web_sm')
|
||||
# /usr/lib/python3.6/site-packages/en_core_web_sm
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
|
@ -137,37 +170,8 @@ p
|
|||
+footrow
|
||||
+cell returns
|
||||
+cell #[code Path]
|
||||
+cell Path to model data directory.
|
||||
|
||||
+h(2, "parse_package_meta") util.parse_package_meta
|
||||
+tag function
|
||||
|
||||
p
|
||||
| Check if a #[code meta.json] exists in a model package and return its
|
||||
| contents.
|
||||
|
||||
+aside-code("Example").
|
||||
if util.is_package('en_core_web_sm'):
|
||||
path = util.get_model_package_path('en_core_web_sm')
|
||||
meta = util.parse_package_meta(path, require=True)
|
||||
# {'name': 'core_web_sm', 'lang': 'en', ...}
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code package_path]
|
||||
+cell #[code Path]
|
||||
+cell Path to model package directory.
|
||||
|
||||
+row
|
||||
+cell #[code require]
|
||||
+cell #[code bool]
|
||||
+cell If #[code True], raise error if no #[code meta.json] is found.
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell dict / #[code None]
|
||||
+cell Model meta data or #[code None].
|
||||
|
||||
+h(2, "is_in_jupyter") util.is_in_jupyter
|
||||
+tag function
|
||||
+tag-new(2)
|
||||
|
|
|
@ -5,7 +5,7 @@ p
|
|||
| #[strong how similar they are]. Predicting similarity is useful for
|
||||
| building recommendation systems or flagging duplicates. For example, you
|
||||
| can suggest a user content that's similar to what they're currently
|
||||
| looking at, or label a support ticket as a duplicate, if it's very
|
||||
| looking at, or label a support ticket as a duplicate if it's very
|
||||
| similar to an already existing one.
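p
    | A minimal sketch of comparing two texts – it assumes an installed model
    | linked to the #[code en] shortcut that includes word vectors:

+code("Comparing two documents").
    import spacy

    nlp = spacy.load('en')              # assumes the linked model has word vectors
    doc1 = nlp(u'I like fast food')
    doc2 = nlp(u'I like pizza')
    print(doc1.similarity(doc2))        # higher score means more similar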
|
||||
|
||||
p
|
||||
|
|
|
@ -16,3 +16,47 @@ p
|
|||
+row
|
||||
for cell in ["Apple", "is", "looking", "at", "buying", "U.K.", "startup", "for", "$", "1", "billion"]
|
||||
+cell=cell
|
||||
|
||||
p
|
||||
| First, the raw text is split on whitespace characters, similar to
|
||||
| #[code text.split(' ')]. Then, the tokenizer processes the text from
|
||||
| left to right. On each substring, it performs two checks:
|
||||
|
||||
+list("numbers")
|
||||
+item
|
||||
| #[strong Does the substring match a tokenizer exception rule?] For
|
||||
| example, "don't" does not contain whitespace, but should be split
|
||||
| into two tokens, "do" and "n't", while "U.K." should always
|
||||
| remain one token.
|
||||
+item
|
||||
| #[strong Can a prefix, suffix or infix be split off?] For example,
|
||||
| punctuation like commas, periods, hyphens or quotes.
|
||||
|
||||
p
|
||||
| If there's a match, the rule is applied and the tokenizer continues its
|
||||
| loop, starting with the newly split substrings. This way, spaCy can split
|
||||
| #[strong complex, nested tokens] like combinations of abbreviations and
|
||||
| multiple punctuation marks.
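p
    | The loop described above can be sketched in plain Python. The helper
    | functions and data passed in below are assumptions for illustration only
    | – they are not spaCy's internals.

+code("Sketch: the tokenizer loop").
    def tokenize(text, special_cases, prefix_len, suffix_len):
        # special_cases: dict mapping strings like "don't" to lists of tokens
        # prefix_len / suffix_len: length of a matched prefix/suffix, else 0
        tokens = []
        for substring in text.split(' '):
            suffixes = []
            while substring:
                if substring in special_cases:      # tokenizer exception rule
                    tokens.extend(special_cases[substring])
                    substring = ''
                elif prefix_len(substring):         # split off a prefix, keep looping
                    n = prefix_len(substring)
                    tokens.append(substring[:n])
                    substring = substring[n:]
                elif suffix_len(substring):         # split off a suffix, keep looping
                    n = suffix_len(substring)
                    suffixes.append(substring[len(substring) - n:])
                    substring = substring[:len(substring) - n]
                else:                               # nothing left to split
                    tokens.append(substring)
                    substring = ''
            tokens.extend(reversed(suffixes))       # suffixes follow the core token
        return tokens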
|
||||
|
||||
+aside
|
||||
| #[strong Tokenizer exception:] Special-case rule to split a string into
|
||||
| several tokens or prevent a token from being split when punctuation rules
|
||||
| are applied.#[br]
|
||||
| #[strong Prefix:] Character(s) at the beginning, e.g.
|
||||
| #[code $], #[code (], #[code “], #[code ¿].#[br]
|
||||
| #[strong Suffix:] Character(s) at the end, e.g.
|
||||
| #[code km], #[code )], #[code ”], #[code !].#[br]
|
||||
| #[strong Infix:] Character(s) in between, e.g.
|
||||
| #[code -], #[code --], #[code /], #[code …].#[br]
|
||||
|
||||
+image
|
||||
include ../../../assets/img/docs/tokenization.svg
|
||||
.u-text-right
|
||||
+button("/assets/img/docs/tokenization.svg", false, "secondary").u-text-tag View large graphic
|
||||
|
||||
p
|
||||
| While punctuation rules are usually pretty general, tokenizer exceptions
|
||||
| strongly depend on the specifics of the individual language. This is
|
||||
| why each #[+a("/docs/api/language-models") available language] has its
|
||||
| own subclass like #[code English] or #[code German] that loads in lists
|
||||
| of hard-coded data and exception rules.
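p
    | For example, the language-specific rules are bundled with each subclass
    | and included when you create it directly – a minimal sketch:

+code("Using a language subclass directly").
    from spacy.lang.en import English
    from spacy.lang.de import German

    nlp_en = English()  # includes English tokenizer exceptions and punctuation rules
    nlp_de = German()   # includes German tokenizer exceptions and punctuation rules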
|
||||
|
|
|
@ -89,4 +89,6 @@ p
|
|||
|
||||
p
|
||||
| Even though both #[code Doc] objects contain the same words, the internal
|
||||
| integer IDs are very different.
|
||||
| integer IDs are very different. The same applies to all other strings,
|
||||
| like the annotation scheme. To avoid mismatched IDs, spaCy will always
|
||||
| export the vocab if you save a #[code Doc] or #[code nlp] object.
|
||||
|
|
|
@ -144,7 +144,7 @@ p
|
|||
+table(["Argument", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code vocab]
|
||||
+cell #[coce Vocab]
|
||||
+cell #[code Vocab]
|
||||
+cell
|
||||
| Shared data between components, including strings, morphology,
|
||||
| vectors etc.
|
||||
|
|
|
@ -139,6 +139,8 @@ p
|
|||
new_doc = Doc(Vocab()).from_disk('/moby_dick.bin')
|
||||
|
||||
+infobox
|
||||
| #[strong API:] #[+api("language") #[code Language]],
|
||||
| #[+api("doc") #[code Doc]]
|
||||
| #[strong Usage:] #[+a("/docs/usage/saving-loading") Saving and loading]
|
||||
|
||||
+h(2, "rule-matcher") Match text with token rules
|
||||
|
|
|
@ -345,7 +345,7 @@ p
|
|||
| account and check the #[code subtree] for intensifiers like "very", to
|
||||
| increase the sentiment score. At some point, you might also want to train
|
||||
| a sentiment model. However, the approach described in this example is
|
||||
| very useful for #[strong bootstrapping rules to gather training data].
|
||||
| very useful for #[strong bootstrapping rules to collect training data].
|
||||
| It's also an incredibly fast way to gather first insights into your data
|
||||
| – with about 1 million tweets, you'd be looking at a processing time of
|
||||
| #[strong under 1 minute].
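p
    | A hedged sketch of the subtree check mentioned above – the intensifier
    | list and the score values are made up for illustration:

+code("Sketch: boosting matches with intensifiers").
    INTENSIFIERS = set([u'very', u'really', u'totally'])

    def score_match(span, base_score=1.0):
        # walk the syntactic subtree of the match's root and boost the score
        # for each intensifier found
        for token in span.root.subtree:
            if token.lower_ in INTENSIFIERS:
                base_score *= 1.5
        return base_score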
|
||||
|
|
|
@ -65,7 +65,7 @@ p
|
|||
| spaCy provides a variety of linguistic annotations to give you insights
|
||||
| into a text's grammatical structure. This includes the word types,
|
||||
| i.e. the parts of speech, and how the words are related to each other.
|
||||
| For example, if you're analysing text, it makes a #[em huge] difference
|
||||
| For example, if you're analysing text, it makes a huge difference
|
||||
| whether a noun is the subject of a sentence, or the object – or whether
|
||||
| "google" is used as a verb, or refers to the website or company in a
|
||||
| specific context.
|
||||
|
@ -94,9 +94,10 @@ p
|
|||
include _spacy-101/_tokenization
|
||||
|
||||
+infobox
|
||||
| To learn more about how spaCy's tokenizer and its rules work in detail,
|
||||
| how to #[strong customise] it and how to #[strong add your own tokenizer]
|
||||
| to a processing pipeline, see the usage guide on
|
||||
| To learn more about how spaCy's tokenization rules work in detail,
|
||||
| how to #[strong customise and replace] the default tokenizer and how to
|
||||
| #[strong add language-specific data], see the usage guides on
|
||||
| #[+a("/docs/usage/adding-languages") adding languages] and
|
||||
| #[+a("/docs/usage/customizing-tokenizer") customising the tokenizer].
|
||||
|
||||
+h(3, "annotations-pos-deps") Part-of-speech tags and dependencies
|
||||
|
@ -118,9 +119,11 @@ include _spacy-101/_named-entities
|
|||
|
||||
+infobox
|
||||
| To learn more about entity recognition in spaCy, how to
|
||||
| #[strong add your own entities] to a document and how to train and update
|
||||
| the entity predictions of a model, see the usage guide on
|
||||
| #[+a("/docs/usage/entity-recognition") named entity recognition].
|
||||
| #[strong add your own entities] to a document and how to
|
||||
| #[strong train and update] the entity predictions of a model, see the
|
||||
| usage guides on
|
||||
| #[+a("/docs/usage/entity-recognition") named entity recognition] and
|
||||
| #[+a("/docs/usage/training-ner") training the named entity recognizer].
|
||||
|
||||
+h(2, "vectors-similarity") Word vectors and similarity
|
||||
+tag-model("vectors")
|
||||
|
|
|
@ -20,19 +20,18 @@ p
|
|||
nlp = Language(pipeline=['my_factory', mycomponent])
|
||||
|
||||
p
|
||||
| It's now much easier to customise the pipeline with your own components.
|
||||
| Components are functions that receive a #[code Doc] object, modify and
|
||||
| return it. If your component is stateful, you'll want to create a new one
|
||||
| for each pipeline. You can do that by defining and registering a factory
|
||||
| which receives the shared #[code Vocab] object and returns a component.
|
||||
|
||||
p
|
||||
| spaCy's default components – the vectorizer, tagger, parser and entity
|
||||
| recognizer, can be added to your pipeline by using their string IDs.
|
||||
| This way, you won't have to worry about finding and implementing them –
|
||||
| to use the default tagger, simply add #[code "tagger"] to the pipeline,
|
||||
| It's now much easier to #[strong customise the pipeline] with your own
|
||||
| components, functions that receive a #[code Doc] object, modify and
|
||||
| return it. If your component is stateful, you can define and register a
|
||||
| factory which receives the shared #[code Vocab] object and returns a
|
||||
| component. spaCy's default components can be added to your pipeline by
|
||||
| using their string IDs. This way, you won't have to worry about finding
|
||||
| and implementing them – simply add #[code "tagger"] to the pipeline,
|
||||
| and spaCy will know what to do.
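p
    | A hedged sketch of a custom component – a plain function that receives
    | the #[code Doc], modifies it and returns it. The component below only
    | prints; it doesn't add real annotations.

+code("Sketch: a simple custom component").
    from spacy.language import Language

    def print_length(doc):
        # a stateless component: receive the Doc, do something, return it
        print('Doc length:', len(doc))
        return doc

    nlp = Language(pipeline=['tagger', 'parser', print_length])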
|
||||
|
||||
+image
|
||||
include ../../assets/img/docs/pipeline.svg
|
||||
|
||||
+infobox
|
||||
| #[strong API:] #[+api("language") #[code Language]]
|
||||
| #[strong Usage:] #[+a("/docs/usage/language-processing-pipeline") Processing text]
|
||||
|
@ -96,11 +95,10 @@ p
|
|||
| #[code Language] class, or load a model that initialises one. This allows
|
||||
| languages to contain more custom data, e.g. lemmatizer lookup tables, or
|
||||
| complex regular expressions. The language data has also been tidied up
|
||||
| and simplified. It's now also possible to overwrite the functions that
|
||||
| compute lexical attributes like #[code like_num], and supply
|
||||
| language-specific syntax iterators, e.g. to determine noun chunks. spaCy
|
||||
| now also supports simple lookup-based lemmatization. The data is stored
|
||||
| in a dictionary mapping a string to its lemma.
|
||||
| and simplified. spaCy now also supports simple lookup-based lemmatization.
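p
    | Lookup-based lemmatization uses a dictionary mapping each string to its
    | lemma – a tiny, made-up sample:

+code("Sketch: lookup lemmatization data").
    LOOKUP = {
        u'going': u'go',
        u'went': u'go',
        u'mice': u'mouse',
        u'feet': u'foot'
    }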
|
||||
|
||||
+image
|
||||
include ../../assets/img/docs/language_data.svg
|
||||
|
||||
+infobox
|
||||
| #[strong API:] #[+api("language") #[code Language]]
|
||||
|
@ -111,13 +109,10 @@ p
|
|||
|
||||
+aside-code("Example").
|
||||
from spacy.matcher import Matcher
|
||||
from spacy.attrs import LOWER, IS_PUNCT
|
||||
matcher = Matcher(nlp.vocab)
|
||||
matcher.add('HelloWorld', None,
|
||||
[{LOWER: 'hello'}, {IS_PUNCT: True}, {LOWER: 'world'}],
|
||||
[{LOWER: 'hello'}, {LOWER: 'world'}])
|
||||
matcher.add('HEARTS', None, [{'ORTH': '❤️', 'OP': '+'}])
|
||||
assert len(matcher) == 1
|
||||
assert 'HelloWorld' in matcher
|
||||
assert 'HEARTS' in matcher
|
||||
|
||||
p
|
||||
| Patterns can now be added to the matcher by calling
|
||||
|
@ -157,28 +152,8 @@ p
|
|||
+cell #[+api("language#to_disk") #[code Language.to_disk]]
|
||||
|
||||
+row
|
||||
+cell #[code Tokenizer.load]
|
||||
+cell
|
||||
| #[+api("tokenizer#from_disk") #[code Tokenizer.from_disk]]
|
||||
| #[+api("tokenizer#from_bytes") #[code Tokenizer.from_bytes]]
|
||||
|
||||
+row
|
||||
+cell #[code Tagger.load]
|
||||
+cell
|
||||
| #[+api("tagger#from_disk") #[code Tagger.from_disk]]
|
||||
| #[+api("tagger#from_bytes") #[code Tagger.from_bytes]]
|
||||
|
||||
+row
|
||||
+cell #[code DependencyParser.load]
|
||||
+cell
|
||||
| #[+api("dependencyparser#from_disk") #[code DependencyParser.from_disk]]
|
||||
| #[+api("dependencyparser#from_bytes") #[code DependencyParser.from_bytes]]
|
||||
|
||||
+row
|
||||
+cell #[code EntityRecognizer.load]
|
||||
+cell
|
||||
| #[+api("entityrecognizer#from_disk") #[code EntityRecognizer.from_disk]]
|
||||
| #[+api("entityrecognizer#from_bytes") #[code EntityRecognizer.from_bytes]]
|
||||
+cell #[code Language.create_make_doc]
|
||||
+cell #[+api("language#attributes") #[code Language.tokenizer]]
|
||||
|
||||
+row
|
||||
+cell
|
||||
|
@ -212,6 +187,28 @@ p
|
|||
| #[+api("stringstore#to_disk") #[code StringStore.to_disk]]
|
||||
| #[+api("stringstore#to_bytes") #[code StringStore.to_bytes]]
|
||||
|
||||
+row
|
||||
+cell #[code Tokenizer.load]
|
||||
+cell -
|
||||
|
||||
+row
|
||||
+cell #[code Tagger.load]
|
||||
+cell
|
||||
| #[+api("tagger#from_disk") #[code Tagger.from_disk]]
|
||||
| #[+api("tagger#from_bytes") #[code Tagger.from_bytes]]
|
||||
|
||||
+row
|
||||
+cell #[code DependencyParser.load]
|
||||
+cell
|
||||
| #[+api("dependencyparser#from_disk") #[code DependencyParser.from_disk]]
|
||||
| #[+api("dependencyparser#from_bytes") #[code DependencyParser.from_bytes]]
|
||||
|
||||
+row
|
||||
+cell #[code EntityRecognizer.load]
|
||||
+cell
|
||||
| #[+api("entityrecognizer#from_disk") #[code EntityRecognizer.from_disk]]
|
||||
| #[+api("entityrecognizer#from_bytes") #[code EntityRecognizer.from_bytes]]
|
||||
|
||||
+row
|
||||
+cell #[code Matcher.load]
|
||||
+cell -
|
||||
|
@ -232,7 +229,7 @@ p
|
|||
|
||||
+row
|
||||
+cell #[code Doc.read_bytes]
|
||||
+cell
|
||||
+cell #[+api("binder") #[code Binder]]
|
||||
|
||||
+row
|
||||
+cell #[code Token.is_ancestor_of]
|
||||
|
|