Merge branch 'develop' of https://github.com/explosion/spaCy into develop

This commit is contained in:
Matthew Honnibal 2017-05-28 08:12:05 -05:00
commit 8a24c60c1e
58 changed files with 787 additions and 719 deletions

View File

@ -1,9 +1,6 @@
# coding: utf8
from __future__ import unicode_literals
import importlib
from .compat import basestring_
from .cli.info import info as cli_info
from .glossary import explain
from .deprecated import resolve_load_name
@ -12,14 +9,7 @@ from . import util
def load(name, **overrides):
name = resolve_load_name(name, **overrides)
model_path = util.resolve_model_path(name)
meta = util.parse_package_meta(model_path)
if 'lang' not in meta:
raise IOError('No language setting found in model meta.')
cls = util.get_lang_class(meta['lang'])
overrides['meta'] = meta
overrides['path'] = model_path
return cls(**overrides)
return util.load_model(name)
def info(model=None, markdown=False):

View File

@ -150,6 +150,9 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
else:
int_key = IDS[name.upper()]
if strings_map is not None and isinstance(value, basestring):
if hasattr(strings_map, 'add'):
value = strings_map.add(value)
else:
value = strings_map[value]
inty_attrs[int_key] = value
return inty_attrs

View File

@ -20,8 +20,14 @@ def info(cmd, model=None, markdown=False):
prints details in Markdown for easy copy-pasting to GitHub issues.
"""
if model:
model_path = util.resolve_model_path(model)
meta = util.parse_package_meta(model_path)
if util.is_package(model):
model_path = util.get_package_path(model)
else:
model_path = util.get_data_path() / model
meta_path = model_path / 'meta.json'
if not meta_path.is_file():
prints(meta_path, title="Can't find model meta.json", exits=1)
meta = read_json(meta_path)
if model_path.resolve() != model_path:
meta['link'] = path2str(model_path)
meta['source'] = path2str(model_path.resolve())

View File

@ -21,7 +21,7 @@ def link(cmd, origin, link_name, force=False):
directory. Linking models allows loading them via spacy.load(link_name).
"""
if util.is_package(origin):
model_path = util.get_model_package_path(origin)
model_path = util.get_package_path(model)
else:
model_path = Path(origin)
if not model_path.exists():

View File

@ -1,13 +1,14 @@
from cymem.cymem cimport Pool
from .structs cimport TokenC
from .typedefs cimport attr_t
from .syntax.transition_system cimport Transition
cdef struct GoldParseC:
int* tags
int* heads
int* labels
attr_t* labels
int** brackets
Transition* ner

View File

@ -384,7 +384,7 @@ cdef class GoldParse:
# These are filled by the tagger/parser/entity recogniser
self.c.tags = <int*>self.mem.alloc(len(doc), sizeof(int))
self.c.heads = <int*>self.mem.alloc(len(doc), sizeof(int))
self.c.labels = <int*>self.mem.alloc(len(doc), sizeof(int))
self.c.labels = <attr_t*>self.mem.alloc(len(doc), sizeof(attr_t))
self.c.ner = <Transition*>self.mem.alloc(len(doc), sizeof(Transition))
self.words = [None] * len(doc)

View File

@ -35,4 +35,4 @@ class English(Language):
Defaults = EnglishDefaults
__all__ = ['English', 'EnglishDefaults']
__all__ = ['English']

26
spacy/lang/xx/__init__.py Normal file
View File

@ -0,0 +1,26 @@
# coding: utf8
from __future__ import unicode_literals
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...attrs import LANG
from ...util import update_exc
class MultiLanguageDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'xx'
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
class MultiLanguage(Language):
"""Language class to be used for models that support multiple languages.
This module allows models to specify their language ID as 'xx'.
"""
lang = 'xx'
Defaults = MultiLanguageDefaults
__all__ = ['MultiLanguage']

View File

@ -215,7 +215,9 @@ class Language(object):
grads = {}
def get_grads(W, dW, key=None):
grads[key] = (W, dW)
for proc in self.pipeline[1:]:
pipes = list(self.pipeline[1:])
random.shuffle(pipes)
for proc in pipes:
if not hasattr(proc, 'update'):
continue
tokvecses, bp_tokvecses = tok2vec.model.begin_update(feats, drop=drop)

View File

@ -27,7 +27,7 @@ cdef class Lexeme:
cdef inline SerializedLexemeC c_to_bytes(const LexemeC* lex) nogil:
cdef SerializedLexemeC lex_data
buff = <const unsigned char*>&lex.flags
end = <const unsigned char*>&lex.l2_norm + sizeof(lex.l2_norm)
end = <const unsigned char*>&lex.sentiment + sizeof(lex.sentiment)
for i in range(sizeof(lex_data.data)):
lex_data.data[i] = buff[i]
return lex_data
@ -35,7 +35,7 @@ cdef class Lexeme:
@staticmethod
cdef inline void c_from_bytes(LexemeC* lex, SerializedLexemeC lex_data) nogil:
buff = <unsigned char*>&lex.flags
end = <unsigned char*>&lex.l2_norm + sizeof(lex.l2_norm)
end = <unsigned char*>&lex.sentiment + sizeof(lex.sentiment)
for i in range(sizeof(lex_data.data)):
buff[i] = lex_data.data[i]

View File

@ -35,11 +35,11 @@ cdef class Lexeme:
tag, dependency parse, or lemma (lemmatization depends on the part-of-speech
tag).
"""
def __init__(self, Vocab vocab, int orth):
def __init__(self, Vocab vocab, attr_t orth):
"""Create a Lexeme object.
vocab (Vocab): The parent vocabulary
orth (int): The orth id of the lexeme.
orth (uint64): The orth id of the lexeme.
Returns (Lexeme): The newly constructd object.
"""
self.vocab = vocab
@ -51,7 +51,7 @@ cdef class Lexeme:
if isinstance(other, Lexeme):
a = self.orth
b = other.orth
elif isinstance(other, int):
elif isinstance(other, long):
a = self.orth
b = other
elif isinstance(other, str):
@ -109,7 +109,7 @@ cdef class Lexeme:
def to_bytes(self):
lex_data = Lexeme.c_to_bytes(self.c)
start = <const char*>&self.c.flags
end = <const char*>&self.c.l2_norm + sizeof(self.c.l2_norm)
end = <const char*>&self.c.sentiment + sizeof(self.c.sentiment)
assert (end-start) == sizeof(lex_data.data), (end-start, sizeof(lex_data.data))
byte_string = b'\0' * sizeof(lex_data.data)
byte_chars = <char*>byte_string
@ -136,12 +136,7 @@ cdef class Lexeme:
RETURNS (bool): Whether a word vector is associated with the object.
"""
def __get__(self):
cdef int i
for i in range(self.vocab.vectors_length):
if self.c.vector[i] != 0:
return True
else:
return False
return self.vocab.has_vector(self.c.orth)
property vector_norm:
"""The L2 norm of the lexeme's vector representation.
@ -149,10 +144,8 @@ cdef class Lexeme:
RETURNS (float): The L2 norm of the vector representation.
"""
def __get__(self):
return self.c.l2_norm
def __set__(self, float value):
self.c.l2_norm = value
vector = self.vector
return numpy.sqrt((vector**2).sum())
property vector:
"""A real-valued meaning representation.
@ -169,27 +162,16 @@ cdef class Lexeme:
"model doesn't include word vectors. For more info, see "
"the documentation: \n%s\n" % about.__docs_models__
)
vector_view = <float[:length,]>self.c.vector
return numpy.asarray(vector_view)
return self.vocab.get_vector(self.c.orth)
def __set__(self, vector):
assert len(vector) == self.vocab.vectors_length
cdef float value
cdef double norm = 0.0
for i, value in enumerate(vector):
self.c.vector[i] = value
norm += value * value
self.c.l2_norm = sqrt(norm)
self.vocab.set_vector(self.c.orth, vector)
property rank:
def __get__(self):
return self.c.id
property repvec:
def __get__(self):
raise AttributeError("lex.repvec has been renamed to lex.vector")
property sentiment:
def __get__(self):
return self.c.sentiment
@ -210,31 +192,31 @@ cdef class Lexeme:
property lower:
def __get__(self): return self.c.lower
def __set__(self, int x): self.c.lower = x
def __set__(self, attr_t x): self.c.lower = x
property norm:
def __get__(self): return self.c.norm
def __set__(self, int x): self.c.norm = x
def __set__(self, attr_t x): self.c.norm = x
property shape:
def __get__(self): return self.c.shape
def __set__(self, int x): self.c.shape = x
def __set__(self, attr_t x): self.c.shape = x
property prefix:
def __get__(self): return self.c.prefix
def __set__(self, int x): self.c.prefix = x
def __set__(self, attr_t x): self.c.prefix = x
property suffix:
def __get__(self): return self.c.suffix
def __set__(self, int x): self.c.suffix = x
def __set__(self, attr_t x): self.c.suffix = x
property cluster:
def __get__(self): return self.c.cluster
def __set__(self, int x): self.c.cluster = x
def __set__(self, attr_t x): self.c.cluster = x
property lang:
def __get__(self): return self.c.lang
def __set__(self, int x): self.c.lang = x
def __set__(self, attr_t x): self.c.lang = x
property prob:
def __get__(self): return self.c.prob
@ -270,7 +252,7 @@ cdef class Lexeme:
property is_oov:
def __get__(self): return Lexeme.c_check_flag(self.c, IS_OOV)
def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_OOV, x)
def __set__(self, attr_t x): Lexeme.c_set_flag(self.c, IS_OOV, x)
property is_stop:
def __get__(self): return Lexeme.c_check_flag(self.c, IS_STOP)
@ -320,7 +302,6 @@ cdef class Lexeme:
def __get__(self): return Lexeme.c_check_flag(self.c, IS_RIGHT_PUNCT)
def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_RIGHT_PUNCT, x)
property like_url:
def __get__(self): return Lexeme.c_check_flag(self.c, LIKE_URL)
def __set__(self, bint x): Lexeme.c_set_flag(self.c, LIKE_URL, x)

View File

@ -154,7 +154,7 @@ def _convert_strings(token_specs, string_store):
if isinstance(attr, basestring):
attr = attrs.IDS.get(attr.upper())
if isinstance(value, basestring):
value = string_store[value]
value = string_store.add(value)
if isinstance(value, bool):
value = int(value)
if attr is not None:
@ -381,7 +381,7 @@ cdef class Matcher:
def _normalize_key(self, key):
if isinstance(key, basestring):
return self.vocab.strings[key]
return self.vocab.strings.add(key)
else:
return key
@ -469,7 +469,7 @@ cdef class PhraseMatcher:
self(doc)
yield doc
def accept_match(self, Doc doc, int ent_id, int label, int start, int end):
def accept_match(self, Doc doc, attr_t ent_id, attr_t label, int start, int end):
assert (end - start) < self.max_length
cdef int i, j
for i in range(self.max_length):

View File

@ -48,7 +48,7 @@ cdef class Morphology:
self.tag_map[tag_str] = dict(attrs)
attrs = intify_attrs(attrs, self.strings, _do_deprecated=True)
self.rich_tags[i].id = i
self.rich_tags[i].name = self.strings[tag_str]
self.rich_tags[i].name = self.strings.add(tag_str)
self.rich_tags[i].morph = 0
self.rich_tags[i].pos = attrs[POS]
self.reverse_index[self.rich_tags[i].name] = i
@ -59,10 +59,12 @@ cdef class Morphology:
cdef int assign_tag(self, TokenC* token, tag) except -1:
if isinstance(tag, basestring):
tag_id = self.reverse_index[self.strings[tag]]
else:
tag = self.strings.add(tag)
if tag in self.reverse_index:
tag_id = self.reverse_index[tag]
self.assign_tag_id(token, tag_id)
else:
token.tag = tag
cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1:
if tag_id >= self.n_tags:
@ -73,7 +75,7 @@ cdef class Morphology:
# the statistical model fails.
# Related to Issue #220
if Lexeme.c_check_flag(token.lex, IS_SPACE):
tag_id = self.reverse_index[self.strings['SP']]
tag_id = self.reverse_index[self.strings.add('SP')]
rich_tag = self.rich_tags[tag_id]
analysis = <MorphAnalysisC*>self._cache.get(tag_id, token.lex.orth)
if analysis is NULL:
@ -104,7 +106,7 @@ cdef class Morphology:
tag (unicode): The part-of-speech tag to key the exception.
orth (unicode): The word-form to key the exception.
"""
tag = self.strings[tag_str]
tag = self.strings.add(tag_str)
tag_id = self.reverse_index[tag]
orth = self.strings[orth_str]
cdef RichTagC rich_tag = self.rich_tags[tag_id]
@ -140,14 +142,14 @@ cdef class Morphology:
def lemmatize(self, const univ_pos_t univ_pos, attr_t orth, morphology):
cdef unicode py_string = self.strings[orth]
if self.lemmatizer is None:
return self.strings[py_string.lower()]
return self.strings.add(py_string.lower())
if univ_pos not in (NOUN, VERB, ADJ, PUNCT):
return self.strings[py_string.lower()]
return self.strings.add(py_string.lower())
cdef set lemma_strings
cdef unicode lemma_string
lemma_strings = self.lemmatizer(py_string, univ_pos, morphology)
lemma_string = sorted(lemma_strings)[0]
lemma = self.strings[lemma_string]
lemma = self.strings.add(lemma_string)
return lemma

View File

@ -228,6 +228,7 @@ class NeuralTagger(object):
idx += 1
correct = self.model.ops.xp.array(correct, dtype='i')
d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1])
d_scores /= d_scores.shape[0]
loss = (d_scores**2).sum()
d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
return float(loss), d_scores
@ -292,6 +293,7 @@ class NeuralLabeller(NeuralTagger):
idx += 1
correct = self.model.ops.xp.array(correct, dtype='i')
d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1])
d_scores /= d_scores.shape[0]
loss = (d_scores**2).sum()
d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
return float(loss), d_scores

View File

@ -1,4 +1,5 @@
from libc.stdint cimport int64_t
from libcpp.vector cimport vector
from cymem.cymem cimport Pool
from preshed.maps cimport PreshMap
@ -8,6 +9,9 @@ from .typedefs cimport attr_t, hash_t
cpdef hash_t hash_string(unicode string) except 0
cdef hash_t hash_utf8(char* utf8_string, int length) nogil
cdef unicode decode_Utf8Str(const Utf8Str* string)
ctypedef union Utf8Str:
@ -17,13 +21,11 @@ ctypedef union Utf8Str:
cdef class StringStore:
cdef Pool mem
cdef Utf8Str* c
cdef int64_t size
cdef bint is_frozen
cdef vector[hash_t] keys
cdef public PreshMap _map
cdef public PreshMap _oov
cdef int64_t _resize_at
cdef const Utf8Str* intern_unicode(self, unicode py_string)
cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length)

View File

@ -11,6 +11,9 @@ from libc.stdint cimport uint32_t
import ujson
import dill
from .symbols import IDS as SYMBOLS_BY_STR
from .symbols import NAMES as SYMBOLS_BY_INT
from .typedefs cimport hash_t
from . import util
@ -28,7 +31,7 @@ cdef uint32_t hash32_utf8(char* utf8_string, int length) nogil:
return hash32(utf8_string, length, 1)
cdef unicode _decode(const Utf8Str* string):
cdef unicode decode_Utf8Str(const Utf8Str* string):
cdef int i, length
if string.s[0] < sizeof(string.s) and string.s[0] != 0:
return string.s[1:string.s[0]+1].decode('utf8')
@ -45,10 +48,10 @@ cdef unicode _decode(const Utf8Str* string):
return string.p[i:length + i].decode('utf8')
cdef Utf8Str _allocate(Pool mem, const unsigned char* chars, uint32_t length) except *:
cdef Utf8Str* _allocate(Pool mem, const unsigned char* chars, uint32_t length) except *:
cdef int n_length_bytes
cdef int i
cdef Utf8Str string
cdef Utf8Str* string = <Utf8Str*>mem.alloc(1, sizeof(Utf8Str))
cdef uint32_t ulength = length
if length < sizeof(string.s):
string.s[0] = <unsigned char>length
@ -73,7 +76,7 @@ cdef Utf8Str _allocate(Pool mem, const unsigned char* chars, uint32_t length) ex
cdef class StringStore:
"""Map strings to and from integer IDs."""
"""Lookup strings by 64-bit hash"""
def __init__(self, strings=None, freeze=False):
"""Create the StringStore.
@ -83,70 +86,66 @@ cdef class StringStore:
self.mem = Pool()
self._map = PreshMap()
self._oov = PreshMap()
self._resize_at = 10000
self.c = <Utf8Str*>self.mem.alloc(self._resize_at, sizeof(Utf8Str))
self.size = 1
self.is_frozen = freeze
if strings is not None:
for string in strings:
_ = self[string]
self.add(string)
property size:
def __get__(self):
return self.size -1
def __getitem__(self, object string_or_id):
"""Retrieve a string from a given hash ID, or vice versa.
string_or_id (bytes or unicode or uint64): The value to encode.
Returns (unicode or uint64): The value to be retrieved.
"""
if isinstance(string_or_id, basestring) and len(string_or_id) == 0:
return 0
elif string_or_id == 0:
return u''
elif string_or_id in SYMBOLS_BY_STR:
return SYMBOLS_BY_STR[string_or_id]
cdef hash_t key
if isinstance(string_or_id, unicode):
key = hash_string(string_or_id)
return key
elif isinstance(string_or_id, bytes):
key = hash_utf8(string_or_id, len(string_or_id))
return key
else:
if string_or_id < len(SYMBOLS_BY_INT):
return SYMBOLS_BY_INT[string_or_id]
key = string_or_id
utf8str = <Utf8Str*>self._map.get(key)
if utf8str is NULL:
raise KeyError(string_or_id)
else:
return decode_Utf8Str(utf8str)
def add(self, string):
if isinstance(string, unicode):
if string in SYMBOLS_BY_STR:
return SYMBOLS_BY_STR[string]
key = hash_string(string)
self.intern_unicode(string)
elif isinstance(string, bytes):
if string in SYMBOLS_BY_STR:
return SYMBOLS_BY_STR[string]
key = hash_utf8(string, len(string))
self._intern_utf8(string, len(string))
else:
raise TypeError(
"Can only add unicode or bytes. Got type: %s" % type(string))
return key
def __len__(self):
"""The number of strings in the store.
RETURNS (int): The number of strings in the store.
"""
return self.size-1
return self.keys.size()
def __getitem__(self, object string_or_id):
"""Retrieve a string from a given integer ID, or vice versa.
string_or_id (bytes or unicode or int): The value to encode.
Returns (unicode or int): The value to be retrieved.
"""
if isinstance(string_or_id, basestring) and len(string_or_id) == 0:
return 0
elif string_or_id == 0:
return u''
cdef bytes byte_string
cdef const Utf8Str* utf8str
cdef uint64_t int_id
cdef uint32_t oov_id
if isinstance(string_or_id, (int, long)):
int_id = string_or_id
oov_id = string_or_id
if int_id < <uint64_t>self.size:
return _decode(&self.c[int_id])
else:
utf8str = <Utf8Str*>self._oov.get(oov_id)
if utf8str is not NULL:
return _decode(utf8str)
else:
raise IndexError(string_or_id)
else:
if isinstance(string_or_id, bytes):
byte_string = <bytes>string_or_id
elif isinstance(string_or_id, unicode):
byte_string = (<unicode>string_or_id).encode('utf8')
else:
raise TypeError(type(string_or_id))
utf8str = self._intern_utf8(byte_string, len(byte_string))
if utf8str is NULL:
# TODO: We need to use 32 bit here, for compatibility with the
# vocabulary values. This makes birthday paradox probabilities
# pretty bad.
# We could also get unlucky here, and hash into a value that
# collides with the 'real' strings.
return hash32_utf8(byte_string, len(byte_string))
else:
return utf8str - self.c
def __contains__(self, unicode string not None):
def __contains__(self, string not None):
"""Check whether a string is in the store.
string (unicode): The string to check.
@ -154,7 +153,11 @@ cdef class StringStore:
"""
if len(string) == 0:
return True
cdef hash_t key = hash_string(string)
if string in SYMBOLS_BY_STR:
return True
if isinstance(string, unicode):
string = string.encode('utf8')
cdef hash_t key = hash_utf8(string, len(string))
return self._map.get(key) is not NULL
def __iter__(self):
@ -163,16 +166,15 @@ cdef class StringStore:
YIELDS (unicode): A string in the store.
"""
cdef int i
for i in range(self.size):
yield _decode(&self.c[i]) if i > 0 else u''
cdef hash_t key
for i in range(self.keys.size()):
key = self.keys[i]
utf8str = <Utf8Str*>self._map.get(key)
yield decode_Utf8Str(utf8str)
# TODO: Iterate OOV here?
def __reduce__(self):
strings = [""]
for i in range(1, self.size):
string = &self.c[i]
py_string = _decode(string)
strings.append(py_string)
strings = list(self)
return (StringStore, (strings,), None, None, None)
def to_disk(self, path):
@ -230,11 +232,9 @@ cdef class StringStore:
self.mem = Pool()
self._map = PreshMap()
self._oov = PreshMap()
self._resize_at = 10000
self.c = <Utf8Str*>self.mem.alloc(self._resize_at, sizeof(Utf8Str))
self.size = 1
self.keys.clear()
for string in strings:
_ = self[string]
self.add(string)
self.is_frozen = freeze
cdef const Utf8Str* intern_unicode(self, unicode py_string):
@ -258,39 +258,11 @@ cdef class StringStore:
key32 = hash32_utf8(utf8_string, length)
# Important: Make the OOV store own the memory. That way it's trivial
# to flush them all.
value = <Utf8Str*>self._oov.mem.alloc(1, sizeof(Utf8Str))
value[0] = _allocate(self._oov.mem, <unsigned char*>utf8_string, length)
value = _allocate(self._oov.mem, <unsigned char*>utf8_string, length)
self._oov.set(key32, value)
return NULL
if self.size == self._resize_at:
self._realloc()
self.c[self.size] = _allocate(self.mem, <unsigned char*>utf8_string, length)
self._map.set(key, <void*>&self.c[self.size])
self.size += 1
return &self.c[self.size-1]
def _realloc(self):
# We want to map straight to pointers, but they'll be invalidated if
# we resize our array. So, first we remap to indices, then we resize,
# then we can acquire the new pointers.
cdef Pool tmp_mem = Pool()
keys = <key_t*>tmp_mem.alloc(self.size, sizeof(key_t))
cdef key_t key
cdef void* value
cdef const Utf8Str ptr
cdef int i = 0
cdef size_t offset
while map_iter(self._map.c_map, &i, &key, &value):
# Find array index with pointer arithmetic
offset = ((<Utf8Str*>value) - self.c)
keys[offset] = key
self._resize_at *= 2
cdef size_t new_size = self._resize_at * sizeof(Utf8Str)
self.c = <Utf8Str*>self.mem.realloc(self.c, new_size)
self._map = PreshMap(self.size)
for i in range(self.size):
if keys[i]:
self._map.set(keys[i], &self.c[i])
value = _allocate(self.mem, <unsigned char*>utf8_string, length)
self._map.set(key, value)
self.keys.push_back(key)
return value

View File

@ -5,8 +5,6 @@ from .parts_of_speech cimport univ_pos_t
cdef struct LexemeC:
float* vector
flags_t flags
attr_t lang
@ -25,11 +23,10 @@ cdef struct LexemeC:
float prob
float sentiment
float l2_norm
cdef struct SerializedLexemeC:
unsigned char[4*13 + 8] data
unsigned char[8 + 8*10 + 4 + 4] data
# sizeof(flags_t) # flags
# + sizeof(attr_t) # lang
# + sizeof(attr_t) # id
@ -50,7 +47,7 @@ cdef struct Entity:
hash_t id
int start
int end
int label
attr_t label
cdef struct TokenC:
@ -58,12 +55,12 @@ cdef struct TokenC:
uint64_t morph
univ_pos_t pos
bint spacy
int tag
attr_t tag
int idx
int lemma
int sense
attr_t lemma
attr_t sense
int head
int dep
attr_t dep
bint sent_start
uint32_t l_kids
@ -72,5 +69,5 @@ cdef struct TokenC:
uint32_t r_edge
int ent_iob
int ent_type # TODO: Is there a better way to do this? Multiple sources of truth..
attr_t ent_type # TODO: Is there a better way to do this? Multiple sources of truth..
hash_t ent_id

View File

@ -3,6 +3,7 @@ from cymem.cymem cimport Pool
from thinc.typedefs cimport weight_t
from .stateclass cimport StateClass
from ..typedefs cimport attr_t
from .transition_system cimport TransitionSystem, Transition
from ..gold cimport GoldParseC

View File

@ -99,7 +99,7 @@ cdef bint arc_is_gold(const GoldParseC* gold, int head, int child) nogil:
return False
cdef bint label_is_gold(const GoldParseC* gold, int head, int child, int label) nogil:
cdef bint label_is_gold(const GoldParseC* gold, int head, int child, attr_t label) nogil:
if gold.labels[child] == -1:
return True
elif label == -1:
@ -116,16 +116,16 @@ cdef bint _is_gold_root(const GoldParseC* gold, int word) nogil:
cdef class Shift:
@staticmethod
cdef bint is_valid(const StateC* st, int label) nogil:
cdef bint is_valid(const StateC* st, attr_t label) nogil:
return st.buffer_length() >= 2 and not st.shifted[st.B(0)] and not st.B_(0).sent_start
@staticmethod
cdef int transition(StateC* st, int label) nogil:
cdef int transition(StateC* st, attr_t label) nogil:
st.push()
st.fast_forward()
@staticmethod
cdef weight_t cost(StateClass st, const GoldParseC* gold, int label) nogil:
cdef weight_t cost(StateClass st, const GoldParseC* gold, attr_t label) nogil:
return Shift.move_cost(st, gold) + Shift.label_cost(st, gold, label)
@staticmethod
@ -133,17 +133,17 @@ cdef class Shift:
return push_cost(s, gold, s.B(0))
@staticmethod
cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, int label) nogil:
cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
return 0
cdef class Reduce:
@staticmethod
cdef bint is_valid(const StateC* st, int label) nogil:
cdef bint is_valid(const StateC* st, attr_t label) nogil:
return st.stack_depth() >= 2
@staticmethod
cdef int transition(StateC* st, int label) nogil:
cdef int transition(StateC* st, attr_t label) nogil:
if st.has_head(st.S(0)):
st.pop()
else:
@ -151,7 +151,7 @@ cdef class Reduce:
st.fast_forward()
@staticmethod
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
return Reduce.move_cost(s, gold) + Reduce.label_cost(s, gold, label)
@staticmethod
@ -170,23 +170,23 @@ cdef class Reduce:
return cost
@staticmethod
cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, int label) nogil:
cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
return 0
cdef class LeftArc:
@staticmethod
cdef bint is_valid(const StateC* st, int label) nogil:
cdef bint is_valid(const StateC* st, attr_t label) nogil:
return not st.B_(0).sent_start
@staticmethod
cdef int transition(StateC* st, int label) nogil:
cdef int transition(StateC* st, attr_t label) nogil:
st.add_arc(st.B(0), st.S(0), label)
st.pop()
st.fast_forward()
@staticmethod
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
return LeftArc.move_cost(s, gold) + LeftArc.label_cost(s, gold, label)
@staticmethod
@ -204,23 +204,23 @@ cdef class LeftArc:
return cost + pop_cost(s, gold, s.S(0)) + arc_cost(s, gold, s.B(0), s.S(0))
@staticmethod
cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, int label) nogil:
cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
return arc_is_gold(gold, s.B(0), s.S(0)) and not label_is_gold(gold, s.B(0), s.S(0), label)
cdef class RightArc:
@staticmethod
cdef bint is_valid(const StateC* st, int label) nogil:
cdef bint is_valid(const StateC* st, attr_t label) nogil:
return not st.B_(0).sent_start
@staticmethod
cdef int transition(StateC* st, int label) nogil:
cdef int transition(StateC* st, attr_t label) nogil:
st.add_arc(st.S(0), st.B(0), label)
st.push()
st.fast_forward()
@staticmethod
cdef inline weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
cdef inline weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
return RightArc.move_cost(s, gold) + RightArc.label_cost(s, gold, label)
@staticmethod
@ -233,13 +233,13 @@ cdef class RightArc:
return push_cost(s, gold, s.B(0)) + arc_cost(s, gold, s.S(0), s.B(0))
@staticmethod
cdef weight_t label_cost(StateClass s, const GoldParseC* gold, int label) nogil:
cdef weight_t label_cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
return arc_is_gold(gold, s.S(0), s.B(0)) and not label_is_gold(gold, s.S(0), s.B(0), label)
cdef class Break:
@staticmethod
cdef bint is_valid(const StateC* st, int label) nogil:
cdef bint is_valid(const StateC* st, attr_t label) nogil:
cdef int i
if not USE_BREAK:
return False
@ -251,12 +251,12 @@ cdef class Break:
return True
@staticmethod
cdef int transition(StateC* st, int label) nogil:
cdef int transition(StateC* st, attr_t label) nogil:
st.set_break(st.B_(0).l_edge)
st.fast_forward()
@staticmethod
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
return Break.move_cost(s, gold) + Break.label_cost(s, gold, label)
@staticmethod
@ -281,7 +281,7 @@ cdef class Break:
return cost + 1
@staticmethod
cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, int label) nogil:
cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
return 0
cdef int _get_root(int word, const GoldParseC* gold) nogil:
@ -295,9 +295,7 @@ cdef int _get_root(int word, const GoldParseC* gold) nogil:
cdef void* _init_state(Pool mem, int length, void* tokens) except NULL:
cdef StateClass st = StateClass.init(<const TokenC*>tokens, length)
# Ensure sent_start is set to 0 throughout
for i in range(st.c.length):
st.c._sent[i].sent_start = False
st.c._sent[i].l_edge = i
st.c._sent[i].r_edge = i
st.fast_forward()
@ -371,7 +369,7 @@ cdef class ArcEager(TransitionSystem):
if label.upper() == 'ROOT':
label = 'ROOT'
gold.c.heads[i] = gold.heads[i]
gold.c.labels[i] = self.strings[label]
gold.c.labels[i] = self.strings.add(label)
return gold
cdef Transition lookup_transition(self, object name) except *:
@ -386,14 +384,14 @@ cdef class ArcEager(TransitionSystem):
if self.c[i].move == move and self.c[i].label == label:
return self.c[i]
def move_name(self, int move, int label):
def move_name(self, int move, attr_t label):
label_str = self.strings[label]
if label_str:
return MOVE_NAMES[move] + '-' + label_str
else:
return MOVE_NAMES[move]
cdef Transition init_transition(self, int clas, int move, int label) except *:
cdef Transition init_transition(self, int clas, int move, attr_t label) except *:
# TODO: Apparent Cython bug here when we try to use the Transition()
# constructor with the function pointers
cdef Transition t
@ -426,9 +424,7 @@ cdef class ArcEager(TransitionSystem):
return t
cdef int initialize_state(self, StateC* st) nogil:
# Ensure sent_start is set to 0 throughout
for i in range(st.length):
st._sent[i].sent_start = False
st._sent[i].l_edge = i
st._sent[i].r_edge = i
st.fast_forward()
@ -473,7 +469,7 @@ cdef class ArcEager(TransitionSystem):
label_cost_funcs[RIGHT] = RightArc.label_cost
label_cost_funcs[BREAK] = Break.label_cost
cdef int* labels = gold.c.labels
cdef attr_t* labels = gold.c.labels
cdef int* heads = gold.c.heads
n_gold = 0

View File

@ -1,6 +1,7 @@
from .transition_system cimport TransitionSystem
from .transition_system cimport Transition
from ..gold cimport GoldParseC
from ..typedefs cimport attr_t
cdef class BiluoPushDown(TransitionSystem):

View File

@ -100,7 +100,7 @@ cdef class BiluoPushDown(TransitionSystem):
def __get__(self):
return (BEGIN, IN, LAST, UNIT, OUT)
def move_name(self, int move, int label):
def move_name(self, int move, attr_t label):
if move == OUT:
return 'O'
elif move == MISSING:
@ -132,7 +132,7 @@ cdef class BiluoPushDown(TransitionSystem):
if label_str.startswith('!'):
label_str = label_str[1:]
move_str = 'x'
label = self.strings[label_str]
label = self.strings.add(label_str)
else:
move_str = name
label = 0
@ -145,7 +145,7 @@ cdef class BiluoPushDown(TransitionSystem):
else:
raise KeyError(name)
cdef Transition init_transition(self, int clas, int move, int label) except *:
cdef Transition init_transition(self, int clas, int move, attr_t label) except *:
# TODO: Apparent Cython bug here when we try to use the Transition()
# constructor with the function pointers
cdef Transition t
@ -194,21 +194,21 @@ cdef class BiluoPushDown(TransitionSystem):
cdef class Missing:
@staticmethod
cdef bint is_valid(const StateC* st, int label) nogil:
cdef bint is_valid(const StateC* st, attr_t label) nogil:
return False
@staticmethod
cdef int transition(StateC* s, int label) nogil:
cdef int transition(StateC* s, attr_t label) nogil:
pass
@staticmethod
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
return 9000
cdef class Begin:
@staticmethod
cdef bint is_valid(const StateC* st, int label) nogil:
cdef bint is_valid(const StateC* st, attr_t label) nogil:
# Ensure we don't clobber preset entities. If no entity preset,
# ent_iob is 0
cdef int preset_ent_iob = st.B_(0).ent_iob
@ -232,14 +232,14 @@ cdef class Begin:
return label != 0 and not st.entity_is_open()
@staticmethod
cdef int transition(StateC* st, int label) nogil:
cdef int transition(StateC* st, attr_t label) nogil:
st.open_ent(label)
st.set_ent_tag(st.B(0), 3, label)
st.push()
st.pop()
@staticmethod
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
cdef int g_act = gold.ner[s.B(0)].move
cdef int g_tag = gold.ner[s.B(0)].label
@ -261,7 +261,7 @@ cdef class Begin:
cdef class In:
@staticmethod
cdef bint is_valid(const StateC* st, int label) nogil:
cdef bint is_valid(const StateC* st, attr_t label) nogil:
cdef int preset_ent_iob = st.B_(0).ent_iob
if preset_ent_iob == 2:
return False
@ -277,17 +277,17 @@ cdef class In:
return st.entity_is_open() and label != 0 and st.E_(0).ent_type == label
@staticmethod
cdef int transition(StateC* st, int label) nogil:
cdef int transition(StateC* st, attr_t label) nogil:
st.set_ent_tag(st.B(0), 1, label)
st.push()
st.pop()
@staticmethod
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
move = IN
cdef int next_act = gold.ner[s.B(1)].move if s.B(0) < s.c.length else OUT
cdef int g_act = gold.ner[s.B(0)].move
cdef int g_tag = gold.ner[s.B(0)].label
cdef attr_t g_tag = gold.ner[s.B(0)].label
cdef bint is_sunk = _entity_is_sunk(s, gold.ner)
if g_act == MISSING:
@ -313,24 +313,24 @@ cdef class In:
cdef class Last:
@staticmethod
cdef bint is_valid(const StateC* st, int label) nogil:
cdef bint is_valid(const StateC* st, attr_t label) nogil:
if st.B_(1).ent_iob == 1:
return False
return st.entity_is_open() and label != 0 and st.E_(0).ent_type == label
@staticmethod
cdef int transition(StateC* st, int label) nogil:
cdef int transition(StateC* st, attr_t label) nogil:
st.close_ent()
st.set_ent_tag(st.B(0), 1, label)
st.push()
st.pop()
@staticmethod
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
move = LAST
cdef int g_act = gold.ner[s.B(0)].move
cdef int g_tag = gold.ner[s.B(0)].label
cdef attr_t g_tag = gold.ner[s.B(0)].label
if g_act == MISSING:
return 0
@ -355,7 +355,7 @@ cdef class Last:
cdef class Unit:
@staticmethod
cdef bint is_valid(const StateC* st, int label) nogil:
cdef bint is_valid(const StateC* st, attr_t label) nogil:
cdef int preset_ent_iob = st.B_(0).ent_iob
if preset_ent_iob == 2:
return False
@ -368,7 +368,7 @@ cdef class Unit:
return label != 0 and not st.entity_is_open()
@staticmethod
cdef int transition(StateC* st, int label) nogil:
cdef int transition(StateC* st, attr_t label) nogil:
st.open_ent(label)
st.close_ent()
st.set_ent_tag(st.B(0), 3, label)
@ -376,9 +376,9 @@ cdef class Unit:
st.pop()
@staticmethod
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
cdef int g_act = gold.ner[s.B(0)].move
cdef int g_tag = gold.ner[s.B(0)].label
cdef attr_t g_tag = gold.ner[s.B(0)].label
if g_act == MISSING:
return 0
@ -398,7 +398,7 @@ cdef class Unit:
cdef class Out:
@staticmethod
cdef bint is_valid(const StateC* st, int label) nogil:
cdef bint is_valid(const StateC* st, attr_t label) nogil:
cdef int preset_ent_iob = st.B_(0).ent_iob
if preset_ent_iob == 3:
return False
@ -407,15 +407,15 @@ cdef class Out:
return not st.entity_is_open()
@staticmethod
cdef int transition(StateC* st, int label) nogil:
cdef int transition(StateC* st, attr_t label) nogil:
st.set_ent_tag(st.B(0), 2, 0)
st.push()
st.pop()
@staticmethod
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
cdef int g_act = gold.ner[s.B(0)].move
cdef int g_tag = gold.ner[s.B(0)].label
cdef attr_t g_tag = gold.ner[s.B(0)].label
if g_act == MISSING or g_act == ISNT:
return 0

View File

@ -428,7 +428,7 @@ cdef class Parser:
cuda_stream = get_cuda_stream()
states, golds, max_length = self._init_gold_batch(docs, golds)
states, golds, max_steps = self._init_gold_batch(docs, golds)
state2vec, vec2scores = self.get_batch_model(len(states), tokvecs, cuda_stream,
0.0)
todo = [(s, g) for (s, g) in zip(states, golds)
@ -439,6 +439,7 @@ cdef class Parser:
backprops = []
d_tokvecs = state2vec.ops.allocate(tokvecs.shape)
cdef float loss = 0.
n_steps = 0
while todo:
states, golds = zip(*todo)
@ -450,7 +451,7 @@ cdef class Parser:
scores, bp_scores = vec2scores.begin_update(vector, drop=drop)
d_scores = self.get_batch_loss(states, golds, scores)
d_vector = bp_scores(d_scores, sgd=sgd)
d_vector = bp_scores(d_scores / d_scores.shape[0], sgd=sgd)
if drop != 0:
d_vector *= mask
@ -468,7 +469,8 @@ cdef class Parser:
todo = [st for st in todo if not st[0].is_final()]
if losses is not None:
losses[self.name] += (d_scores**2).sum()
if len(backprops) >= (max_length * 2):
n_steps += 1
if n_steps >= max_steps:
break
self._make_updates(d_tokvecs,
backprops, sgd, cuda_stream)
@ -483,7 +485,8 @@ cdef class Parser:
StateClass state
Transition action
whole_states = self.moves.init_batch(whole_docs)
max_length = max(5, min(20, min([len(doc) for doc in whole_docs])))
max_length = max(5, min(50, min([len(doc) for doc in whole_docs])))
max_moves = 0
states = []
golds = []
for doc, state, gold in zip(whole_docs, whole_states, whole_golds):
@ -494,16 +497,20 @@ cdef class Parser:
start = 0
while start < len(doc):
state = state.copy()
n_moves = 0
while state.B(0) < start and not state.is_final():
action = self.moves.c[oracle_actions.pop(0)]
action.do(state.c, action.label)
n_moves += 1
has_gold = self.moves.has_gold(gold, start=start,
end=start+max_length)
if not state.is_final() and has_gold:
states.append(state)
golds.append(gold)
max_moves = max(max_moves, n_moves)
start += min(max_length, len(doc)-start)
return states, golds, max_length
max_moves = max(max_moves, len(oracle_actions))
return states, golds, max_moves
def _make_updates(self, d_tokvecs, backprops, sgd, cuda_stream=None):
# Tells CUDA to block, so our async copies complete.

View File

@ -1,6 +1,7 @@
from cymem.cymem cimport Pool
from thinc.typedefs cimport weight_t
from ..typedefs cimport attr_t
from ..structs cimport TokenC
from ..gold cimport GoldParse
from ..gold cimport GoldParseC
@ -13,20 +14,22 @@ from ._state cimport StateC
cdef struct Transition:
int clas
int move
int label
attr_t label
weight_t score
bint (*is_valid)(const StateC* state, int label) nogil
weight_t (*get_cost)(StateClass state, const GoldParseC* gold, int label) nogil
int (*do)(StateC* state, int label) nogil
bint (*is_valid)(const StateC* state, attr_t label) nogil
weight_t (*get_cost)(StateClass state, const GoldParseC* gold, attr_t label) nogil
int (*do)(StateC* state, attr_t label) nogil
ctypedef weight_t (*get_cost_func_t)(StateClass state, const GoldParseC* gold, int label) nogil
ctypedef weight_t (*get_cost_func_t)(StateClass state, const GoldParseC* gold,
attr_tlabel) nogil
ctypedef weight_t (*move_cost_func_t)(StateClass state, const GoldParseC* gold) nogil
ctypedef weight_t (*label_cost_func_t)(StateClass state, const GoldParseC* gold, int label) nogil
ctypedef weight_t (*label_cost_func_t)(StateClass state, const GoldParseC*
gold, attr_t label) nogil
ctypedef int (*do_func_t)(StateC* state, int label) nogil
ctypedef int (*do_func_t)(StateC* state, attr_t label) nogil
ctypedef void* (*init_state_t)(Pool mem, int length, void* tokens) except NULL
@ -36,7 +39,7 @@ cdef class TransitionSystem:
cdef Transition* c
cdef readonly int n_moves
cdef int _size
cdef public int root_label
cdef public attr_t root_label
cdef public freqs
cdef init_state_t init_beam_state
@ -45,7 +48,7 @@ cdef class TransitionSystem:
cdef Transition lookup_transition(self, object name) except *
cdef Transition init_transition(self, int clas, int move, int label) except *
cdef Transition init_transition(self, int clas, int move, attr_t label) except *
cdef int set_valid(self, int* output, const StateC* st) nogil

View File

@ -99,7 +99,7 @@ cdef class TransitionSystem:
cdef Transition lookup_transition(self, object name) except *:
raise NotImplementedError
cdef Transition init_transition(self, int clas, int move, int label) except *:
cdef Transition init_transition(self, int clas, int move, attr_t label) except *:
raise NotImplementedError
def is_valid(self, StateClass stcls, move_name):

View File

@ -204,6 +204,7 @@ def test_doc_api_right_edge(en_tokenizer):
assert doc[6].right_edge.text == ','
@pytest.mark.xfail
@pytest.mark.parametrize('text,vectors', [
("apple orange pear", ["apple -1 -1 -1", "orange -1 -1 0", "pear -1 0 -1"])
])

View File

@ -20,7 +20,7 @@ def test_doc_noun_chunks_not_nested(en_tokenizer):
tokens.from_array(
[HEAD, DEP],
numpy.asarray([[1, nsubj], [0, root], [4, amod], [3, nmod], [-1, cc],
[-2, conj], [-5, dobj]], dtype='int32'))
[-2, conj], [-5, dobj]], dtype='uint64'))
tokens.noun_chunks_iterator = english_noun_chunks
word_occurred = {}
for chunk in tokens.noun_chunks:

View File

@ -68,6 +68,7 @@ def test_doc_token_api_is_properties(en_vocab):
assert doc[5].like_email
@pytest.mark.xfail
@pytest.mark.parametrize('text,vectors', [
("apples oranges ldskbjls", ["apples -1 -1 -1", "oranges -1 -1 0"])
])

View File

@ -15,7 +15,9 @@ def test_issue615(en_tokenizer):
# Get Span objects
spans = [(ent_id, ent_id, doc[start : end]) for ent_id, start, end in matches]
for ent_id, label, span in spans:
span.merge('NNP' if label else span.root.tag_, span.text, doc.vocab.strings[label])
span.merge(tag='NNP' if label else span.root.tag_, lemma=span.text,
label=label)
doc.ents = doc.ents + ((label, span.start, span.end),)
text = "The golf club is broken"
pattern = [{'ORTH': "golf"}, {'ORTH': "club"}]
@ -25,6 +27,7 @@ def test_issue615(en_tokenizer):
matcher = Matcher(doc.vocab)
matcher.add(label, merge_phrases, pattern)
match = matcher(doc)
print(match)
entities = list(doc.ents)
assert entities != [] #assertion 1

View File

@ -1,5 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
import pytest
word2vec_str = """, -0.046107 -0.035951 -0.560418
@ -8,6 +9,7 @@ de -0.648927 -0.400976 -0.527124
\u00A0 -1.499184 -0.184280 -0.598371"""
@pytest.mark.xfail
def test_issue834(en_vocab, text_file):
"""Test that no-break space (U+00A0) is detected as space by the load_vectors function."""
text_file.write(word2vec_str)

View File

@ -7,6 +7,7 @@ from __future__ import unicode_literals
import pytest
@pytest.mark.xfail
@pytest.mark.parametrize('text', [["a", "b", "c"]])
def test_stringstore_freeze_oov(stringstore, text):
assert stringstore[text[0]] == 1

View File

@ -8,69 +8,65 @@ import pytest
@pytest.mark.parametrize('text1,text2,text3', [(b'Hello', b'goodbye', b'hello')])
def test_stringstore_save_bytes(stringstore, text1, text2, text3):
i = stringstore[text1]
assert i == 1
assert stringstore[text1] == 1
assert stringstore[text2] != i
assert stringstore[text3] != i
assert i == 1
key = stringstore.add(text1)
assert stringstore[text1] == key
assert stringstore[text2] != key
assert stringstore[text3] != key
@pytest.mark.parametrize('text1,text2,text3', [('Hello', 'goodbye', 'hello')])
def test_stringstore_save_unicode(stringstore, text1, text2, text3):
i = stringstore[text1]
assert i == 1
assert stringstore[text1] == 1
assert stringstore[text2] != i
assert stringstore[text3] != i
assert i == 1
key = stringstore.add(text1)
assert stringstore[text1] == key
assert stringstore[text2] != key
assert stringstore[text3] != key
@pytest.mark.parametrize('text', [b'A'])
def test_stringstore_retrieve_id(stringstore, text):
i = stringstore[text]
assert stringstore.size == 1
assert stringstore[1] == text.decode('utf8')
with pytest.raises(IndexError):
stringstore[2]
key = stringstore.add(text)
assert len(stringstore) == 1
assert stringstore[key] == text.decode('utf8')
with pytest.raises(KeyError):
stringstore[20000]
@pytest.mark.parametrize('text1,text2', [(b'0123456789', b'A')])
def test_stringstore_med_string(stringstore, text1, text2):
store = stringstore[text1]
store = stringstore.add(text1)
assert stringstore[store] == text1.decode('utf8')
dummy = stringstore[text2]
dummy = stringstore.add(text2)
assert stringstore[text1] == store
def test_stringstore_long_string(stringstore):
text = "INFORMATIVE](http://www.google.com/search?as_q=RedditMonkey&amp;hl=en&amp;num=50&amp;btnG=Google+Search&amp;as_epq=&amp;as_oq=&amp;as_eq=&amp;lr=&amp;as_ft=i&amp;as_filetype=&amp;as_qdr=all&amp;as_nlo=&amp;as_nhi=&amp;as_occt=any&amp;as_dt=i&amp;as_sitesearch=&amp;as_rights=&amp;safe=off"
store = stringstore[text]
store = stringstore.add(text)
assert stringstore[store] == text
@pytest.mark.parametrize('factor', [254, 255, 256])
def test_stringstore_multiply(stringstore, factor):
text = 'a' * factor
store = stringstore[text]
store = stringstore.add(text)
assert stringstore[store] == text
def test_stringstore_massive_strings(stringstore):
text = 'a' * 511
store = stringstore[text]
store = stringstore.add(text)
assert stringstore[store] == text
text2 = 'z' * 512
store = stringstore[text2]
store = stringstore.add(text2)
assert stringstore[store] == text2
text3 = '1' * 513
store = stringstore[text3]
store = stringstore.add(text3)
assert stringstore[store] == text3
@pytest.mark.parametrize('text', ["qqqqq"])
def test_stringstore_to_bytes(stringstore, text):
store = stringstore[text]
store = stringstore.add(text)
serialized = stringstore.to_bytes()
new_stringstore = StringStore().from_bytes(serialized)
assert new_stringstore[store] == text

View File

@ -10,8 +10,11 @@ import numpy
def get_doc(vocab, words=[], pos=None, heads=None, deps=None, tags=None, ents=None):
"""Create Doc object from given vocab, words and annotations."""
pos = pos or [''] * len(words)
tags = tags or [''] * len(words)
heads = heads or [0] * len(words)
deps = deps or [''] * len(words)
for value in (deps+tags+pos):
vocab.strings.add(value)
doc = Doc(vocab, words=words)
attrs = doc.to_array([POS, HEAD, DEP])

View File

@ -16,7 +16,7 @@ def vectors():
def vocab(en_vocab, vectors):
return add_vecs_to_vocab(en_vocab, vectors)
@pytest.mark.xfail
def test_vectors_similarity_LL(vocab, vectors):
[(word1, vec1), (word2, vec2)] = vectors
lex1 = vocab[word1]
@ -30,6 +30,7 @@ def test_vectors_similarity_LL(vocab, vectors):
assert numpy.isclose(lex2.similarity(lex2), lex1.similarity(lex1))
@pytest.mark.xfail
def test_vectors_similarity_TT(vocab, vectors):
[(word1, vec1), (word2, vec2)] = vectors
doc = get_doc(vocab, words=[word1, word2])
@ -42,18 +43,21 @@ def test_vectors_similarity_TT(vocab, vectors):
assert numpy.isclose(doc[1].similarity(doc[0]), doc[0].similarity(doc[1]))
@pytest.mark.xfail
def test_vectors_similarity_TD(vocab, vectors):
[(word1, vec1), (word2, vec2)] = vectors
doc = get_doc(vocab, words=[word1, word2])
assert doc.similarity(doc[0]) == doc[0].similarity(doc)
@pytest.mark.xfail
def test_vectors_similarity_DS(vocab, vectors):
[(word1, vec1), (word2, vec2)] = vectors
doc = get_doc(vocab, words=[word1, word2])
assert doc.similarity(doc[:2]) == doc[:2].similarity(doc)
@pytest.mark.xfail
def test_vectors_similarity_TS(vocab, vectors):
[(word1, vec1), (word2, vec2)] = vectors
doc = get_doc(vocab, words=[word1, word2])

View File

@ -22,6 +22,7 @@ def tokenizer_v(vocab):
return Tokenizer(vocab, {}, None, None, None)
@pytest.mark.xfail
@pytest.mark.parametrize('text', ["apple and orange"])
def test_vectors_token_vector(tokenizer_v, vectors, text):
doc = tokenizer_v(text)
@ -29,6 +30,7 @@ def test_vectors_token_vector(tokenizer_v, vectors, text):
assert vectors[1] == (doc[2].text, list(doc[2].vector))
@pytest.mark.xfail
@pytest.mark.parametrize('text', ["apple", "orange"])
def test_vectors_lexeme_vector(vocab, text):
lex = vocab[text]
@ -36,6 +38,7 @@ def test_vectors_lexeme_vector(vocab, text):
assert lex.vector_norm
@pytest.mark.xfail
@pytest.mark.parametrize('text', [["apple", "and", "orange"]])
def test_vectors_doc_vector(vocab, text):
doc = get_doc(vocab, text)
@ -43,6 +46,7 @@ def test_vectors_doc_vector(vocab, text):
assert doc.vector_norm
@pytest.mark.xfail
@pytest.mark.parametrize('text', [["apple", "and", "orange"]])
def test_vectors_span_vector(vocab, text):
span = get_doc(vocab, text)[0:2]
@ -50,6 +54,7 @@ def test_vectors_span_vector(vocab, text):
assert span.vector_norm
@pytest.mark.xfail
@pytest.mark.parametrize('text', ["apple orange"])
def test_vectors_token_token_similarity(tokenizer_v, text):
doc = tokenizer_v(text)
@ -57,6 +62,7 @@ def test_vectors_token_token_similarity(tokenizer_v, text):
assert 0.0 < doc[0].similarity(doc[1]) < 1.0
@pytest.mark.xfail
@pytest.mark.parametrize('text1,text2', [("apple", "orange")])
def test_vectors_token_lexeme_similarity(tokenizer_v, vocab, text1, text2):
token = tokenizer_v(text1)
@ -65,6 +71,7 @@ def test_vectors_token_lexeme_similarity(tokenizer_v, vocab, text1, text2):
assert 0.0 < token.similarity(lex) < 1.0
@pytest.mark.xfail
@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
def test_vectors_token_span_similarity(vocab, text):
doc = get_doc(vocab, text)
@ -72,6 +79,7 @@ def test_vectors_token_span_similarity(vocab, text):
assert 0.0 < doc[0].similarity(doc[1:3]) < 1.0
@pytest.mark.xfail
@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
def test_vectors_token_doc_similarity(vocab, text):
doc = get_doc(vocab, text)
@ -79,6 +87,7 @@ def test_vectors_token_doc_similarity(vocab, text):
assert 0.0 < doc[0].similarity(doc) < 1.0
@pytest.mark.xfail
@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
def test_vectors_lexeme_span_similarity(vocab, text):
doc = get_doc(vocab, text)
@ -87,6 +96,7 @@ def test_vectors_lexeme_span_similarity(vocab, text):
assert 0.0 < doc.similarity(doc[1:3]) < 1.0
@pytest.mark.xfail
@pytest.mark.parametrize('text1,text2', [("apple", "orange")])
def test_vectors_lexeme_lexeme_similarity(vocab, text1, text2):
lex1 = vocab[text1]
@ -95,6 +105,7 @@ def test_vectors_lexeme_lexeme_similarity(vocab, text1, text2):
assert 0.0 < lex1.similarity(lex2) < 1.0
@pytest.mark.xfail
@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
def test_vectors_lexeme_doc_similarity(vocab, text):
doc = get_doc(vocab, text)
@ -103,6 +114,7 @@ def test_vectors_lexeme_doc_similarity(vocab, text):
assert 0.0 < lex.similarity(doc) < 1.0
@pytest.mark.xfail
@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
def test_vectors_span_span_similarity(vocab, text):
doc = get_doc(vocab, text)
@ -110,6 +122,7 @@ def test_vectors_span_span_similarity(vocab, text):
assert 0.0 < doc[0:2].similarity(doc[1:3]) < 1.0
@pytest.mark.xfail
@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
def test_vectors_span_doc_similarity(vocab, text):
doc = get_doc(vocab, text)
@ -117,6 +130,7 @@ def test_vectors_span_doc_similarity(vocab, text):
assert 0.0 < doc[0:2].similarity(doc) < 1.0
@pytest.mark.xfail
@pytest.mark.parametrize('text1,text2', [
(["apple", "and", "apple", "pie"], ["orange", "juice"])])
def test_vectors_doc_doc_similarity(vocab, text1, text2):

View File

@ -5,6 +5,7 @@ import numpy
import pytest
@pytest.mark.xfail
@pytest.mark.parametrize('text', ["Hello"])
def test_vocab_add_vector(en_vocab, text):
en_vocab.resize_vectors(10)

View File

@ -11,7 +11,6 @@ import struct
import dill
from libc.string cimport memcpy, memset
from libc.stdint cimport uint32_t
from libc.math cimport sqrt
from .span cimport Span
@ -21,6 +20,7 @@ from .token cimport Token
from .printers import parse_tree
from ..lexeme cimport Lexeme, EMPTY_LEXEME
from ..typedefs cimport attr_t, flags_t
from ..attrs import intify_attrs
from ..attrs cimport attr_id_t
from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE
@ -494,8 +494,8 @@ cdef class Doc:
cdef np.ndarray[attr_t, ndim=2] output
# Make an array from the attributes --- otherwise our inner loop is Python
# dict iteration.
cdef np.ndarray[attr_t, ndim=1] attr_ids = numpy.asarray(py_attr_ids, dtype=numpy.int32)
output = numpy.ndarray(shape=(self.length, len(attr_ids)), dtype=numpy.int32)
cdef np.ndarray[attr_t, ndim=1] attr_ids = numpy.asarray(py_attr_ids, dtype=numpy.uint64)
output = numpy.ndarray(shape=(self.length, len(attr_ids)), dtype=numpy.uint64)
for i in range(self.length):
for j, feature in enumerate(attr_ids):
output[i, j] = get_token_attr(&self.c[i], feature)
@ -640,7 +640,7 @@ cdef class Doc:
"""
if self.length != 0:
raise ValueError("Cannot load into non-empty Doc")
cdef int[:, :] attrs
cdef attr_t[:, :] attrs
cdef int i, start, end, has_space
fields = dill.loads(data)
text, attrs = fields[:2]
@ -679,17 +679,15 @@ cdef class Doc:
if len(args) == 3:
# TODO: Warn deprecation
tag, lemma, ent_type = args
attributes[TAG] = self.vocab.strings[tag]
attributes[LEMMA] = self.vocab.strings[lemma]
attributes[ENT_TYPE] = self.vocab.strings[ent_type]
attributes[TAG] = tag
attributes[LEMMA] = lemma
attributes[ENT_TYPE] = ent_type
elif not args:
# TODO: This code makes little sense overall. We're still
# ignoring most of the attributes?
if "label" in attributes and 'ent_type' not in attributes:
if type(attributes["label"]) == int:
attributes[ENT_TYPE] = attributes["label"]
else:
attributes[ENT_TYPE] = self.vocab.strings[attributes["label"]]
attributes[ENT_TYPE] = self.vocab.strings.add(attributes["label"])
if 'ent_type' in attributes:
attributes[ENT_TYPE] = attributes['ent_type']
elif args:
@ -699,6 +697,12 @@ cdef class Doc:
"Arguments supplied:\n%s\n"
"Keyword arguments:%s\n" % (len(args), repr(args), repr(attributes)))
# More deprecated attribute handling =/
if 'label' in attributes:
attributes['ent_type'] = attributes.pop('label')
attributes = intify_attrs(attributes, strings_map=self.vocab.strings)
cdef int start = token_by_start(self.c, self.length, start_idx)
if start == -1:
return None
@ -708,13 +712,6 @@ cdef class Doc:
# Currently we have the token index, we want the range-end index
end += 1
cdef Span span = self[start:end]
tag = self.vocab.strings[attributes.get(TAG, span.root.tag)]
lemma = self.vocab.strings[attributes.get(LEMMA, span.root.lemma)]
ent_type = self.vocab.strings[attributes.get(ENT_TYPE, span.root.ent_type)]
ent_id = attributes.get('ent_id', span.root.ent_id)
if isinstance(ent_id, basestring):
ent_id = self.vocab.strings[ent_id]
# Get LexemeC for newly merged token
new_orth = ''.join([t.text_with_ws for t in span])
if span[-1].whitespace_:
@ -723,18 +720,11 @@ cdef class Doc:
# House the new merged token where it starts
cdef TokenC* token = &self.c[start]
token.spacy = self.c[end-1].spacy
if tag in self.vocab.morphology.tag_map:
self.vocab.morphology.assign_tag(token, tag)
for attr_name, attr_value in attributes.items():
if attr_name == TAG:
self.vocab.morphology.assign_tag(token, attr_value)
else:
token.tag = self.vocab.strings[tag]
token.lemma = self.vocab.strings[lemma]
if ent_type == 'O':
token.ent_iob = 2
token.ent_type = 0
else:
token.ent_iob = 3
token.ent_type = self.vocab.strings[ent_type]
token.ent_id = ent_id
Token.set_struct_attr(token, attr_name, attr_value)
# Begin by setting all the head indices to absolute token positions
# This is easier to work with for now than the offsets
# Before thinking of something simpler, beware the case where a dependency

View File

@ -21,14 +21,14 @@ from .. import about
cdef class Span:
"""A slice from a Doc object."""
def __cinit__(self, Doc doc, int start, int end, int label=0, vector=None,
def __cinit__(self, Doc doc, int start, int end, attr_t label=0, vector=None,
vector_norm=None):
"""Create a `Span` object from the slice `doc[start : end]`.
doc (Doc): The parent document.
start (int): The index of the first token of the span.
end (int): The index of the first token after the span.
label (int): A label to attach to the Span, e.g. for named entities.
label (uint64): A label to attach to the Span, e.g. for named entities.
vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span.
RETURNS (Span): The newly constructed object.
"""
@ -377,7 +377,7 @@ cdef class Span:
property ent_id:
"""An (integer) entity ID. Usually assigned by patterns in the `Matcher`.
RETURNS (int): The entity ID.
RETURNS (uint64): The entity ID.
"""
def __get__(self):
return self.root.ent_id

View File

@ -202,11 +202,11 @@ cdef class Token:
property lemma:
"""Base form of the word, with no inflectional suffixes.
RETURNS (int): Token lemma.
RETURNS (uint64): Token lemma.
"""
def __get__(self):
return self.c.lemma
def __set__(self, int lemma):
def __set__(self, attr_t lemma):
self.c.lemma = lemma
property pos:
@ -216,13 +216,13 @@ cdef class Token:
property tag:
def __get__(self):
return self.c.tag
def __set__(self, int tag):
def __set__(self, attr_t tag):
self.vocab.morphology.assign_tag(self.c, tag)
property dep:
def __get__(self):
return self.c.dep
def __set__(self, int label):
def __set__(self, attr_t label):
self.c.dep = label
property has_vector:
@ -234,12 +234,7 @@ cdef class Token:
def __get__(self):
if 'has_vector' in self.doc.user_token_hooks:
return self.doc.user_token_hooks['has_vector'](self)
cdef int i
for i in range(self.vocab.vectors_length):
if self.c.lex.vector[i] != 0:
return True
else:
return False
return self.vocab.has_vector(self.lex.c.orth)
property vector:
"""A real-valued meaning representation.
@ -250,16 +245,7 @@ cdef class Token:
def __get__(self):
if 'vector' in self.doc.user_token_hooks:
return self.doc.user_token_hooks['vector'](self)
cdef int length = self.vocab.vectors_length
if length == 0:
raise ValueError(
"Word vectors set to length 0. This may be because you "
"don't have a model installed or loaded, or because your "
"model doesn't include word vectors. For more info, see "
"the documentation: \n%s\n" % about.__docs_models__
)
vector_view = <float[:length,]>self.c.lex.vector
return numpy.asarray(vector_view)
return self.vocab.get_vector(self.c.lex.orth)
property vector_norm:
"""The L2 norm of the token's vector representation.
@ -269,7 +255,8 @@ cdef class Token:
def __get__(self):
if 'vector_norm' in self.doc.user_token_hooks:
return self.doc.user_token_hooks['vector_norm'](self)
return self.c.lex.l2_norm
vector = self.vector
return numpy.sqrt((vector ** 2).sum())
property n_lefts:
def __get__(self):
@ -516,16 +503,18 @@ cdef class Token:
property ent_type:
"""Named entity type.
RETURNS (int): Named entity type.
RETURNS (uint64): Named entity type.
"""
def __get__(self):
return self.c.ent_type
def __set__(self, ent_type):
self.c.ent_type = ent_type
property ent_iob:
"""IOB code of named entity tag. `1="I", 2="O", 3="B"`. 0 means no tag
is assigned.
RETURNS (int): IOB code of named entity tag.
RETURNS (uint64): IOB code of named entity tag.
"""
def __get__(self):
return self.c.ent_iob
@ -537,6 +526,8 @@ cdef class Token:
"""
def __get__(self):
return self.vocab.strings[self.c.ent_type]
def __set__(self, ent_type):
self.c.ent_type = self.vocab.strings.add(ent_type)
property ent_iob_:
"""IOB code of named entity tag. "B" means the token begins an entity,
@ -553,7 +544,7 @@ cdef class Token:
"""ID of the entity the token is an instance of, if any. Usually
assigned by patterns in the Matcher.
RETURNS (int): ID of the entity.
RETURNS (uint64): ID of the entity.
"""
def __get__(self):
return self.c.ent_id
@ -571,7 +562,7 @@ cdef class Token:
return self.vocab.strings[self.c.ent_id]
def __set__(self, name):
self.c.ent_id = self.vocab.strings[name]
self.c.ent_id = self.vocab.strings.add(name)
property whitespace_:
def __get__(self):
@ -613,7 +604,7 @@ cdef class Token:
def __get__(self):
return self.vocab.strings[self.c.lemma]
def __set__(self, unicode lemma_):
self.c.lemma = self.vocab.strings[lemma_]
self.c.lemma = self.vocab.strings.add(lemma_)
property pos_:
def __get__(self):
@ -623,13 +614,13 @@ cdef class Token:
def __get__(self):
return self.vocab.strings[self.c.tag]
def __set__(self, tag):
self.tag = self.vocab.strings[tag]
self.tag = self.vocab.strings.add(tag)
property dep_:
def __get__(self):
return self.vocab.strings[self.c.dep]
def __set__(self, unicode label):
self.c.dep = self.vocab.strings[label]
self.c.dep = self.vocab.strings.add(label)
property is_oov:
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_OOV)

View File

@ -4,7 +4,7 @@ from libc.stdint cimport uint8_t
ctypedef uint64_t hash_t
ctypedef char* utf8_t
ctypedef int32_t attr_t
ctypedef uint64_t attr_t
ctypedef uint64_t flags_t
ctypedef uint16_t len_t
ctypedef uint16_t tag_t

View File

@ -78,27 +78,86 @@ def ensure_path(path):
return path
def resolve_model_path(name):
"""Resolve a model name or string to a model path.
def load_model(name):
"""Load a model from a shortcut link, package or data path.
name (unicode): Package name, shortcut link or model path.
RETURNS (Path): Path to model data directory.
RETURNS (Language): `Language` class with the loaded model.
"""
data_path = get_data_path()
if not data_path or not data_path.exists():
raise IOError("Can't find spaCy data path: %s" % path2str(data_path))
if isinstance(name, basestring_):
if (data_path / name).exists(): # in data dir or shortcut link
return (data_path / name)
if is_package(name): # installed as a package
return get_model_package_path(name)
if Path(name).exists(): # path to model
return Path(name)
elif hasattr(name, 'exists'): # Path or Path-like object
return name
if (data_path / name).exists(): # in data dir or shortcut
return load_model_from_path(data_path / name)
if is_package(name): # installed as package
return load_model_from_pkg(name)
if Path(name).exists(): # path to model data directory
return load_data_from_path(Path(name))
elif hasattr(name, 'exists'): # Path or Path-like to model data
return load_data_from_path(name)
raise IOError("Can't find model '%s'" % name)
def load_model_from_init_py(init_file):
"""Helper function to use in the `load()` method of a model package's
__init__.py.
init_file (unicode): Path to model's __init__.py, i.e. `__file__`.
RETURNS (Language): `Language` class with loaded model.
"""
model_path = Path(init_file).parent
return load_data_from_path(model_path, package=True)
def load_model_from_path(model_path):
"""Import and load a model package from its file path.
path (unicode or Path): Path to package directory.
RETURNS (Language): `Language` class with loaded model.
"""
model_path = ensure_path(model_path)
spec = importlib.util.spec_from_file_location('model', model_path)
module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(module)
return module.load()
def load_model_from_pkg(name):
"""Import and load a model package.
name (unicode): Name of model package installed via pip.
RETURNS (Language): `Language` class with loaded model.
"""
module = importlib.import_module(name)
return module.load()
def load_data_from_path(model_path, package=False):
"""Initialie a `Language` class with a loaded model from a model data path.
model_path (unicode or Path): Path to model data directory.
package (bool): Does the path point to the parent package directory?
RETURNS (Language): `Language` class with loaded model.
"""
model_path = ensure_path(model_path)
meta_path = model_path / 'meta.json'
if not meta_path.is_file():
raise IOError("Could not read meta.json from %s" % location)
meta = read_json(location)
for setting in ['lang', 'name', 'version']:
if setting not in meta:
raise IOError('No %s setting found in model meta.json' % setting)
if package:
model_data_path = '%s_%s-%s' % (meta['lang'], meta['name'], meta['version'])
model_path = model_path / model_data_path
if not model_path.exists():
raise ValueError("Can't find model directory: %s" % path2str(model_path))
cls = get_lang_class(meta['lang'])
nlp = cls(pipeline=meta.get('pipeline', True))
return nlp.from_disk(model_path)
def is_package(name):
"""Check if string maps to a package installed via pip.
@ -112,36 +171,16 @@ def is_package(name):
return False
def get_model_package_path(package_name):
"""Get path to a model package installed via pip.
def get_package_path(name):
"""Get the path to an installed package.
package_name (unicode): Name of installed package.
RETURNS (Path): Path to model data directory.
name (unicode): Package name.
RETURNS (Path): Path to installed package.
"""
# Here we're importing the module just to find it. This is worryingly
# indirect, but it's otherwise very difficult to find the package.
# Python's installation and import rules are very complicated.
pkg = importlib.import_module(package_name)
package_path = Path(pkg.__file__).parent.parent
meta = parse_package_meta(package_path / package_name)
model_name = '%s-%s' % (package_name, meta['version'])
return package_path / package_name / model_name
def parse_package_meta(package_path, require=True):
"""Check if a meta.json exists in a package and return its contents.
package_path (Path): Path to model package directory.
require (bool): If True, raise error if no meta.json is found.
RETURNS (dict or None): Model meta.json data or None.
"""
location = package_path / 'meta.json'
if location.is_file():
return read_json(location)
elif require:
raise IOError("Could not read meta.json from %s" % location)
else:
return None
return Path(pkg.__file__).parent
def is_in_jupyter():
@ -177,10 +216,13 @@ def get_async(stream, numpy_array):
def itershuffle(iterable, bufsize=1000):
"""Shuffle an iterator. This works by holding `bufsize` items back
and yielding them sometime later. Obviously, this is not unbiased --
and yielding them sometime later. Obviously, this is not unbiased
but should be good enough for batching. Larger bufsize means less bias.
From https://gist.github.com/andres-erbsen/1307752
iterable (iterable): Iterator to shuffle.
bufsize (int): Items to hold back.
YIELDS (iterable): The shuffled iterator.
"""
iterable = iter(iterable)
buf = []
@ -315,17 +357,16 @@ def normalize_slice(length, start, stop, step=None):
def compounding(start, stop, compound):
'''Yield an infinite series of compounding values. Each time the
"""Yield an infinite series of compounding values. Each time the
generator is called, a value is produced by multiplying the previous
value by the compound rate.
EXAMPLE
EXAMPLE:
>>> sizes = compounding(1., 10., 1.5)
>>> assert next(sizes) == 1.
>>> assert next(sizes) == 1 * 1.5
>>> assert next(sizes) == 1.5 * 1.5
'''
"""
def clip(value):
return max(value, stop) if (start>stop) else min(value, stop)
curr = float(start)
@ -335,7 +376,7 @@ def compounding(start, stop, compound):
def decaying(start, stop, decay):
'''Yield an infinite series of linearly decaying values.'''
"""Yield an infinite series of linearly decaying values."""
def clip(value):
return max(value, stop) if (start>stop) else min(value, stop)
nr_upd = 1.
@ -344,12 +385,6 @@ def decaying(start, stop, decay):
nr_upd += 1
def check_renamed_kwargs(renamed, kwargs):
for old, new in renamed.items():
if old in kwargs:
raise TypeError("Keyword argument %s now renamed to %s" % (old, new))
def read_json(location):
"""Open and load JSON from file.

View File

@ -26,15 +26,6 @@ from . import attrs
from . import symbols
DEF MAX_VEC_SIZE = 100000
cdef float[MAX_VEC_SIZE] EMPTY_VEC
memset(EMPTY_VEC, 0, sizeof(EMPTY_VEC))
memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
EMPTY_LEXEME.vector = EMPTY_VEC
cdef class Vocab:
"""A look-up table that allows you to access `Lexeme` objects. The `Vocab`
instance also provides access to the `StringStore`, and owns underlying
@ -53,8 +44,6 @@ cdef class Vocab:
vice versa.
RETURNS (Vocab): The newly constructed vocab object.
"""
util.check_renamed_kwargs({'get_lex_attr': 'lex_attr_getters'}, deprecated_kwargs)
lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {}
tag_map = tag_map if tag_map is not None else {}
if lemmatizer in (None, True, False):
@ -66,7 +55,7 @@ cdef class Vocab:
self.strings = StringStore()
if strings:
for string in strings:
self.strings[string]
self.strings.add(string)
# Load strings in a special order, so that we have an onset number for
# the vocabulary. This way, when words are added in order, the orth ID
# is the frequency rank of the word, plus a certain offset. The structural
@ -77,7 +66,7 @@ cdef class Vocab:
# Need to rethink this.
for name in symbols.NAMES + list(sorted(tag_map.keys())):
if name:
_ = self.strings[name]
self.strings.add(name)
self.lex_attr_getters = lex_attr_getters
self.morphology = Morphology(self.strings, tag_map, lemmatizer)
@ -176,15 +165,14 @@ cdef class Vocab:
mem = self.mem
cdef bint is_oov = mem is not self.mem
lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1)
lex.orth = self.strings[string]
lex.orth = self.strings.add(string)
lex.length = len(string)
lex.id = self.length
lex.vector = <float*>mem.alloc(self.vectors_length, sizeof(float))
if self.lex_attr_getters is not None:
for attr, func in self.lex_attr_getters.items():
value = func(string)
if isinstance(value, unicode):
value = self.strings[value]
value = self.strings.add(value)
if attr == PROB:
lex.prob = value
elif value is not None:
@ -239,7 +227,7 @@ cdef class Vocab:
"""
cdef attr_t orth
if type(id_or_string) == unicode:
orth = self.strings[id_or_string]
orth = self.strings.add(id_or_string)
else:
orth = id_or_string
return Lexeme(self, orth)
@ -258,6 +246,26 @@ cdef class Vocab:
Token.set_struct_attr(token, attr_id, value)
return tokens
def get_vector(self, orth):
"""Retrieve a vector for a word in the vocabulary.
Words can be looked up by string or int ID.
RETURNS:
A word vector. Size and shape determed by the
vocab.vectors instance. Usually, a numpy ndarray
of shape (300,) and dtype float32.
RAISES: If no vectors data is loaded, ValueError is raised.
"""
raise NotImplementedError
def has_vector(self, orth):
"""Check whether a word has a vector. Returns False if no
vectors have been loaded. Words can be looked up by string
or int ID."""
raise NotImplementedError
def to_disk(self, path):
"""Save the current state to a directory.
@ -271,9 +279,6 @@ cdef class Vocab:
with strings_loc.open('w', encoding='utf8') as file_:
self.strings.dump(file_)
# TODO: pickle
# self.dump(path / 'lexemes.bin')
def from_disk(self, path):
"""Loads state from a directory. Modifies the object in place and
returns it.
@ -286,7 +291,7 @@ cdef class Vocab:
with (path / 'vocab' / 'strings.json').open('r', encoding='utf8') as file_:
strings_list = ujson.load(file_)
for string in strings_list:
self.strings[string]
self.strings.add(string)
self.load_lexemes(path / 'lexemes.bin')
def to_bytes(self, **exclude):
@ -346,7 +351,6 @@ cdef class Vocab:
lex_data.data[j] = bytes_ptr[i+j]
Lexeme.c_from_bytes(lexeme, lex_data)
lexeme.vector = EMPTY_VEC
py_str = self.strings[lexeme.orth]
assert self.strings[py_str] == lexeme.orth, (py_str, lexeme.orth)
key = hash_string(py_str)
@ -354,172 +358,6 @@ cdef class Vocab:
self._by_orth.set(lexeme.orth, lexeme)
self.length += 1
# Deprecated --- delete these once stable
def dump_vectors(self, out_loc):
"""Save the word vectors to a binary file.
loc (Path): The path to save to.
"""
cdef int32_t vec_len = self.vectors_length
cdef int32_t word_len
cdef bytes word_str
cdef char* chars
cdef Lexeme lexeme
cdef CFile out_file = CFile(out_loc, 'wb')
for lexeme in self:
word_str = lexeme.orth_.encode('utf8')
vec = lexeme.c.vector
word_len = len(word_str)
out_file.write_from(&word_len, 1, sizeof(word_len))
out_file.write_from(&vec_len, 1, sizeof(vec_len))
chars = <char*>word_str
out_file.write_from(chars, word_len, sizeof(char))
out_file.write_from(vec, vec_len, sizeof(float))
out_file.close()
def load_vectors(self, file_):
"""Load vectors from a text-based file.
file_ (buffer): The file to read from. Entries should be separated by
newlines, and each entry should be whitespace delimited. The first value of the entry
should be the word string, and subsequent entries should be the values of the
vector.
RETURNS (int): The length of the vectors loaded.
"""
cdef LexemeC* lexeme
cdef attr_t orth
cdef int32_t vec_len = -1
cdef double norm = 0.0
whitespace_pattern = re.compile(r'\s', re.UNICODE)
for line_num, line in enumerate(file_):
pieces = line.split()
word_str = " " if whitespace_pattern.match(line) else pieces.pop(0)
if vec_len == -1:
vec_len = len(pieces)
elif vec_len != len(pieces):
raise VectorReadError.mismatched_sizes(file_, line_num,
vec_len, len(pieces))
orth = self.strings[word_str]
lexeme = <LexemeC*><void*>self.get_by_orth(self.mem, orth)
lexeme.vector = <float*>self.mem.alloc(vec_len, sizeof(float))
for i, val_str in enumerate(pieces):
lexeme.vector[i] = float(val_str)
norm = 0.0
for i in range(vec_len):
norm += lexeme.vector[i] * lexeme.vector[i]
lexeme.l2_norm = sqrt(norm)
self.vectors_length = vec_len
return vec_len
def load_vectors_from_bin_loc(self, loc):
"""Load vectors from the location of a binary file.
loc (unicode): The path of the binary file to load from.
RETURNS (int): The length of the vectors loaded.
"""
cdef CFile file_ = CFile(loc, b'rb')
cdef int32_t word_len
cdef int32_t vec_len = 0
cdef int32_t prev_vec_len = 0
cdef float* vec
cdef Address mem
cdef attr_t string_id
cdef bytes py_word
cdef vector[float*] vectors
cdef int line_num = 0
cdef Pool tmp_mem = Pool()
while True:
try:
file_.read_into(&word_len, sizeof(word_len), 1)
except IOError:
break
file_.read_into(&vec_len, sizeof(vec_len), 1)
if prev_vec_len != 0 and vec_len != prev_vec_len:
raise VectorReadError.mismatched_sizes(loc, line_num,
vec_len, prev_vec_len)
if 0 >= vec_len >= MAX_VEC_SIZE:
raise VectorReadError.bad_size(loc, vec_len)
chars = <char*>file_.alloc_read(tmp_mem, word_len, sizeof(char))
vec = <float*>file_.alloc_read(self.mem, vec_len, sizeof(float))
string_id = self.strings[chars[:word_len]]
# Insert words into vocab to add vector.
self.get_by_orth(self.mem, string_id)
while string_id >= vectors.size():
vectors.push_back(EMPTY_VEC)
assert vec != NULL
vectors[string_id] = vec
line_num += 1
cdef LexemeC* lex
cdef size_t lex_addr
cdef double norm = 0.0
cdef int i
for orth, lex_addr in self._by_orth.items():
lex = <LexemeC*>lex_addr
if lex.lower < vectors.size():
lex.vector = vectors[lex.lower]
norm = 0.0
for i in range(vec_len):
norm += lex.vector[i] * lex.vector[i]
lex.l2_norm = sqrt(norm)
else:
lex.vector = EMPTY_VEC
self.vectors_length = vec_len
return vec_len
def resize_vectors(self, int new_size):
"""Set vectors_length to a new size, and allocate more memory for the
`Lexeme` vectors if necessary. The memory will be zeroed.
new_size (int): The new size of the vectors.
"""
cdef hash_t key
cdef size_t addr
if new_size > self.vectors_length:
for key, addr in self._by_hash.items():
lex = <LexemeC*>addr
lex.vector = <float*>self.mem.realloc(lex.vector,
new_size * sizeof(lex.vector[0]))
self.vectors_length = new_size
def write_binary_vectors(in_loc, out_loc):
cdef CFile out_file = CFile(out_loc, 'wb')
cdef Address mem
cdef int32_t word_len
cdef int32_t vec_len
cdef char* chars
with bz2.BZ2File(in_loc, 'r') as file_:
for line in file_:
pieces = line.split()
word = pieces.pop(0)
mem = Address(len(pieces), sizeof(float))
vec = <float*>mem.ptr
for i, val_str in enumerate(pieces):
vec[i] = float(val_str)
word_len = len(word)
vec_len = len(pieces)
out_file.write_from(&word_len, 1, sizeof(word_len))
out_file.write_from(&vec_len, 1, sizeof(vec_len))
chars = <char*>word
out_file.write_from(chars, len(word), sizeof(char))
out_file.write_from(vec, vec_len, sizeof(float))
def pickle_vocab(vocab):
sstore = vocab.strings
@ -567,21 +405,3 @@ class LookupError(Exception):
"ID of orth: {orth_id}".format(
query=repr(original_string), orth_str=repr(id_string), orth_id=id_)
)
class VectorReadError(Exception):
@classmethod
def mismatched_sizes(cls, loc, line_num, prev_size, curr_size):
return cls(
"Error reading word vectors from %s on line %d.\n"
"All vectors must be the same size.\n"
"Prev size: %d\n"
"Curr size: %d" % (loc, line_num, prev_size, curr_size))
@classmethod
def bad_size(cls, loc, size):
return cls(
"Error reading word vectors from %s.\n"
"Vector size: %d\n"
"Max size: %d\n"
"Min size: 1\n" % (loc, size, MAX_VEC_SIZE))

View File

@ -1,9 +1,9 @@
<svg class="o-svg" xmlns="http://www.w3.org/2000/svg" viewBox="-1 -1 907 737" width="906" height="746">
<style>
.svg__architecture__text-large { fill: #1a1e23; font: 20px "Source Sans Pro" }
.svg__architecture__text-medium { fill: #1a1e23; font: 17px "Source Sans Pro" }
.svg__architecture__text-small { fill: #1a1e23; font: bold 14px "Source Sans Pro" }
.svg__architecture__text-code { fill: #1a1e23; font: 600 12px "Source Code Pro" }
.svg__architecture__text-large { fill: #1a1e23; font: 20px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
.svg__architecture__text-medium { fill: #1a1e23; font: 17px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
.svg__architecture__text-small { fill: #1a1e23; font: bold 14px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
.svg__architecture__text-code { fill: #1a1e23; font: 600 12px "Source Code Pro", Monaco, "Courier New", monospace }
</style>
<ellipse cx="404" cy="203" fill="#dae8fc" stroke="#09a3d5" stroke-width="2" rx="74.8" ry="49.8"/>
<text class="svg__architecture__text-large" transform="translate(362.5 206.5)" width="81" height="40">Language</text>

Before

Width:  |  Height:  |  Size: 14 KiB

After

Width:  |  Height:  |  Size: 14 KiB

View File

@ -1,8 +1,8 @@
<svg class="o-svg" xmlns="http://www.w3.org/2000/svg" width="931" height="456" viewBox="-1 -1 932 480" preserveAspectRatio="xMinYMin meet">
<style>
.svg__langdata__text-large { fill: #1a1e23; font: 20px "Source Sans Pro" }
.svg__langdata__text-small { fill: #1a1e23; font: bold 16px "Source Sans Pro" }
.svg__langdata__text-tiny { fill: #1a1e23; font: bold 16px "Source Sans Pro" }
.svg__langdata__text-large { fill: #1a1e23; font: 20px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
.svg__langdata__text-small { fill: #1a1e23; font: bold 16px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
.svg__langdata__text-tiny { fill: #1a1e23; font: bold 16px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
</style>
<path fill="none" stroke="#b85450" stroke-width="3" stroke-miterlimit="10" d="M610 404h-69.8" stroke-dasharray="1 6" stroke-linecap="round"/>
<path fill="#b85450" stroke="#b85450" stroke-width="2" stroke-miterlimit="10" d="M534.2 404l8-4-2 4 2 4z"/>

Before

Width:  |  Height:  |  Size: 9.0 KiB

After

Width:  |  Height:  |  Size: 9.1 KiB

View File

@ -1,8 +1,8 @@
<svg class="o-svg" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 923 200" width="923" height="200">
<style>
.svg__pipeline__text { fill: #1a1e23; font: 20px "Source Sans Pro" }
.svg__pipeline__text-small { fill: #1a1e23; font: bold 18px "Source Sans Pro" }
.svg__pipeline__text-code { fill: #1a1e23; font: 600 16px "Source Code Pro" }
.svg__pipeline__text { fill: #1a1e23; font: 20px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
.svg__pipeline__text-small { fill: #1a1e23; font: bold 18px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
.svg__pipeline__text-code { fill: #1a1e23; font: 600 16px "Source Code Pro", Monaco, "Courier New", monospace }
</style>
<rect width="601" height="127" x="159" y="21" fill="none" stroke="#09a3d5" stroke-width="3" rx="19.1" stroke-dasharray="3 6" ry="19.1"/>
<path fill="#e1d5e7" stroke="#9673a6" stroke-width="2" d="M801 55h120v60H801z"/>

Before

Width:  |  Height:  |  Size: 3.1 KiB

After

Width:  |  Height:  |  Size: 3.2 KiB

View File

@ -0,0 +1,123 @@
<svg class="o-svg" xmlns="http://www.w3.org/2000/svg" width="600" height="380" viewBox="-20 -10 550 400">
<style>
.svg__tokenization__text { fill: #1a1e23; font: 18px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
.svg__tokenization__text-small { fill: #fff; font: 600 13px "Source Code Pro", Monaco, "Courier New", monospace }
</style>
<path fill="none" stroke="#c00" stroke-width="2" stroke-miterlimit="10" d="M71 39v12H16v11M71 39v12h20v11"/>
<path fill="#f8cecc" stroke="#c00" stroke-width="2" d="M1 1h140v38.2H1z"/>
<text class="svg__tokenization__text" dy="1em" width="43" height="19" transform="translate(48.5 9.5)">“Lets</text>
<path fill="none" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M175 39v23"/>
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M150 1h50v38.2h-50z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(164.5 9.5)" width="19" height="19">go</text>
<path fill="none" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M235 39v23"/>
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M210 1h50v38.2h-50z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(226.5 9.5)" width="15" height="19">to</text>
<path fill="none" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M341 39v23"/>
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M270 1h141v38.2H270z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(320.5 9.5)" width="38" height="19">N.Y.!”</text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M16 100v20"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M1 62h30v38.2H1z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(11.5 70.5)" width="7" height="19"></text>
<path fill="none" stroke="#c00" stroke-width="2" stroke-miterlimit="10" d="M91 100v11H66v9M91 100v11h29v9"/>
<path fill="#f8cecc" stroke="#c00" stroke-width="2" d="M41 62h100v38.2H41z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(72.5 70.5)" width="35" height="19">Lets</text>
<path fill="none" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M175 100v20"/>
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M150 62h50v38.2h-50z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(164.5 70.5)" width="19" height="19">go</text>
<path fill="none" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M235 100v20"/>
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M210 62h50v38.2h-50z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(226.5 70.5)" width="15" height="19">to</text>
<path fill="none" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M341 100v20"/>
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M270 62h141v38.2H270z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(320.5 70.5)" width="38" height="19">N.Y.!”</text>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M1 120h30v38H1z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(11.5 128.5)" width="7" height="19"></text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M66 158v24"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M41 120h50v38H41z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(53.5 128.5)" width="23" height="19">Let</text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M175 158v24"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M150 120h50v38h-50z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(164.5 128.5)" width="19" height="19">go</text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M235 158v24"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M210 120h50v38h-50z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(226.5 128.5)" width="15" height="19">to</text>
<path fill="none" stroke="#c00" stroke-width="2" stroke-miterlimit="10" d="M341 158v13h-20v11M341 158v13h55v11"/>
<path fill="#f8cecc" stroke="#c00" stroke-width="2" d="M270 120h141v38H270z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(320.5 128.5)" width="38" height="19">N.Y.!”</text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M120 158v24"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M100 120h40v38h-40z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(113.5 128.5)" width="11" height="19">s</text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M16 220v23"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M1 181.8h30V220H1z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(11.5 190.5)" width="7" height="19"></text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M66 220v23"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M41 181.8h50V220H41z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(53.5 190.5)" width="23" height="19">Let</text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M175 220v23"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M150 181.8h50V220h-50z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(164.5 190.5)" width="19" height="19">go</text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M235 220v23"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M210 181.8h50V220h-50z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(226.5 190.5)" width="15" height="19">to</text>
<path fill="none" stroke="#c00" stroke-width="2" stroke-miterlimit="10" d="M321 220v11h-20v12M321 220v11h34v12"/>
<path fill="#f8cecc" stroke="#c00" stroke-width="2" d="M270 181.8h101V220H270z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(304.5 190.5)" width="30" height="19">N.Y.!</text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M120 220v23"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M100 181.8h40V220h-40z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(113.5 190.5)" width="11" height="19">s</text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M396 220v23"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M381 181.8h30V220h-30z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(391.5 190.5)" width="7" height="19"></text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M16 281v23"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M1 242.7h30V281H1z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(11.5 251.5)" width="7" height="19"></text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M66 281v23"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M41 242.7h50V281H41z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(53.5 251.5)" width="23" height="19">Let</text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M175 281v20-17 20"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M150 242.7h50V281h-50z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(164.5 251.5)" width="19" height="19">go</text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M235 281v23"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M210 242.7h50V281h-50z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(226.5 251.5)" width="15" height="19">to</text>
<path fill="none" stroke="#c00" stroke-width="2" stroke-miterlimit="10" d="M301 281v23"/>
<path fill="#f8cecc" stroke="#b85450" stroke-width="2" d="M270 242.7h61V281h-61z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(286.5 251.5)" width="26" height="19">N.Y.</text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M120 281v23"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M100 242.7h40V281h-40z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(113.5 251.5)" width="11" height="19">s</text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M396 281v23"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M381 242.7h30V281h-30z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(391.5 251.5)" width="7" height="19"></text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M355 281v23"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M340 242.7h30V281h-30z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(351.5 251.5)" width="5" height="19">!</text>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M1 304h30v38H1z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(11.5 312.5)" width="7" height="19"></text>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M41 304h50v38H41z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(53.5 312.5)" width="23" height="19">Let</text>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M150 304h50v38h-50z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(164.5 312.5)" width="19" height="19">go</text>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M210 304h50v38h-50z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(226.5 312.5)" width="15" height="19">to</text>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M270 304h61v38h-61z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(286.5 312.5)" width="26" height="19">N.Y.</text>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M100 304h40v38h-40z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(113.5 312.5)" width="11" height="19">s</text>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M381 304h30v38h-30z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(391.5 312.5)" width="7" height="19"></text>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M340 304h30v38h-30z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(351.5 312.5)" width="5" height="19">!</text>
<rect width="104" height="19" x="437" y="72" fill="#c00" stroke="#c00" stroke-width="2" rx="2.9" ry="2.9"/>
<text class="svg__tokenization__text-small" dy="0.9em" transform="translate(455.5 74.5)" width="65" height="12">EXCEPTION</text>
<rect width="104" height="19" x="437" y="11" fill="#c00" stroke="#c00" stroke-width="2" rx="2.9" ry="2.9"/>
<text class="svg__tokenization__text-small" dy="0.9em" transform="translate(466.5 13.5)" width="43" height="12">PREFIX</text>
<rect width="104" height="19" x="437" y="130" fill="#c00" stroke="#c00" stroke-width="2" rx="2.9" ry="2.9"/>
<text class="svg__tokenization__text-small" dy="0.9em" transform="translate(466.5 132.5)" width="43" height="12">SUFFIX</text>
<rect width="104" height="19" x="437" y="191" fill="#c00" stroke="#c00" stroke-width="2" rx="2.9" ry="2.9"/>
<text class="svg__tokenization__text-small" dy="0.9em" transform="translate(466.5 193.5)" width="43" height="12">SUFFIX</text>
<rect width="104" height="19" x="437" y="252" fill="#c00" stroke="#c00" stroke-width="2" rx="2.9" ry="2.9"/>
<text class="svg__tokenization__text-small" dy="0.9em" transform="translate(455.5 254.5)" width="65" height="12">EXCEPTION</text>
<rect width="104" height="19" x="437" y="313" fill="#82b366" stroke="#82b366" stroke-width="2" rx="2.9" ry="2.9"/>
<text class="svg__tokenization__text-small" dy="0.9em" transform="translate(473.5 315.5)" width="29" height="12">DONE</text>
</svg>

After

Width:  |  Height:  |  Size: 12 KiB

View File

@ -1,9 +1,9 @@
<svg class="o-svg" xmlns="http://www.w3.org/2000/svg" viewBox="-10 -10 582 365" width="572" height="355">
<style>
.svg__vocab__text { fill: #1a1e23; font: 18px "Source Sans Pro" }
.svg__vocab__text-large { fill: #fff; font: bold 18px "Source Sans Pro"; text-transform: uppercase }
.svg__vocab__text-box { fill: #fff; font: bold 12px "Source Code Pro" }
.svg__vocab__text-code { fill: #1a1e23; font: bold 12px "Source Code Pro" }
.svg__vocab__text { fill: #1a1e23; font: 18px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
.svg__vocab__text-large { fill: #fff; font: bold 18px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif; text-transform: uppercase }
.svg__vocab__text-box { fill: #fff; font: bold 12px "Source Code Pro", Monaco, "Courier New", monospace }
.svg__vocab__text-code { fill: #1a1e23; font: bold 12px "Source Code Pro", Monaco, "Courier New", monospace }
</style>
<rect width="570" height="88" x="1" y="135" fill="#d5e8d4" stroke="#82b366" stroke-width="2" rx="13.2" ry="13.2"/>
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M444 164h100v40H444z"/>

Before

Width:  |  Height:  |  Size: 7.6 KiB

After

Width:  |  Height:  |  Size: 7.8 KiB

View File

@ -158,7 +158,8 @@
"binder": {
"title": "Binder",
"tag": "class"
"tag": "class",
"source": "spacy/tokens/binder.pyx"
},
"annotation": {

View File

@ -2,7 +2,10 @@
include ../../_includes/_mixins
p spaCy currently supports the following languages and capabilities:
p
| spaCy currently provides models for the following languages and
| capabilities:
+aside-code("Download language models", "bash").
python -m spacy download en
@ -22,12 +25,16 @@ p spaCy currently supports the following languages and capabilities:
+row
+cell French #[code fr]
each icon in [ "pro", "pro", "con", "pro", "con", "pro", "pro", "con" ]
each icon in [ "pro", "con", "con", "pro", "con", "pro", "pro", "con" ]
+cell.u-text-center #[+procon(icon)]
+h(2, "available") Available models
+row
+cell Spanish #[code es]
each icon in [ "pro", "pro", "con", "pro", "pro", "pro", "pro", "con" ]
+cell.u-text-center #[+procon(icon)]
include ../usage/_models-list
p
+button("/docs/usage/models", true, "primary") See available models
+h(2, "alpha-support") Alpha tokenization support
@ -52,9 +59,35 @@ p
| #[+a("https://github.com/mocobeta/janome") Janome].
+table([ "Language", "Code", "Source" ])
each language, code in { es: "Spanish", it: "Italian", pt: "Portuguese", nl: "Dutch", sv: "Swedish", fi: "Finnish", nb: "Norwegian Bokmål", da: "Danish", hu: "Hungarian", pl: "Polish", bn: "Bengali", he: "Hebrew", zh: "Chinese", ja: "Japanese" }
each language, code in { it: "Italian", pt: "Portuguese", nl: "Dutch", sv: "Swedish", fi: "Finnish", nb: "Norwegian Bokmål", da: "Danish", hu: "Hungarian", pl: "Polish", bn: "Bengali", he: "Hebrew", zh: "Chinese", ja: "Japanese" }
+row
+cell #{language}
+cell #[code=code]
+cell
+src(gh("spaCy", "spacy/lang/" + code)) lang/#{code}
+h(2, "multi-language") Multi-language support
+tag-new(2)
p
| As of v2.0, spaCy supports models trained on more than one language. This
| is especially useful for named entity recognition. The language ID used
| for multi-language or language-neutral models is #[code xx]. The
| language class, a generic subclass containing only the base language data,
| can be found in #[+src(gh("spaCy", "spacy/lang/xx")) lang/xx].
p
| To load your model with the neutral, multi-language class, simply set
| #[code "language": "xx"] in your
| #[+a("/docs/usage/saving-loading#models-generating") model package]'s
| meta.json. You can also import the class directly, or call
| #[+api("util#get_lang_class") #[code util.get_lang_class()]] for
| lazy-loading.
+code("Standard import").
from spacy.lang.xx import MultiLanguage
nlp = MultiLanguage()
+code("With lazy-loading").
from spacy.util import get_lang_class
nlp = get_lang_class('xx')

View File

@ -11,8 +11,13 @@ p
| the name of an installed
| #[+a("/docs/usage/saving-loading#generating") model package], a unicode
| path or a #[code Path]-like object. spaCy will try resolving the load
| argument in this order. The #[code Language] class to initialise will be
| determined based on the model's settings.
| argument in this order. If a model is loaded from a shortcut link or
| package name, spaCy will assume it's a Python package and import it and
| call the model's own #[code load()] method. If a model is loaded from a
| path, spaCy will assume it's a data directory, read the language and
| pipeline settings off the meta.json and initialise the #[code Language]
| class. The data will be loaded in via
| #[+api("language#from_disk") #[code Language.from_disk()]].
+aside-code("Example").
nlp = spacy.load('en') # shortcut link
@ -20,7 +25,7 @@ p
nlp = spacy.load('/path/to/en') # unicode path
nlp = spacy.load(Path('/path/to/en')) # pathlib Path
nlp = spacy.load('en', disable['parser', 'tagger'])
nlp = spacy.load('en', disable=['parser', 'tagger'])
+table(["Name", "Type", "Description"])
+row

View File

@ -1,12 +1,10 @@
//- 💫 DOCS > API > ANNOTATION SPECS
//- 💫 DOCS > API > UTIL
include ../../_includes/_mixins
p
| spaCy comes with a small collection of utility functions located in
| #[+src(gh("spaCy", "spacy/util.py")) spacy/util.py].
+infobox("Important note")
| Because utility functions are mostly intended for
| #[strong internal use within spaCy], their behaviour may change with
| future releases. The functions documented on this page should be safe
@ -74,15 +72,23 @@ p
+cell #[code Language]
+cell Language class.
+h(2, "resolve_model_path") util.resolve_model_path
+h(2, "load_model") util.load_model
+tag function
+tag-new(2)
p Resolve a model name or string to a model path.
p
| Load a model from a shortcut link, package or data path. If called with a
| shortcut link or package name, spaCy will assume the model is a Python
| package and import and call its #[code load()] method. If called with a
| path, spaCy will assume it's a data directory, read the language and
| pipeline settings from the meta.json and initialise a #[code Language]
| class. The model data will then be loaded in via
| #[+api("language#from_disk") #[code Language.from_disk()]].
+aside-code("Example").
model_path = util.resolve_model_path('en')
model_path = util.resolve_model_path('/path/to/en')
nlp = util.load_model('en')
nlp = util.load_model('en_core_web_sm')
nlp = util.load_model('/path/to/data')
+table(["Name", "Type", "Description"])
+row
@ -92,8 +98,33 @@ p Resolve a model name or string to a model path.
+footrow
+cell returns
+cell #[code Path]
+cell Path to model data directory.
+cell #[code Language]
+cell #[code Language] class with the loaded model.
+h(2, "load_model_from_init_py") util.load_model_from_init_py
+tag function
+tag-new(2)
p
| A helper function to use in the #[code load()] method of a model package's
| #[+src(gh("spacy-dev-resources", "templates/model/en_model_name/__init__.py")) __init__.py].
+aside-code("Example").
from spacy.util import load_model_from_init_py
def load():
return load_model_from_init_py(__file__)
+table(["Name", "Type", "Description"])
+row
+cell #[code init_file]
+cell unicode
+cell Path to model's __init__.py, i.e. #[code __file__].
+footrow
+cell returns
+cell #[code Language]
+cell #[code Language] class with the loaded model.
+h(2, "is_package") util.is_package
+tag function
@ -117,16 +148,18 @@ p
+cell #[code bool]
+cell #[code True] if installed package, #[code False] if not.
+h(2, "get_model_package_path") util.get_model_package_path
+h(2, "get_package_path") util.get_package_path
+tag function
+tag-new(2)
p
| Get path to a #[+a("/docs/usage/models") model package] installed via pip.
| Currently imports the package to find it and parse its meta data.
| Get path to an installed package. Mainly used to resolve the location of
| #[+a("/docs/usage/models") model packages]. Currently imports the package
| to find its path.
+aside-code("Example").
util.get_model_package_path('en_core_web_sm')
# /usr/lib/python3.6/site-packages/en_core_web_sm/en_core_web_sm-1.2.0
util.get_package_path('en_core_web_sm')
# /usr/lib/python3.6/site-packages/en_core_web_sm
+table(["Name", "Type", "Description"])
+row
@ -137,37 +170,8 @@ p
+footrow
+cell returns
+cell #[code Path]
+cell Path to model data directory.
+h(2, "parse_package_meta") util.parse_package_meta
+tag function
p
| Check if a #[code meta.json] exists in a model package and return its
| contents.
+aside-code("Example").
if util.is_package('en_core_web_sm'):
path = util.get_model_package_path('en_core_web_sm')
meta = util.parse_package_meta(path, require=True)
# {'name': 'core_web_sm', 'lang': 'en', ...}
+table(["Name", "Type", "Description"])
+row
+cell #[code package_path]
+cell #[code Path]
+cell Path to model package directory.
+row
+cell #[code require]
+cell #[code bool]
+cell If #[code True], raise error if no #[code meta.json] is found.
+footrow
+cell returns
+cell dict / #[code None]
+cell Model meta data or #[code None].
+h(2, "is_in_jupyter") util.is_in_jupyter
+tag function
+tag-new(2)

View File

@ -5,7 +5,7 @@ p
| #[strong how similar they are]. Predicting similarity is useful for
| building recommendation systems or flagging duplicates. For example, you
| can suggest a user content that's similar to what they're currently
| looking at, or label a support ticket as a duplicate, if it's very
| looking at, or label a support ticket as a duplicate if it's very
| similar to an already existing one.
p

View File

@ -16,3 +16,47 @@ p
+row
for cell in ["Apple", "is", "looking", "at", "buying", "U.K.", "startup", "for", "$", "1", "billion"]
+cell=cell
p
| Fist, the raw text is split on whitespace characters, similar to
| #[code text.split(' ')]. Then, the tokenizer processes the text from
| left to right. On each substring, it performs two checks:
+list("numbers")
+item
| #[strong Does the substring match a tokenizer exception rule?] For
| example, "don't" does not contain whitespace, but should be split
| into two tokens, "do" and "n't", while "U.K." should always
| remain one token.
+item
| #[strong Can a prefix, suffix or infixes be split off?]. For example
| punctuation like commas, periods, hyphens or quotes.
p
| If there's a match, the rule is applied and the tokenizer continues its
| loop, starting with the newly split substrings. This way, spaCy can split
| #[strong complex, nested tokens] like combinations of abbreviations and
| multiple punctuation marks.
+aside
| #[strong Tokenizer exception:] Special-case rule to split a string into
| several tokens or prevent a token from being split when punctuation rules
| are applied.#[br]
| #[strong Prefix:] Character(s) at the beginning, e.g.
| #[code $], #[code (], #[code “], #[code ¿].#[br]
| #[strong Suffix:] Character(s) at the end, e.g.
| #[code km], #[code &#41;], #[code ”], #[code !].#[br]
| #[strong Infix:] Character(s) in between, e.g.
| #[code -], #[code --], #[code /], #[code …].#[br]
+image
include ../../../assets/img/docs/tokenization.svg
.u-text-right
+button("/assets/img/docs/tokenization.svg", false, "secondary").u-text-tag View large graphic
p
| While punctuation rules are usually pretty general, tokenizer exceptions
| strongly depend on the specifics of the individual language. This is
| why each #[+a("/docs/api/language-models") available language] has its
| own subclass like #[code English] or #[code German], that loads in lists
| of hard-coded data and exception rules.

View File

@ -89,4 +89,6 @@ p
p
| Even though both #[code Doc] objects contain the same words, the internal
| integer IDs are very different.
| integer IDs are very different. The same applies for all other strings,
| like the annotation scheme. To avoid mismatched IDs, spaCy will always
| export the vocab if you save a #[code Doc] or #[code nlp] object.

View File

@ -144,7 +144,7 @@ p
+table(["Argument", "Type", "Description"])
+row
+cell #[code vocab]
+cell #[coce Vocab]
+cell #[code Vocab]
+cell
| Shared data between components, including strings, morphology,
| vectors etc.

View File

@ -139,6 +139,8 @@ p
new_doc = Doc(Vocab()).from_disk('/moby_dick.bin')
+infobox
| #[strong API:] #[+api("language") #[code Language]],
| #[+api("doc") #[code Doc]]
| #[strong Usage:] #[+a("/docs/usage/saving-loading") Saving and loading]
+h(2, "rule-matcher") Match text with token rules

View File

@ -345,7 +345,7 @@ p
| account and check the #[code subtree] for intensifiers like "very", to
| increase the sentiment score. At some point, you might also want to train
| a sentiment model. However, the approach described in this example is
| very useful for #[strong bootstrapping rules to gather training data].
| very useful for #[strong bootstrapping rules to collect training data].
| It's also an incredibly fast way to gather first insights into your data
| with about 1 million tweets, you'd be looking at a processing time of
| #[strong under 1 minute].

View File

@ -65,7 +65,7 @@ p
| spaCy provides a variety of linguistic annotations to give you insights
| into a text's grammatical structure. This includes the word types,
| i.e. the parts of speech, and how the words are related to each other.
| For example, if you're analysing text, it makes a #[em huge] difference
| For example, if you're analysing text, it makes a huge difference
| whether a noun is the subject of a sentence, or the object or whether
| "google" is used as a verb, or refers to the website or company in a
| specific context.
@ -94,9 +94,10 @@ p
include _spacy-101/_tokenization
+infobox
| To learn more about how spaCy's tokenizer and its rules work in detail,
| how to #[strong customise] it and how to #[strong add your own tokenizer]
| to a processing pipeline, see the usage guide on
| To learn more about how spaCy's tokenization rules work in detail,
| how to #[strong customise and replace] the default tokenizer and how to
| #[strong add language-specific data], see the usage guides on
| #[+a("/docs/usage/adding-languages") adding languages] and
| #[+a("/docs/usage/customizing-tokenizer") customising the tokenizer].
+h(3, "annotations-pos-deps") Part-of-speech tags and dependencies
@ -118,9 +119,11 @@ include _spacy-101/_named-entities
+infobox
| To learn more about entity recognition in spaCy, how to
| #[strong add your own entities] to a document and how to train and update
| the entity predictions of a model, see the usage guide on
| #[+a("/docs/usage/entity-recognition") named entity recognition].
| #[strong add your own entities] to a document and how to
| #[strong train and update] the entity predictions of a model, see the
| usage guides on
| #[+a("/docs/usage/entity-recognition") named entity recognition] and
| #[+a("/docs/usage/training-ner") training the named entity recognizer].
+h(2, "vectors-similarity") Word vectors and similarity
+tag-model("vectors")

View File

@ -20,19 +20,18 @@ p
nlp = Language(pipeline=['my_factory', mycomponent])
p
| It's now much easier to customise the pipeline with your own components.
| Components are functions that receive a #[code Doc] object, modify and
| return it. If your component is stateful, you'll want to create a new one
| for each pipeline. You can do that by defining and registering a factory
| which receives the shared #[code Vocab] object and returns a component.
p
| spaCy's default components the vectorizer, tagger, parser and entity
| recognizer, can be added to your pipeline by using their string IDs.
| This way, you won't have to worry about finding and implementing them
| to use the default tagger, simply add #[code "tagger"] to the pipeline,
| It's now much easier to #[strong customise the pipeline] with your own
| components, functions that receive a #[code Doc] object, modify and
| return it. If your component is stateful, you can define and register a
| factory which receives the shared #[code Vocab] object and returns a
|  component. spaCy's default components can be added to your pipeline by
| using their string IDs. This way, you won't have to worry about finding
| and implementing them simply add #[code "tagger"] to the pipeline,
| and spaCy will know what to do.
+image
include ../../assets/img/docs/pipeline.svg
+infobox
| #[strong API:] #[+api("language") #[code Language]]
| #[strong Usage:] #[+a("/docs/usage/language-processing-pipeline") Processing text]
@ -96,11 +95,10 @@ p
| #[code Language] class, or load a model that initialises one. This allows
| languages to contain more custom data, e.g. lemmatizer lookup tables, or
| complex regular expressions. The language data has also been tidied up
| and simplified. It's now also possible to overwrite the functions that
| compute lexical attributes like #[code like_num], and supply
| language-specific syntax iterators, e.g. to determine noun chunks. spaCy
| now also supports simple lookup-based lemmatization. The data is stored
| in a dictionary mapping a string to its lemma.
| and simplified. spaCy now also supports simple lookup-based lemmatization.
+image
include ../../assets/img/docs/language_data.svg
+infobox
| #[strong API:] #[+api("language") #[code Language]]
@ -111,13 +109,10 @@ p
+aside-code("Example").
from spacy.matcher import Matcher
from spacy.attrs import LOWER, IS_PUNCT
matcher = Matcher(nlp.vocab)
matcher.add('HelloWorld', None,
[{LOWER: 'hello'}, {IS_PUNCT: True}, {LOWER: 'world'}],
[{LOWER: 'hello'}, {LOWER: 'world'}])
matcher.add('HEARTS', None, [{'ORTH': '❤️', 'OP': '+'}])
assert len(matcher) == 1
assert 'HelloWorld' in matcher
assert 'HEARTS' in matcher
p
| Patterns can now be added to the matcher by calling
@ -157,28 +152,8 @@ p
+cell #[+api("language#to_disk") #[code Language.to_disk]]
+row
+cell #[code Tokenizer.load]
+cell
| #[+api("tokenizer#from_disk") #[code Tokenizer.from_disk]]
| #[+api("tokenizer#from_bytes") #[code Tokenizer.from_bytes]]
+row
+cell #[code Tagger.load]
+cell
| #[+api("tagger#from_disk") #[code Tagger.from_disk]]
| #[+api("tagger#from_bytes") #[code Tagger.from_bytes]]
+row
+cell #[code DependencyParser.load]
+cell
| #[+api("dependencyparser#from_disk") #[code DependencyParser.from_disk]]
| #[+api("dependencyparser#from_bytes") #[code DependencyParser.from_bytes]]
+row
+cell #[code EntityRecognizer.load]
+cell
| #[+api("entityrecognizer#from_disk") #[code EntityRecognizer.from_disk]]
| #[+api("entityrecognizer#from_bytes") #[code EntityRecognizer.from_bytes]]
+cell #[code Language.create_make_doc]
+cell #[+api("language#attributes") #[code Language.tokenizer]]
+row
+cell
@ -212,6 +187,28 @@ p
| #[+api("stringstore#to_disk") #[code StringStore.to_disk]]
| #[+api("stringstore#to_bytes") #[code StringStore.to_bytes]]
+row
+cell #[code Tokenizer.load]
+cell -
+row
+cell #[code Tagger.load]
+cell
| #[+api("tagger#from_disk") #[code Tagger.from_disk]]
| #[+api("tagger#from_bytes") #[code Tagger.from_bytes]]
+row
+cell #[code DependencyParser.load]
+cell
| #[+api("dependencyparser#from_disk") #[code DependencyParser.from_disk]]
| #[+api("dependencyparser#from_bytes") #[code DependencyParser.from_bytes]]
+row
+cell #[code EntityRecognizer.load]
+cell
| #[+api("entityrecognizer#from_disk") #[code EntityRecognizer.from_disk]]
| #[+api("entityrecognizer#from_bytes") #[code EntityRecognizer.from_bytes]]
+row
+cell #[code Matcher.load]
+cell -
@ -232,7 +229,7 @@ p
+row
+cell #[code Doc.read_bytes]
+cell
+cell #[+api("binder") #[code Binder]]
+row
+cell #[code Token.is_ancestor_of]