Merge branch 'develop' of https://github.com/explosion/spaCy into develop
|
@ -1,9 +1,6 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
import importlib
|
|
||||||
|
|
||||||
from .compat import basestring_
|
|
||||||
from .cli.info import info as cli_info
|
from .cli.info import info as cli_info
|
||||||
from .glossary import explain
|
from .glossary import explain
|
||||||
from .deprecated import resolve_load_name
|
from .deprecated import resolve_load_name
|
||||||
|
@ -12,14 +9,7 @@ from . import util
|
||||||
|
|
||||||
def load(name, **overrides):
|
def load(name, **overrides):
|
||||||
name = resolve_load_name(name, **overrides)
|
name = resolve_load_name(name, **overrides)
|
||||||
model_path = util.resolve_model_path(name)
|
return util.load_model(name)
|
||||||
meta = util.parse_package_meta(model_path)
|
|
||||||
if 'lang' not in meta:
|
|
||||||
raise IOError('No language setting found in model meta.')
|
|
||||||
cls = util.get_lang_class(meta['lang'])
|
|
||||||
overrides['meta'] = meta
|
|
||||||
overrides['path'] = model_path
|
|
||||||
return cls(**overrides)
|
|
||||||
|
|
||||||
|
|
||||||
def info(model=None, markdown=False):
|
def info(model=None, markdown=False):
|
||||||
|
|
|
@ -150,6 +150,9 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
|
||||||
else:
|
else:
|
||||||
int_key = IDS[name.upper()]
|
int_key = IDS[name.upper()]
|
||||||
if strings_map is not None and isinstance(value, basestring):
|
if strings_map is not None and isinstance(value, basestring):
|
||||||
|
if hasattr(strings_map, 'add'):
|
||||||
|
value = strings_map.add(value)
|
||||||
|
else:
|
||||||
value = strings_map[value]
|
value = strings_map[value]
|
||||||
inty_attrs[int_key] = value
|
inty_attrs[int_key] = value
|
||||||
return inty_attrs
|
return inty_attrs
|
||||||
|
|
|
@ -20,8 +20,14 @@ def info(cmd, model=None, markdown=False):
|
||||||
prints details in Markdown for easy copy-pasting to GitHub issues.
|
prints details in Markdown for easy copy-pasting to GitHub issues.
|
||||||
"""
|
"""
|
||||||
if model:
|
if model:
|
||||||
model_path = util.resolve_model_path(model)
|
if util.is_package(model):
|
||||||
meta = util.parse_package_meta(model_path)
|
model_path = util.get_package_path(model)
|
||||||
|
else:
|
||||||
|
model_path = util.get_data_path() / model
|
||||||
|
meta_path = model_path / 'meta.json'
|
||||||
|
if not meta_path.is_file():
|
||||||
|
prints(meta_path, title="Can't find model meta.json", exits=1)
|
||||||
|
meta = read_json(meta_path)
|
||||||
if model_path.resolve() != model_path:
|
if model_path.resolve() != model_path:
|
||||||
meta['link'] = path2str(model_path)
|
meta['link'] = path2str(model_path)
|
||||||
meta['source'] = path2str(model_path.resolve())
|
meta['source'] = path2str(model_path.resolve())
|
||||||
|
|
|
@ -21,7 +21,7 @@ def link(cmd, origin, link_name, force=False):
|
||||||
directory. Linking models allows loading them via spacy.load(link_name).
|
directory. Linking models allows loading them via spacy.load(link_name).
|
||||||
"""
|
"""
|
||||||
if util.is_package(origin):
|
if util.is_package(origin):
|
||||||
model_path = util.get_model_package_path(origin)
|
model_path = util.get_package_path(model)
|
||||||
else:
|
else:
|
||||||
model_path = Path(origin)
|
model_path = Path(origin)
|
||||||
if not model_path.exists():
|
if not model_path.exists():
|
||||||
|
|
|
@ -1,13 +1,14 @@
|
||||||
from cymem.cymem cimport Pool
|
from cymem.cymem cimport Pool
|
||||||
|
|
||||||
from .structs cimport TokenC
|
from .structs cimport TokenC
|
||||||
|
from .typedefs cimport attr_t
|
||||||
from .syntax.transition_system cimport Transition
|
from .syntax.transition_system cimport Transition
|
||||||
|
|
||||||
|
|
||||||
cdef struct GoldParseC:
|
cdef struct GoldParseC:
|
||||||
int* tags
|
int* tags
|
||||||
int* heads
|
int* heads
|
||||||
int* labels
|
attr_t* labels
|
||||||
int** brackets
|
int** brackets
|
||||||
Transition* ner
|
Transition* ner
|
||||||
|
|
||||||
|
|
|
@ -384,7 +384,7 @@ cdef class GoldParse:
|
||||||
# These are filled by the tagger/parser/entity recogniser
|
# These are filled by the tagger/parser/entity recogniser
|
||||||
self.c.tags = <int*>self.mem.alloc(len(doc), sizeof(int))
|
self.c.tags = <int*>self.mem.alloc(len(doc), sizeof(int))
|
||||||
self.c.heads = <int*>self.mem.alloc(len(doc), sizeof(int))
|
self.c.heads = <int*>self.mem.alloc(len(doc), sizeof(int))
|
||||||
self.c.labels = <int*>self.mem.alloc(len(doc), sizeof(int))
|
self.c.labels = <attr_t*>self.mem.alloc(len(doc), sizeof(attr_t))
|
||||||
self.c.ner = <Transition*>self.mem.alloc(len(doc), sizeof(Transition))
|
self.c.ner = <Transition*>self.mem.alloc(len(doc), sizeof(Transition))
|
||||||
|
|
||||||
self.words = [None] * len(doc)
|
self.words = [None] * len(doc)
|
||||||
|
|
|
@ -35,4 +35,4 @@ class English(Language):
|
||||||
Defaults = EnglishDefaults
|
Defaults = EnglishDefaults
|
||||||
|
|
||||||
|
|
||||||
__all__ = ['English', 'EnglishDefaults']
|
__all__ = ['English']
|
||||||
|
|
26
spacy/lang/xx/__init__.py
Normal file
|
@ -0,0 +1,26 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
|
||||||
|
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||||
|
from ...language import Language
|
||||||
|
from ...attrs import LANG
|
||||||
|
from ...util import update_exc
|
||||||
|
|
||||||
|
|
||||||
|
class MultiLanguageDefaults(Language.Defaults):
|
||||||
|
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||||
|
lex_attr_getters[LANG] = lambda text: 'xx'
|
||||||
|
|
||||||
|
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
|
||||||
|
|
||||||
|
|
||||||
|
class MultiLanguage(Language):
|
||||||
|
"""Language class to be used for models that support multiple languages.
|
||||||
|
This module allows models to specify their language ID as 'xx'.
|
||||||
|
"""
|
||||||
|
lang = 'xx'
|
||||||
|
Defaults = MultiLanguageDefaults
|
||||||
|
|
||||||
|
|
||||||
|
__all__ = ['MultiLanguage']
|
|
@ -215,7 +215,9 @@ class Language(object):
|
||||||
grads = {}
|
grads = {}
|
||||||
def get_grads(W, dW, key=None):
|
def get_grads(W, dW, key=None):
|
||||||
grads[key] = (W, dW)
|
grads[key] = (W, dW)
|
||||||
for proc in self.pipeline[1:]:
|
pipes = list(self.pipeline[1:])
|
||||||
|
random.shuffle(pipes)
|
||||||
|
for proc in pipes:
|
||||||
if not hasattr(proc, 'update'):
|
if not hasattr(proc, 'update'):
|
||||||
continue
|
continue
|
||||||
tokvecses, bp_tokvecses = tok2vec.model.begin_update(feats, drop=drop)
|
tokvecses, bp_tokvecses = tok2vec.model.begin_update(feats, drop=drop)
|
||||||
|
|
|
@ -27,7 +27,7 @@ cdef class Lexeme:
|
||||||
cdef inline SerializedLexemeC c_to_bytes(const LexemeC* lex) nogil:
|
cdef inline SerializedLexemeC c_to_bytes(const LexemeC* lex) nogil:
|
||||||
cdef SerializedLexemeC lex_data
|
cdef SerializedLexemeC lex_data
|
||||||
buff = <const unsigned char*>&lex.flags
|
buff = <const unsigned char*>&lex.flags
|
||||||
end = <const unsigned char*>&lex.l2_norm + sizeof(lex.l2_norm)
|
end = <const unsigned char*>&lex.sentiment + sizeof(lex.sentiment)
|
||||||
for i in range(sizeof(lex_data.data)):
|
for i in range(sizeof(lex_data.data)):
|
||||||
lex_data.data[i] = buff[i]
|
lex_data.data[i] = buff[i]
|
||||||
return lex_data
|
return lex_data
|
||||||
|
@ -35,7 +35,7 @@ cdef class Lexeme:
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef inline void c_from_bytes(LexemeC* lex, SerializedLexemeC lex_data) nogil:
|
cdef inline void c_from_bytes(LexemeC* lex, SerializedLexemeC lex_data) nogil:
|
||||||
buff = <unsigned char*>&lex.flags
|
buff = <unsigned char*>&lex.flags
|
||||||
end = <unsigned char*>&lex.l2_norm + sizeof(lex.l2_norm)
|
end = <unsigned char*>&lex.sentiment + sizeof(lex.sentiment)
|
||||||
for i in range(sizeof(lex_data.data)):
|
for i in range(sizeof(lex_data.data)):
|
||||||
buff[i] = lex_data.data[i]
|
buff[i] = lex_data.data[i]
|
||||||
|
|
||||||
|
|
|
@ -35,11 +35,11 @@ cdef class Lexeme:
|
||||||
tag, dependency parse, or lemma (lemmatization depends on the part-of-speech
|
tag, dependency parse, or lemma (lemmatization depends on the part-of-speech
|
||||||
tag).
|
tag).
|
||||||
"""
|
"""
|
||||||
def __init__(self, Vocab vocab, int orth):
|
def __init__(self, Vocab vocab, attr_t orth):
|
||||||
"""Create a Lexeme object.
|
"""Create a Lexeme object.
|
||||||
|
|
||||||
vocab (Vocab): The parent vocabulary
|
vocab (Vocab): The parent vocabulary
|
||||||
orth (int): The orth id of the lexeme.
|
orth (uint64): The orth id of the lexeme.
|
||||||
Returns (Lexeme): The newly constructd object.
|
Returns (Lexeme): The newly constructd object.
|
||||||
"""
|
"""
|
||||||
self.vocab = vocab
|
self.vocab = vocab
|
||||||
|
@ -51,7 +51,7 @@ cdef class Lexeme:
|
||||||
if isinstance(other, Lexeme):
|
if isinstance(other, Lexeme):
|
||||||
a = self.orth
|
a = self.orth
|
||||||
b = other.orth
|
b = other.orth
|
||||||
elif isinstance(other, int):
|
elif isinstance(other, long):
|
||||||
a = self.orth
|
a = self.orth
|
||||||
b = other
|
b = other
|
||||||
elif isinstance(other, str):
|
elif isinstance(other, str):
|
||||||
|
@ -109,7 +109,7 @@ cdef class Lexeme:
|
||||||
def to_bytes(self):
|
def to_bytes(self):
|
||||||
lex_data = Lexeme.c_to_bytes(self.c)
|
lex_data = Lexeme.c_to_bytes(self.c)
|
||||||
start = <const char*>&self.c.flags
|
start = <const char*>&self.c.flags
|
||||||
end = <const char*>&self.c.l2_norm + sizeof(self.c.l2_norm)
|
end = <const char*>&self.c.sentiment + sizeof(self.c.sentiment)
|
||||||
assert (end-start) == sizeof(lex_data.data), (end-start, sizeof(lex_data.data))
|
assert (end-start) == sizeof(lex_data.data), (end-start, sizeof(lex_data.data))
|
||||||
byte_string = b'\0' * sizeof(lex_data.data)
|
byte_string = b'\0' * sizeof(lex_data.data)
|
||||||
byte_chars = <char*>byte_string
|
byte_chars = <char*>byte_string
|
||||||
|
@ -136,12 +136,7 @@ cdef class Lexeme:
|
||||||
RETURNS (bool): Whether a word vector is associated with the object.
|
RETURNS (bool): Whether a word vector is associated with the object.
|
||||||
"""
|
"""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
cdef int i
|
return self.vocab.has_vector(self.c.orth)
|
||||||
for i in range(self.vocab.vectors_length):
|
|
||||||
if self.c.vector[i] != 0:
|
|
||||||
return True
|
|
||||||
else:
|
|
||||||
return False
|
|
||||||
|
|
||||||
property vector_norm:
|
property vector_norm:
|
||||||
"""The L2 norm of the lexeme's vector representation.
|
"""The L2 norm of the lexeme's vector representation.
|
||||||
|
@ -149,10 +144,8 @@ cdef class Lexeme:
|
||||||
RETURNS (float): The L2 norm of the vector representation.
|
RETURNS (float): The L2 norm of the vector representation.
|
||||||
"""
|
"""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.c.l2_norm
|
vector = self.vector
|
||||||
|
return numpy.sqrt((vector**2).sum())
|
||||||
def __set__(self, float value):
|
|
||||||
self.c.l2_norm = value
|
|
||||||
|
|
||||||
property vector:
|
property vector:
|
||||||
"""A real-valued meaning representation.
|
"""A real-valued meaning representation.
|
||||||
|
@ -169,27 +162,16 @@ cdef class Lexeme:
|
||||||
"model doesn't include word vectors. For more info, see "
|
"model doesn't include word vectors. For more info, see "
|
||||||
"the documentation: \n%s\n" % about.__docs_models__
|
"the documentation: \n%s\n" % about.__docs_models__
|
||||||
)
|
)
|
||||||
|
return self.vocab.get_vector(self.c.orth)
|
||||||
vector_view = <float[:length,]>self.c.vector
|
|
||||||
return numpy.asarray(vector_view)
|
|
||||||
|
|
||||||
def __set__(self, vector):
|
def __set__(self, vector):
|
||||||
assert len(vector) == self.vocab.vectors_length
|
assert len(vector) == self.vocab.vectors_length
|
||||||
cdef float value
|
self.vocab.set_vector(self.c.orth, vector)
|
||||||
cdef double norm = 0.0
|
|
||||||
for i, value in enumerate(vector):
|
|
||||||
self.c.vector[i] = value
|
|
||||||
norm += value * value
|
|
||||||
self.c.l2_norm = sqrt(norm)
|
|
||||||
|
|
||||||
property rank:
|
property rank:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.c.id
|
return self.c.id
|
||||||
|
|
||||||
property repvec:
|
|
||||||
def __get__(self):
|
|
||||||
raise AttributeError("lex.repvec has been renamed to lex.vector")
|
|
||||||
|
|
||||||
property sentiment:
|
property sentiment:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.c.sentiment
|
return self.c.sentiment
|
||||||
|
@ -210,31 +192,31 @@ cdef class Lexeme:
|
||||||
|
|
||||||
property lower:
|
property lower:
|
||||||
def __get__(self): return self.c.lower
|
def __get__(self): return self.c.lower
|
||||||
def __set__(self, int x): self.c.lower = x
|
def __set__(self, attr_t x): self.c.lower = x
|
||||||
|
|
||||||
property norm:
|
property norm:
|
||||||
def __get__(self): return self.c.norm
|
def __get__(self): return self.c.norm
|
||||||
def __set__(self, int x): self.c.norm = x
|
def __set__(self, attr_t x): self.c.norm = x
|
||||||
|
|
||||||
property shape:
|
property shape:
|
||||||
def __get__(self): return self.c.shape
|
def __get__(self): return self.c.shape
|
||||||
def __set__(self, int x): self.c.shape = x
|
def __set__(self, attr_t x): self.c.shape = x
|
||||||
|
|
||||||
property prefix:
|
property prefix:
|
||||||
def __get__(self): return self.c.prefix
|
def __get__(self): return self.c.prefix
|
||||||
def __set__(self, int x): self.c.prefix = x
|
def __set__(self, attr_t x): self.c.prefix = x
|
||||||
|
|
||||||
property suffix:
|
property suffix:
|
||||||
def __get__(self): return self.c.suffix
|
def __get__(self): return self.c.suffix
|
||||||
def __set__(self, int x): self.c.suffix = x
|
def __set__(self, attr_t x): self.c.suffix = x
|
||||||
|
|
||||||
property cluster:
|
property cluster:
|
||||||
def __get__(self): return self.c.cluster
|
def __get__(self): return self.c.cluster
|
||||||
def __set__(self, int x): self.c.cluster = x
|
def __set__(self, attr_t x): self.c.cluster = x
|
||||||
|
|
||||||
property lang:
|
property lang:
|
||||||
def __get__(self): return self.c.lang
|
def __get__(self): return self.c.lang
|
||||||
def __set__(self, int x): self.c.lang = x
|
def __set__(self, attr_t x): self.c.lang = x
|
||||||
|
|
||||||
property prob:
|
property prob:
|
||||||
def __get__(self): return self.c.prob
|
def __get__(self): return self.c.prob
|
||||||
|
@ -270,7 +252,7 @@ cdef class Lexeme:
|
||||||
|
|
||||||
property is_oov:
|
property is_oov:
|
||||||
def __get__(self): return Lexeme.c_check_flag(self.c, IS_OOV)
|
def __get__(self): return Lexeme.c_check_flag(self.c, IS_OOV)
|
||||||
def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_OOV, x)
|
def __set__(self, attr_t x): Lexeme.c_set_flag(self.c, IS_OOV, x)
|
||||||
|
|
||||||
property is_stop:
|
property is_stop:
|
||||||
def __get__(self): return Lexeme.c_check_flag(self.c, IS_STOP)
|
def __get__(self): return Lexeme.c_check_flag(self.c, IS_STOP)
|
||||||
|
@ -320,7 +302,6 @@ cdef class Lexeme:
|
||||||
def __get__(self): return Lexeme.c_check_flag(self.c, IS_RIGHT_PUNCT)
|
def __get__(self): return Lexeme.c_check_flag(self.c, IS_RIGHT_PUNCT)
|
||||||
def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_RIGHT_PUNCT, x)
|
def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_RIGHT_PUNCT, x)
|
||||||
|
|
||||||
|
|
||||||
property like_url:
|
property like_url:
|
||||||
def __get__(self): return Lexeme.c_check_flag(self.c, LIKE_URL)
|
def __get__(self): return Lexeme.c_check_flag(self.c, LIKE_URL)
|
||||||
def __set__(self, bint x): Lexeme.c_set_flag(self.c, LIKE_URL, x)
|
def __set__(self, bint x): Lexeme.c_set_flag(self.c, LIKE_URL, x)
|
||||||
|
|
|
@ -154,7 +154,7 @@ def _convert_strings(token_specs, string_store):
|
||||||
if isinstance(attr, basestring):
|
if isinstance(attr, basestring):
|
||||||
attr = attrs.IDS.get(attr.upper())
|
attr = attrs.IDS.get(attr.upper())
|
||||||
if isinstance(value, basestring):
|
if isinstance(value, basestring):
|
||||||
value = string_store[value]
|
value = string_store.add(value)
|
||||||
if isinstance(value, bool):
|
if isinstance(value, bool):
|
||||||
value = int(value)
|
value = int(value)
|
||||||
if attr is not None:
|
if attr is not None:
|
||||||
|
@ -381,7 +381,7 @@ cdef class Matcher:
|
||||||
|
|
||||||
def _normalize_key(self, key):
|
def _normalize_key(self, key):
|
||||||
if isinstance(key, basestring):
|
if isinstance(key, basestring):
|
||||||
return self.vocab.strings[key]
|
return self.vocab.strings.add(key)
|
||||||
else:
|
else:
|
||||||
return key
|
return key
|
||||||
|
|
||||||
|
@ -469,7 +469,7 @@ cdef class PhraseMatcher:
|
||||||
self(doc)
|
self(doc)
|
||||||
yield doc
|
yield doc
|
||||||
|
|
||||||
def accept_match(self, Doc doc, int ent_id, int label, int start, int end):
|
def accept_match(self, Doc doc, attr_t ent_id, attr_t label, int start, int end):
|
||||||
assert (end - start) < self.max_length
|
assert (end - start) < self.max_length
|
||||||
cdef int i, j
|
cdef int i, j
|
||||||
for i in range(self.max_length):
|
for i in range(self.max_length):
|
||||||
|
|
|
@ -48,7 +48,7 @@ cdef class Morphology:
|
||||||
self.tag_map[tag_str] = dict(attrs)
|
self.tag_map[tag_str] = dict(attrs)
|
||||||
attrs = intify_attrs(attrs, self.strings, _do_deprecated=True)
|
attrs = intify_attrs(attrs, self.strings, _do_deprecated=True)
|
||||||
self.rich_tags[i].id = i
|
self.rich_tags[i].id = i
|
||||||
self.rich_tags[i].name = self.strings[tag_str]
|
self.rich_tags[i].name = self.strings.add(tag_str)
|
||||||
self.rich_tags[i].morph = 0
|
self.rich_tags[i].morph = 0
|
||||||
self.rich_tags[i].pos = attrs[POS]
|
self.rich_tags[i].pos = attrs[POS]
|
||||||
self.reverse_index[self.rich_tags[i].name] = i
|
self.reverse_index[self.rich_tags[i].name] = i
|
||||||
|
@ -59,10 +59,12 @@ cdef class Morphology:
|
||||||
|
|
||||||
cdef int assign_tag(self, TokenC* token, tag) except -1:
|
cdef int assign_tag(self, TokenC* token, tag) except -1:
|
||||||
if isinstance(tag, basestring):
|
if isinstance(tag, basestring):
|
||||||
tag_id = self.reverse_index[self.strings[tag]]
|
tag = self.strings.add(tag)
|
||||||
else:
|
if tag in self.reverse_index:
|
||||||
tag_id = self.reverse_index[tag]
|
tag_id = self.reverse_index[tag]
|
||||||
self.assign_tag_id(token, tag_id)
|
self.assign_tag_id(token, tag_id)
|
||||||
|
else:
|
||||||
|
token.tag = tag
|
||||||
|
|
||||||
cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1:
|
cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1:
|
||||||
if tag_id >= self.n_tags:
|
if tag_id >= self.n_tags:
|
||||||
|
@ -73,7 +75,7 @@ cdef class Morphology:
|
||||||
# the statistical model fails.
|
# the statistical model fails.
|
||||||
# Related to Issue #220
|
# Related to Issue #220
|
||||||
if Lexeme.c_check_flag(token.lex, IS_SPACE):
|
if Lexeme.c_check_flag(token.lex, IS_SPACE):
|
||||||
tag_id = self.reverse_index[self.strings['SP']]
|
tag_id = self.reverse_index[self.strings.add('SP')]
|
||||||
rich_tag = self.rich_tags[tag_id]
|
rich_tag = self.rich_tags[tag_id]
|
||||||
analysis = <MorphAnalysisC*>self._cache.get(tag_id, token.lex.orth)
|
analysis = <MorphAnalysisC*>self._cache.get(tag_id, token.lex.orth)
|
||||||
if analysis is NULL:
|
if analysis is NULL:
|
||||||
|
@ -104,7 +106,7 @@ cdef class Morphology:
|
||||||
tag (unicode): The part-of-speech tag to key the exception.
|
tag (unicode): The part-of-speech tag to key the exception.
|
||||||
orth (unicode): The word-form to key the exception.
|
orth (unicode): The word-form to key the exception.
|
||||||
"""
|
"""
|
||||||
tag = self.strings[tag_str]
|
tag = self.strings.add(tag_str)
|
||||||
tag_id = self.reverse_index[tag]
|
tag_id = self.reverse_index[tag]
|
||||||
orth = self.strings[orth_str]
|
orth = self.strings[orth_str]
|
||||||
cdef RichTagC rich_tag = self.rich_tags[tag_id]
|
cdef RichTagC rich_tag = self.rich_tags[tag_id]
|
||||||
|
@ -140,14 +142,14 @@ cdef class Morphology:
|
||||||
def lemmatize(self, const univ_pos_t univ_pos, attr_t orth, morphology):
|
def lemmatize(self, const univ_pos_t univ_pos, attr_t orth, morphology):
|
||||||
cdef unicode py_string = self.strings[orth]
|
cdef unicode py_string = self.strings[orth]
|
||||||
if self.lemmatizer is None:
|
if self.lemmatizer is None:
|
||||||
return self.strings[py_string.lower()]
|
return self.strings.add(py_string.lower())
|
||||||
if univ_pos not in (NOUN, VERB, ADJ, PUNCT):
|
if univ_pos not in (NOUN, VERB, ADJ, PUNCT):
|
||||||
return self.strings[py_string.lower()]
|
return self.strings.add(py_string.lower())
|
||||||
cdef set lemma_strings
|
cdef set lemma_strings
|
||||||
cdef unicode lemma_string
|
cdef unicode lemma_string
|
||||||
lemma_strings = self.lemmatizer(py_string, univ_pos, morphology)
|
lemma_strings = self.lemmatizer(py_string, univ_pos, morphology)
|
||||||
lemma_string = sorted(lemma_strings)[0]
|
lemma_string = sorted(lemma_strings)[0]
|
||||||
lemma = self.strings[lemma_string]
|
lemma = self.strings.add(lemma_string)
|
||||||
return lemma
|
return lemma
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -228,6 +228,7 @@ class NeuralTagger(object):
|
||||||
idx += 1
|
idx += 1
|
||||||
correct = self.model.ops.xp.array(correct, dtype='i')
|
correct = self.model.ops.xp.array(correct, dtype='i')
|
||||||
d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1])
|
d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1])
|
||||||
|
d_scores /= d_scores.shape[0]
|
||||||
loss = (d_scores**2).sum()
|
loss = (d_scores**2).sum()
|
||||||
d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
|
d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
|
||||||
return float(loss), d_scores
|
return float(loss), d_scores
|
||||||
|
@ -292,6 +293,7 @@ class NeuralLabeller(NeuralTagger):
|
||||||
idx += 1
|
idx += 1
|
||||||
correct = self.model.ops.xp.array(correct, dtype='i')
|
correct = self.model.ops.xp.array(correct, dtype='i')
|
||||||
d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1])
|
d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1])
|
||||||
|
d_scores /= d_scores.shape[0]
|
||||||
loss = (d_scores**2).sum()
|
loss = (d_scores**2).sum()
|
||||||
d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
|
d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
|
||||||
return float(loss), d_scores
|
return float(loss), d_scores
|
||||||
|
|
|
@ -1,4 +1,5 @@
|
||||||
from libc.stdint cimport int64_t
|
from libc.stdint cimport int64_t
|
||||||
|
from libcpp.vector cimport vector
|
||||||
|
|
||||||
from cymem.cymem cimport Pool
|
from cymem.cymem cimport Pool
|
||||||
from preshed.maps cimport PreshMap
|
from preshed.maps cimport PreshMap
|
||||||
|
@ -8,6 +9,9 @@ from .typedefs cimport attr_t, hash_t
|
||||||
|
|
||||||
|
|
||||||
cpdef hash_t hash_string(unicode string) except 0
|
cpdef hash_t hash_string(unicode string) except 0
|
||||||
|
cdef hash_t hash_utf8(char* utf8_string, int length) nogil
|
||||||
|
|
||||||
|
cdef unicode decode_Utf8Str(const Utf8Str* string)
|
||||||
|
|
||||||
|
|
||||||
ctypedef union Utf8Str:
|
ctypedef union Utf8Str:
|
||||||
|
@ -17,13 +21,11 @@ ctypedef union Utf8Str:
|
||||||
|
|
||||||
cdef class StringStore:
|
cdef class StringStore:
|
||||||
cdef Pool mem
|
cdef Pool mem
|
||||||
cdef Utf8Str* c
|
|
||||||
cdef int64_t size
|
|
||||||
cdef bint is_frozen
|
cdef bint is_frozen
|
||||||
|
|
||||||
|
cdef vector[hash_t] keys
|
||||||
cdef public PreshMap _map
|
cdef public PreshMap _map
|
||||||
cdef public PreshMap _oov
|
cdef public PreshMap _oov
|
||||||
cdef int64_t _resize_at
|
|
||||||
|
|
||||||
cdef const Utf8Str* intern_unicode(self, unicode py_string)
|
cdef const Utf8Str* intern_unicode(self, unicode py_string)
|
||||||
cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length)
|
cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length)
|
||||||
|
|
|
@ -11,6 +11,9 @@ from libc.stdint cimport uint32_t
|
||||||
import ujson
|
import ujson
|
||||||
import dill
|
import dill
|
||||||
|
|
||||||
|
from .symbols import IDS as SYMBOLS_BY_STR
|
||||||
|
from .symbols import NAMES as SYMBOLS_BY_INT
|
||||||
|
|
||||||
from .typedefs cimport hash_t
|
from .typedefs cimport hash_t
|
||||||
from . import util
|
from . import util
|
||||||
|
|
||||||
|
@ -28,7 +31,7 @@ cdef uint32_t hash32_utf8(char* utf8_string, int length) nogil:
|
||||||
return hash32(utf8_string, length, 1)
|
return hash32(utf8_string, length, 1)
|
||||||
|
|
||||||
|
|
||||||
cdef unicode _decode(const Utf8Str* string):
|
cdef unicode decode_Utf8Str(const Utf8Str* string):
|
||||||
cdef int i, length
|
cdef int i, length
|
||||||
if string.s[0] < sizeof(string.s) and string.s[0] != 0:
|
if string.s[0] < sizeof(string.s) and string.s[0] != 0:
|
||||||
return string.s[1:string.s[0]+1].decode('utf8')
|
return string.s[1:string.s[0]+1].decode('utf8')
|
||||||
|
@ -45,10 +48,10 @@ cdef unicode _decode(const Utf8Str* string):
|
||||||
return string.p[i:length + i].decode('utf8')
|
return string.p[i:length + i].decode('utf8')
|
||||||
|
|
||||||
|
|
||||||
cdef Utf8Str _allocate(Pool mem, const unsigned char* chars, uint32_t length) except *:
|
cdef Utf8Str* _allocate(Pool mem, const unsigned char* chars, uint32_t length) except *:
|
||||||
cdef int n_length_bytes
|
cdef int n_length_bytes
|
||||||
cdef int i
|
cdef int i
|
||||||
cdef Utf8Str string
|
cdef Utf8Str* string = <Utf8Str*>mem.alloc(1, sizeof(Utf8Str))
|
||||||
cdef uint32_t ulength = length
|
cdef uint32_t ulength = length
|
||||||
if length < sizeof(string.s):
|
if length < sizeof(string.s):
|
||||||
string.s[0] = <unsigned char>length
|
string.s[0] = <unsigned char>length
|
||||||
|
@ -73,7 +76,7 @@ cdef Utf8Str _allocate(Pool mem, const unsigned char* chars, uint32_t length) ex
|
||||||
|
|
||||||
|
|
||||||
cdef class StringStore:
|
cdef class StringStore:
|
||||||
"""Map strings to and from integer IDs."""
|
"""Lookup strings by 64-bit hash"""
|
||||||
def __init__(self, strings=None, freeze=False):
|
def __init__(self, strings=None, freeze=False):
|
||||||
"""Create the StringStore.
|
"""Create the StringStore.
|
||||||
|
|
||||||
|
@ -83,70 +86,66 @@ cdef class StringStore:
|
||||||
self.mem = Pool()
|
self.mem = Pool()
|
||||||
self._map = PreshMap()
|
self._map = PreshMap()
|
||||||
self._oov = PreshMap()
|
self._oov = PreshMap()
|
||||||
self._resize_at = 10000
|
|
||||||
self.c = <Utf8Str*>self.mem.alloc(self._resize_at, sizeof(Utf8Str))
|
|
||||||
self.size = 1
|
|
||||||
self.is_frozen = freeze
|
self.is_frozen = freeze
|
||||||
if strings is not None:
|
if strings is not None:
|
||||||
for string in strings:
|
for string in strings:
|
||||||
_ = self[string]
|
self.add(string)
|
||||||
|
|
||||||
property size:
|
def __getitem__(self, object string_or_id):
|
||||||
def __get__(self):
|
"""Retrieve a string from a given hash ID, or vice versa.
|
||||||
return self.size -1
|
|
||||||
|
string_or_id (bytes or unicode or uint64): The value to encode.
|
||||||
|
Returns (unicode or uint64): The value to be retrieved.
|
||||||
|
"""
|
||||||
|
if isinstance(string_or_id, basestring) and len(string_or_id) == 0:
|
||||||
|
return 0
|
||||||
|
elif string_or_id == 0:
|
||||||
|
return u''
|
||||||
|
elif string_or_id in SYMBOLS_BY_STR:
|
||||||
|
return SYMBOLS_BY_STR[string_or_id]
|
||||||
|
|
||||||
|
cdef hash_t key
|
||||||
|
|
||||||
|
if isinstance(string_or_id, unicode):
|
||||||
|
key = hash_string(string_or_id)
|
||||||
|
return key
|
||||||
|
elif isinstance(string_or_id, bytes):
|
||||||
|
key = hash_utf8(string_or_id, len(string_or_id))
|
||||||
|
return key
|
||||||
|
else:
|
||||||
|
if string_or_id < len(SYMBOLS_BY_INT):
|
||||||
|
return SYMBOLS_BY_INT[string_or_id]
|
||||||
|
key = string_or_id
|
||||||
|
utf8str = <Utf8Str*>self._map.get(key)
|
||||||
|
if utf8str is NULL:
|
||||||
|
raise KeyError(string_or_id)
|
||||||
|
else:
|
||||||
|
return decode_Utf8Str(utf8str)
|
||||||
|
|
||||||
|
def add(self, string):
|
||||||
|
if isinstance(string, unicode):
|
||||||
|
if string in SYMBOLS_BY_STR:
|
||||||
|
return SYMBOLS_BY_STR[string]
|
||||||
|
key = hash_string(string)
|
||||||
|
self.intern_unicode(string)
|
||||||
|
elif isinstance(string, bytes):
|
||||||
|
if string in SYMBOLS_BY_STR:
|
||||||
|
return SYMBOLS_BY_STR[string]
|
||||||
|
key = hash_utf8(string, len(string))
|
||||||
|
self._intern_utf8(string, len(string))
|
||||||
|
else:
|
||||||
|
raise TypeError(
|
||||||
|
"Can only add unicode or bytes. Got type: %s" % type(string))
|
||||||
|
return key
|
||||||
|
|
||||||
def __len__(self):
|
def __len__(self):
|
||||||
"""The number of strings in the store.
|
"""The number of strings in the store.
|
||||||
|
|
||||||
RETURNS (int): The number of strings in the store.
|
RETURNS (int): The number of strings in the store.
|
||||||
"""
|
"""
|
||||||
return self.size-1
|
return self.keys.size()
|
||||||
|
|
||||||
def __getitem__(self, object string_or_id):
|
def __contains__(self, string not None):
|
||||||
"""Retrieve a string from a given integer ID, or vice versa.
|
|
||||||
|
|
||||||
string_or_id (bytes or unicode or int): The value to encode.
|
|
||||||
Returns (unicode or int): The value to be retrieved.
|
|
||||||
"""
|
|
||||||
if isinstance(string_or_id, basestring) and len(string_or_id) == 0:
|
|
||||||
return 0
|
|
||||||
elif string_or_id == 0:
|
|
||||||
return u''
|
|
||||||
|
|
||||||
cdef bytes byte_string
|
|
||||||
cdef const Utf8Str* utf8str
|
|
||||||
cdef uint64_t int_id
|
|
||||||
cdef uint32_t oov_id
|
|
||||||
if isinstance(string_or_id, (int, long)):
|
|
||||||
int_id = string_or_id
|
|
||||||
oov_id = string_or_id
|
|
||||||
if int_id < <uint64_t>self.size:
|
|
||||||
return _decode(&self.c[int_id])
|
|
||||||
else:
|
|
||||||
utf8str = <Utf8Str*>self._oov.get(oov_id)
|
|
||||||
if utf8str is not NULL:
|
|
||||||
return _decode(utf8str)
|
|
||||||
else:
|
|
||||||
raise IndexError(string_or_id)
|
|
||||||
else:
|
|
||||||
if isinstance(string_or_id, bytes):
|
|
||||||
byte_string = <bytes>string_or_id
|
|
||||||
elif isinstance(string_or_id, unicode):
|
|
||||||
byte_string = (<unicode>string_or_id).encode('utf8')
|
|
||||||
else:
|
|
||||||
raise TypeError(type(string_or_id))
|
|
||||||
utf8str = self._intern_utf8(byte_string, len(byte_string))
|
|
||||||
if utf8str is NULL:
|
|
||||||
# TODO: We need to use 32 bit here, for compatibility with the
|
|
||||||
# vocabulary values. This makes birthday paradox probabilities
|
|
||||||
# pretty bad.
|
|
||||||
# We could also get unlucky here, and hash into a value that
|
|
||||||
# collides with the 'real' strings.
|
|
||||||
return hash32_utf8(byte_string, len(byte_string))
|
|
||||||
else:
|
|
||||||
return utf8str - self.c
|
|
||||||
|
|
||||||
def __contains__(self, unicode string not None):
|
|
||||||
"""Check whether a string is in the store.
|
"""Check whether a string is in the store.
|
||||||
|
|
||||||
string (unicode): The string to check.
|
string (unicode): The string to check.
|
||||||
|
@ -154,7 +153,11 @@ cdef class StringStore:
|
||||||
"""
|
"""
|
||||||
if len(string) == 0:
|
if len(string) == 0:
|
||||||
return True
|
return True
|
||||||
cdef hash_t key = hash_string(string)
|
if string in SYMBOLS_BY_STR:
|
||||||
|
return True
|
||||||
|
if isinstance(string, unicode):
|
||||||
|
string = string.encode('utf8')
|
||||||
|
cdef hash_t key = hash_utf8(string, len(string))
|
||||||
return self._map.get(key) is not NULL
|
return self._map.get(key) is not NULL
|
||||||
|
|
||||||
def __iter__(self):
|
def __iter__(self):
|
||||||
|
@ -163,16 +166,15 @@ cdef class StringStore:
|
||||||
YIELDS (unicode): A string in the store.
|
YIELDS (unicode): A string in the store.
|
||||||
"""
|
"""
|
||||||
cdef int i
|
cdef int i
|
||||||
for i in range(self.size):
|
cdef hash_t key
|
||||||
yield _decode(&self.c[i]) if i > 0 else u''
|
for i in range(self.keys.size()):
|
||||||
|
key = self.keys[i]
|
||||||
|
utf8str = <Utf8Str*>self._map.get(key)
|
||||||
|
yield decode_Utf8Str(utf8str)
|
||||||
# TODO: Iterate OOV here?
|
# TODO: Iterate OOV here?
|
||||||
|
|
||||||
def __reduce__(self):
|
def __reduce__(self):
|
||||||
strings = [""]
|
strings = list(self)
|
||||||
for i in range(1, self.size):
|
|
||||||
string = &self.c[i]
|
|
||||||
py_string = _decode(string)
|
|
||||||
strings.append(py_string)
|
|
||||||
return (StringStore, (strings,), None, None, None)
|
return (StringStore, (strings,), None, None, None)
|
||||||
|
|
||||||
def to_disk(self, path):
|
def to_disk(self, path):
|
||||||
|
@ -230,11 +232,9 @@ cdef class StringStore:
|
||||||
self.mem = Pool()
|
self.mem = Pool()
|
||||||
self._map = PreshMap()
|
self._map = PreshMap()
|
||||||
self._oov = PreshMap()
|
self._oov = PreshMap()
|
||||||
self._resize_at = 10000
|
self.keys.clear()
|
||||||
self.c = <Utf8Str*>self.mem.alloc(self._resize_at, sizeof(Utf8Str))
|
|
||||||
self.size = 1
|
|
||||||
for string in strings:
|
for string in strings:
|
||||||
_ = self[string]
|
self.add(string)
|
||||||
self.is_frozen = freeze
|
self.is_frozen = freeze
|
||||||
|
|
||||||
cdef const Utf8Str* intern_unicode(self, unicode py_string):
|
cdef const Utf8Str* intern_unicode(self, unicode py_string):
|
||||||
|
@ -258,39 +258,11 @@ cdef class StringStore:
|
||||||
key32 = hash32_utf8(utf8_string, length)
|
key32 = hash32_utf8(utf8_string, length)
|
||||||
# Important: Make the OOV store own the memory. That way it's trivial
|
# Important: Make the OOV store own the memory. That way it's trivial
|
||||||
# to flush them all.
|
# to flush them all.
|
||||||
value = <Utf8Str*>self._oov.mem.alloc(1, sizeof(Utf8Str))
|
value = _allocate(self._oov.mem, <unsigned char*>utf8_string, length)
|
||||||
value[0] = _allocate(self._oov.mem, <unsigned char*>utf8_string, length)
|
|
||||||
self._oov.set(key32, value)
|
self._oov.set(key32, value)
|
||||||
return NULL
|
return NULL
|
||||||
|
|
||||||
if self.size == self._resize_at:
|
value = _allocate(self.mem, <unsigned char*>utf8_string, length)
|
||||||
self._realloc()
|
self._map.set(key, value)
|
||||||
self.c[self.size] = _allocate(self.mem, <unsigned char*>utf8_string, length)
|
self.keys.push_back(key)
|
||||||
self._map.set(key, <void*>&self.c[self.size])
|
return value
|
||||||
self.size += 1
|
|
||||||
return &self.c[self.size-1]
|
|
||||||
|
|
||||||
def _realloc(self):
|
|
||||||
# We want to map straight to pointers, but they'll be invalidated if
|
|
||||||
# we resize our array. So, first we remap to indices, then we resize,
|
|
||||||
# then we can acquire the new pointers.
|
|
||||||
cdef Pool tmp_mem = Pool()
|
|
||||||
keys = <key_t*>tmp_mem.alloc(self.size, sizeof(key_t))
|
|
||||||
cdef key_t key
|
|
||||||
cdef void* value
|
|
||||||
cdef const Utf8Str ptr
|
|
||||||
cdef int i = 0
|
|
||||||
cdef size_t offset
|
|
||||||
while map_iter(self._map.c_map, &i, &key, &value):
|
|
||||||
# Find array index with pointer arithmetic
|
|
||||||
offset = ((<Utf8Str*>value) - self.c)
|
|
||||||
keys[offset] = key
|
|
||||||
|
|
||||||
self._resize_at *= 2
|
|
||||||
cdef size_t new_size = self._resize_at * sizeof(Utf8Str)
|
|
||||||
self.c = <Utf8Str*>self.mem.realloc(self.c, new_size)
|
|
||||||
|
|
||||||
self._map = PreshMap(self.size)
|
|
||||||
for i in range(self.size):
|
|
||||||
if keys[i]:
|
|
||||||
self._map.set(keys[i], &self.c[i])
|
|
||||||
|
|
|
@ -5,8 +5,6 @@ from .parts_of_speech cimport univ_pos_t
|
||||||
|
|
||||||
|
|
||||||
cdef struct LexemeC:
|
cdef struct LexemeC:
|
||||||
float* vector
|
|
||||||
|
|
||||||
flags_t flags
|
flags_t flags
|
||||||
|
|
||||||
attr_t lang
|
attr_t lang
|
||||||
|
@ -25,11 +23,10 @@ cdef struct LexemeC:
|
||||||
|
|
||||||
float prob
|
float prob
|
||||||
float sentiment
|
float sentiment
|
||||||
float l2_norm
|
|
||||||
|
|
||||||
|
|
||||||
cdef struct SerializedLexemeC:
|
cdef struct SerializedLexemeC:
|
||||||
unsigned char[4*13 + 8] data
|
unsigned char[8 + 8*10 + 4 + 4] data
|
||||||
# sizeof(flags_t) # flags
|
# sizeof(flags_t) # flags
|
||||||
# + sizeof(attr_t) # lang
|
# + sizeof(attr_t) # lang
|
||||||
# + sizeof(attr_t) # id
|
# + sizeof(attr_t) # id
|
||||||
|
@ -50,7 +47,7 @@ cdef struct Entity:
|
||||||
hash_t id
|
hash_t id
|
||||||
int start
|
int start
|
||||||
int end
|
int end
|
||||||
int label
|
attr_t label
|
||||||
|
|
||||||
|
|
||||||
cdef struct TokenC:
|
cdef struct TokenC:
|
||||||
|
@ -58,12 +55,12 @@ cdef struct TokenC:
|
||||||
uint64_t morph
|
uint64_t morph
|
||||||
univ_pos_t pos
|
univ_pos_t pos
|
||||||
bint spacy
|
bint spacy
|
||||||
int tag
|
attr_t tag
|
||||||
int idx
|
int idx
|
||||||
int lemma
|
attr_t lemma
|
||||||
int sense
|
attr_t sense
|
||||||
int head
|
int head
|
||||||
int dep
|
attr_t dep
|
||||||
bint sent_start
|
bint sent_start
|
||||||
|
|
||||||
uint32_t l_kids
|
uint32_t l_kids
|
||||||
|
@ -72,5 +69,5 @@ cdef struct TokenC:
|
||||||
uint32_t r_edge
|
uint32_t r_edge
|
||||||
|
|
||||||
int ent_iob
|
int ent_iob
|
||||||
int ent_type # TODO: Is there a better way to do this? Multiple sources of truth..
|
attr_t ent_type # TODO: Is there a better way to do this? Multiple sources of truth..
|
||||||
hash_t ent_id
|
hash_t ent_id
|
||||||
|
|
|
@ -3,6 +3,7 @@ from cymem.cymem cimport Pool
|
||||||
from thinc.typedefs cimport weight_t
|
from thinc.typedefs cimport weight_t
|
||||||
|
|
||||||
from .stateclass cimport StateClass
|
from .stateclass cimport StateClass
|
||||||
|
from ..typedefs cimport attr_t
|
||||||
|
|
||||||
from .transition_system cimport TransitionSystem, Transition
|
from .transition_system cimport TransitionSystem, Transition
|
||||||
from ..gold cimport GoldParseC
|
from ..gold cimport GoldParseC
|
||||||
|
|
|
@ -99,7 +99,7 @@ cdef bint arc_is_gold(const GoldParseC* gold, int head, int child) nogil:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
cdef bint label_is_gold(const GoldParseC* gold, int head, int child, int label) nogil:
|
cdef bint label_is_gold(const GoldParseC* gold, int head, int child, attr_t label) nogil:
|
||||||
if gold.labels[child] == -1:
|
if gold.labels[child] == -1:
|
||||||
return True
|
return True
|
||||||
elif label == -1:
|
elif label == -1:
|
||||||
|
@ -116,16 +116,16 @@ cdef bint _is_gold_root(const GoldParseC* gold, int word) nogil:
|
||||||
|
|
||||||
cdef class Shift:
|
cdef class Shift:
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef bint is_valid(const StateC* st, int label) nogil:
|
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
||||||
return st.buffer_length() >= 2 and not st.shifted[st.B(0)] and not st.B_(0).sent_start
|
return st.buffer_length() >= 2 and not st.shifted[st.B(0)] and not st.B_(0).sent_start
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef int transition(StateC* st, int label) nogil:
|
cdef int transition(StateC* st, attr_t label) nogil:
|
||||||
st.push()
|
st.push()
|
||||||
st.fast_forward()
|
st.fast_forward()
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef weight_t cost(StateClass st, const GoldParseC* gold, int label) nogil:
|
cdef weight_t cost(StateClass st, const GoldParseC* gold, attr_t label) nogil:
|
||||||
return Shift.move_cost(st, gold) + Shift.label_cost(st, gold, label)
|
return Shift.move_cost(st, gold) + Shift.label_cost(st, gold, label)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
|
@ -133,17 +133,17 @@ cdef class Shift:
|
||||||
return push_cost(s, gold, s.B(0))
|
return push_cost(s, gold, s.B(0))
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, int label) nogil:
|
cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
|
|
||||||
cdef class Reduce:
|
cdef class Reduce:
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef bint is_valid(const StateC* st, int label) nogil:
|
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
||||||
return st.stack_depth() >= 2
|
return st.stack_depth() >= 2
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef int transition(StateC* st, int label) nogil:
|
cdef int transition(StateC* st, attr_t label) nogil:
|
||||||
if st.has_head(st.S(0)):
|
if st.has_head(st.S(0)):
|
||||||
st.pop()
|
st.pop()
|
||||||
else:
|
else:
|
||||||
|
@ -151,7 +151,7 @@ cdef class Reduce:
|
||||||
st.fast_forward()
|
st.fast_forward()
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
|
cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
|
||||||
return Reduce.move_cost(s, gold) + Reduce.label_cost(s, gold, label)
|
return Reduce.move_cost(s, gold) + Reduce.label_cost(s, gold, label)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
|
@ -170,23 +170,23 @@ cdef class Reduce:
|
||||||
return cost
|
return cost
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, int label) nogil:
|
cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
|
|
||||||
cdef class LeftArc:
|
cdef class LeftArc:
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef bint is_valid(const StateC* st, int label) nogil:
|
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
||||||
return not st.B_(0).sent_start
|
return not st.B_(0).sent_start
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef int transition(StateC* st, int label) nogil:
|
cdef int transition(StateC* st, attr_t label) nogil:
|
||||||
st.add_arc(st.B(0), st.S(0), label)
|
st.add_arc(st.B(0), st.S(0), label)
|
||||||
st.pop()
|
st.pop()
|
||||||
st.fast_forward()
|
st.fast_forward()
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
|
cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
|
||||||
return LeftArc.move_cost(s, gold) + LeftArc.label_cost(s, gold, label)
|
return LeftArc.move_cost(s, gold) + LeftArc.label_cost(s, gold, label)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
|
@ -204,23 +204,23 @@ cdef class LeftArc:
|
||||||
return cost + pop_cost(s, gold, s.S(0)) + arc_cost(s, gold, s.B(0), s.S(0))
|
return cost + pop_cost(s, gold, s.S(0)) + arc_cost(s, gold, s.B(0), s.S(0))
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, int label) nogil:
|
cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
|
||||||
return arc_is_gold(gold, s.B(0), s.S(0)) and not label_is_gold(gold, s.B(0), s.S(0), label)
|
return arc_is_gold(gold, s.B(0), s.S(0)) and not label_is_gold(gold, s.B(0), s.S(0), label)
|
||||||
|
|
||||||
|
|
||||||
cdef class RightArc:
|
cdef class RightArc:
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef bint is_valid(const StateC* st, int label) nogil:
|
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
||||||
return not st.B_(0).sent_start
|
return not st.B_(0).sent_start
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef int transition(StateC* st, int label) nogil:
|
cdef int transition(StateC* st, attr_t label) nogil:
|
||||||
st.add_arc(st.S(0), st.B(0), label)
|
st.add_arc(st.S(0), st.B(0), label)
|
||||||
st.push()
|
st.push()
|
||||||
st.fast_forward()
|
st.fast_forward()
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef inline weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
|
cdef inline weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
|
||||||
return RightArc.move_cost(s, gold) + RightArc.label_cost(s, gold, label)
|
return RightArc.move_cost(s, gold) + RightArc.label_cost(s, gold, label)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
|
@ -233,13 +233,13 @@ cdef class RightArc:
|
||||||
return push_cost(s, gold, s.B(0)) + arc_cost(s, gold, s.S(0), s.B(0))
|
return push_cost(s, gold, s.B(0)) + arc_cost(s, gold, s.S(0), s.B(0))
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef weight_t label_cost(StateClass s, const GoldParseC* gold, int label) nogil:
|
cdef weight_t label_cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
|
||||||
return arc_is_gold(gold, s.S(0), s.B(0)) and not label_is_gold(gold, s.S(0), s.B(0), label)
|
return arc_is_gold(gold, s.S(0), s.B(0)) and not label_is_gold(gold, s.S(0), s.B(0), label)
|
||||||
|
|
||||||
|
|
||||||
cdef class Break:
|
cdef class Break:
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef bint is_valid(const StateC* st, int label) nogil:
|
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
||||||
cdef int i
|
cdef int i
|
||||||
if not USE_BREAK:
|
if not USE_BREAK:
|
||||||
return False
|
return False
|
||||||
|
@ -251,12 +251,12 @@ cdef class Break:
|
||||||
return True
|
return True
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef int transition(StateC* st, int label) nogil:
|
cdef int transition(StateC* st, attr_t label) nogil:
|
||||||
st.set_break(st.B_(0).l_edge)
|
st.set_break(st.B_(0).l_edge)
|
||||||
st.fast_forward()
|
st.fast_forward()
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
|
cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
|
||||||
return Break.move_cost(s, gold) + Break.label_cost(s, gold, label)
|
return Break.move_cost(s, gold) + Break.label_cost(s, gold, label)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
|
@ -281,7 +281,7 @@ cdef class Break:
|
||||||
return cost + 1
|
return cost + 1
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, int label) nogil:
|
cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
cdef int _get_root(int word, const GoldParseC* gold) nogil:
|
cdef int _get_root(int word, const GoldParseC* gold) nogil:
|
||||||
|
@ -295,9 +295,7 @@ cdef int _get_root(int word, const GoldParseC* gold) nogil:
|
||||||
|
|
||||||
cdef void* _init_state(Pool mem, int length, void* tokens) except NULL:
|
cdef void* _init_state(Pool mem, int length, void* tokens) except NULL:
|
||||||
cdef StateClass st = StateClass.init(<const TokenC*>tokens, length)
|
cdef StateClass st = StateClass.init(<const TokenC*>tokens, length)
|
||||||
# Ensure sent_start is set to 0 throughout
|
|
||||||
for i in range(st.c.length):
|
for i in range(st.c.length):
|
||||||
st.c._sent[i].sent_start = False
|
|
||||||
st.c._sent[i].l_edge = i
|
st.c._sent[i].l_edge = i
|
||||||
st.c._sent[i].r_edge = i
|
st.c._sent[i].r_edge = i
|
||||||
st.fast_forward()
|
st.fast_forward()
|
||||||
|
@ -371,7 +369,7 @@ cdef class ArcEager(TransitionSystem):
|
||||||
if label.upper() == 'ROOT':
|
if label.upper() == 'ROOT':
|
||||||
label = 'ROOT'
|
label = 'ROOT'
|
||||||
gold.c.heads[i] = gold.heads[i]
|
gold.c.heads[i] = gold.heads[i]
|
||||||
gold.c.labels[i] = self.strings[label]
|
gold.c.labels[i] = self.strings.add(label)
|
||||||
return gold
|
return gold
|
||||||
|
|
||||||
cdef Transition lookup_transition(self, object name) except *:
|
cdef Transition lookup_transition(self, object name) except *:
|
||||||
|
@ -386,14 +384,14 @@ cdef class ArcEager(TransitionSystem):
|
||||||
if self.c[i].move == move and self.c[i].label == label:
|
if self.c[i].move == move and self.c[i].label == label:
|
||||||
return self.c[i]
|
return self.c[i]
|
||||||
|
|
||||||
def move_name(self, int move, int label):
|
def move_name(self, int move, attr_t label):
|
||||||
label_str = self.strings[label]
|
label_str = self.strings[label]
|
||||||
if label_str:
|
if label_str:
|
||||||
return MOVE_NAMES[move] + '-' + label_str
|
return MOVE_NAMES[move] + '-' + label_str
|
||||||
else:
|
else:
|
||||||
return MOVE_NAMES[move]
|
return MOVE_NAMES[move]
|
||||||
|
|
||||||
cdef Transition init_transition(self, int clas, int move, int label) except *:
|
cdef Transition init_transition(self, int clas, int move, attr_t label) except *:
|
||||||
# TODO: Apparent Cython bug here when we try to use the Transition()
|
# TODO: Apparent Cython bug here when we try to use the Transition()
|
||||||
# constructor with the function pointers
|
# constructor with the function pointers
|
||||||
cdef Transition t
|
cdef Transition t
|
||||||
|
@ -426,9 +424,7 @@ cdef class ArcEager(TransitionSystem):
|
||||||
return t
|
return t
|
||||||
|
|
||||||
cdef int initialize_state(self, StateC* st) nogil:
|
cdef int initialize_state(self, StateC* st) nogil:
|
||||||
# Ensure sent_start is set to 0 throughout
|
|
||||||
for i in range(st.length):
|
for i in range(st.length):
|
||||||
st._sent[i].sent_start = False
|
|
||||||
st._sent[i].l_edge = i
|
st._sent[i].l_edge = i
|
||||||
st._sent[i].r_edge = i
|
st._sent[i].r_edge = i
|
||||||
st.fast_forward()
|
st.fast_forward()
|
||||||
|
@ -473,7 +469,7 @@ cdef class ArcEager(TransitionSystem):
|
||||||
label_cost_funcs[RIGHT] = RightArc.label_cost
|
label_cost_funcs[RIGHT] = RightArc.label_cost
|
||||||
label_cost_funcs[BREAK] = Break.label_cost
|
label_cost_funcs[BREAK] = Break.label_cost
|
||||||
|
|
||||||
cdef int* labels = gold.c.labels
|
cdef attr_t* labels = gold.c.labels
|
||||||
cdef int* heads = gold.c.heads
|
cdef int* heads = gold.c.heads
|
||||||
|
|
||||||
n_gold = 0
|
n_gold = 0
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
from .transition_system cimport TransitionSystem
|
from .transition_system cimport TransitionSystem
|
||||||
from .transition_system cimport Transition
|
from .transition_system cimport Transition
|
||||||
from ..gold cimport GoldParseC
|
from ..gold cimport GoldParseC
|
||||||
|
from ..typedefs cimport attr_t
|
||||||
|
|
||||||
|
|
||||||
cdef class BiluoPushDown(TransitionSystem):
|
cdef class BiluoPushDown(TransitionSystem):
|
||||||
|
|
|
@ -100,7 +100,7 @@ cdef class BiluoPushDown(TransitionSystem):
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return (BEGIN, IN, LAST, UNIT, OUT)
|
return (BEGIN, IN, LAST, UNIT, OUT)
|
||||||
|
|
||||||
def move_name(self, int move, int label):
|
def move_name(self, int move, attr_t label):
|
||||||
if move == OUT:
|
if move == OUT:
|
||||||
return 'O'
|
return 'O'
|
||||||
elif move == MISSING:
|
elif move == MISSING:
|
||||||
|
@ -132,7 +132,7 @@ cdef class BiluoPushDown(TransitionSystem):
|
||||||
if label_str.startswith('!'):
|
if label_str.startswith('!'):
|
||||||
label_str = label_str[1:]
|
label_str = label_str[1:]
|
||||||
move_str = 'x'
|
move_str = 'x'
|
||||||
label = self.strings[label_str]
|
label = self.strings.add(label_str)
|
||||||
else:
|
else:
|
||||||
move_str = name
|
move_str = name
|
||||||
label = 0
|
label = 0
|
||||||
|
@ -145,7 +145,7 @@ cdef class BiluoPushDown(TransitionSystem):
|
||||||
else:
|
else:
|
||||||
raise KeyError(name)
|
raise KeyError(name)
|
||||||
|
|
||||||
cdef Transition init_transition(self, int clas, int move, int label) except *:
|
cdef Transition init_transition(self, int clas, int move, attr_t label) except *:
|
||||||
# TODO: Apparent Cython bug here when we try to use the Transition()
|
# TODO: Apparent Cython bug here when we try to use the Transition()
|
||||||
# constructor with the function pointers
|
# constructor with the function pointers
|
||||||
cdef Transition t
|
cdef Transition t
|
||||||
|
@ -194,21 +194,21 @@ cdef class BiluoPushDown(TransitionSystem):
|
||||||
|
|
||||||
cdef class Missing:
|
cdef class Missing:
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef bint is_valid(const StateC* st, int label) nogil:
|
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef int transition(StateC* s, int label) nogil:
|
cdef int transition(StateC* s, attr_t label) nogil:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
|
cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
|
||||||
return 9000
|
return 9000
|
||||||
|
|
||||||
|
|
||||||
cdef class Begin:
|
cdef class Begin:
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef bint is_valid(const StateC* st, int label) nogil:
|
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
||||||
# Ensure we don't clobber preset entities. If no entity preset,
|
# Ensure we don't clobber preset entities. If no entity preset,
|
||||||
# ent_iob is 0
|
# ent_iob is 0
|
||||||
cdef int preset_ent_iob = st.B_(0).ent_iob
|
cdef int preset_ent_iob = st.B_(0).ent_iob
|
||||||
|
@ -232,14 +232,14 @@ cdef class Begin:
|
||||||
return label != 0 and not st.entity_is_open()
|
return label != 0 and not st.entity_is_open()
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef int transition(StateC* st, int label) nogil:
|
cdef int transition(StateC* st, attr_t label) nogil:
|
||||||
st.open_ent(label)
|
st.open_ent(label)
|
||||||
st.set_ent_tag(st.B(0), 3, label)
|
st.set_ent_tag(st.B(0), 3, label)
|
||||||
st.push()
|
st.push()
|
||||||
st.pop()
|
st.pop()
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
|
cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
|
||||||
cdef int g_act = gold.ner[s.B(0)].move
|
cdef int g_act = gold.ner[s.B(0)].move
|
||||||
cdef int g_tag = gold.ner[s.B(0)].label
|
cdef int g_tag = gold.ner[s.B(0)].label
|
||||||
|
|
||||||
|
@ -261,7 +261,7 @@ cdef class Begin:
|
||||||
|
|
||||||
cdef class In:
|
cdef class In:
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef bint is_valid(const StateC* st, int label) nogil:
|
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
||||||
cdef int preset_ent_iob = st.B_(0).ent_iob
|
cdef int preset_ent_iob = st.B_(0).ent_iob
|
||||||
if preset_ent_iob == 2:
|
if preset_ent_iob == 2:
|
||||||
return False
|
return False
|
||||||
|
@ -277,17 +277,17 @@ cdef class In:
|
||||||
return st.entity_is_open() and label != 0 and st.E_(0).ent_type == label
|
return st.entity_is_open() and label != 0 and st.E_(0).ent_type == label
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef int transition(StateC* st, int label) nogil:
|
cdef int transition(StateC* st, attr_t label) nogil:
|
||||||
st.set_ent_tag(st.B(0), 1, label)
|
st.set_ent_tag(st.B(0), 1, label)
|
||||||
st.push()
|
st.push()
|
||||||
st.pop()
|
st.pop()
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
|
cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
|
||||||
move = IN
|
move = IN
|
||||||
cdef int next_act = gold.ner[s.B(1)].move if s.B(0) < s.c.length else OUT
|
cdef int next_act = gold.ner[s.B(1)].move if s.B(0) < s.c.length else OUT
|
||||||
cdef int g_act = gold.ner[s.B(0)].move
|
cdef int g_act = gold.ner[s.B(0)].move
|
||||||
cdef int g_tag = gold.ner[s.B(0)].label
|
cdef attr_t g_tag = gold.ner[s.B(0)].label
|
||||||
cdef bint is_sunk = _entity_is_sunk(s, gold.ner)
|
cdef bint is_sunk = _entity_is_sunk(s, gold.ner)
|
||||||
|
|
||||||
if g_act == MISSING:
|
if g_act == MISSING:
|
||||||
|
@ -313,24 +313,24 @@ cdef class In:
|
||||||
|
|
||||||
cdef class Last:
|
cdef class Last:
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef bint is_valid(const StateC* st, int label) nogil:
|
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
||||||
if st.B_(1).ent_iob == 1:
|
if st.B_(1).ent_iob == 1:
|
||||||
return False
|
return False
|
||||||
return st.entity_is_open() and label != 0 and st.E_(0).ent_type == label
|
return st.entity_is_open() and label != 0 and st.E_(0).ent_type == label
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef int transition(StateC* st, int label) nogil:
|
cdef int transition(StateC* st, attr_t label) nogil:
|
||||||
st.close_ent()
|
st.close_ent()
|
||||||
st.set_ent_tag(st.B(0), 1, label)
|
st.set_ent_tag(st.B(0), 1, label)
|
||||||
st.push()
|
st.push()
|
||||||
st.pop()
|
st.pop()
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
|
cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
|
||||||
move = LAST
|
move = LAST
|
||||||
|
|
||||||
cdef int g_act = gold.ner[s.B(0)].move
|
cdef int g_act = gold.ner[s.B(0)].move
|
||||||
cdef int g_tag = gold.ner[s.B(0)].label
|
cdef attr_t g_tag = gold.ner[s.B(0)].label
|
||||||
|
|
||||||
if g_act == MISSING:
|
if g_act == MISSING:
|
||||||
return 0
|
return 0
|
||||||
|
@ -355,7 +355,7 @@ cdef class Last:
|
||||||
|
|
||||||
cdef class Unit:
|
cdef class Unit:
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef bint is_valid(const StateC* st, int label) nogil:
|
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
||||||
cdef int preset_ent_iob = st.B_(0).ent_iob
|
cdef int preset_ent_iob = st.B_(0).ent_iob
|
||||||
if preset_ent_iob == 2:
|
if preset_ent_iob == 2:
|
||||||
return False
|
return False
|
||||||
|
@ -368,7 +368,7 @@ cdef class Unit:
|
||||||
return label != 0 and not st.entity_is_open()
|
return label != 0 and not st.entity_is_open()
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef int transition(StateC* st, int label) nogil:
|
cdef int transition(StateC* st, attr_t label) nogil:
|
||||||
st.open_ent(label)
|
st.open_ent(label)
|
||||||
st.close_ent()
|
st.close_ent()
|
||||||
st.set_ent_tag(st.B(0), 3, label)
|
st.set_ent_tag(st.B(0), 3, label)
|
||||||
|
@ -376,9 +376,9 @@ cdef class Unit:
|
||||||
st.pop()
|
st.pop()
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
|
cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
|
||||||
cdef int g_act = gold.ner[s.B(0)].move
|
cdef int g_act = gold.ner[s.B(0)].move
|
||||||
cdef int g_tag = gold.ner[s.B(0)].label
|
cdef attr_t g_tag = gold.ner[s.B(0)].label
|
||||||
|
|
||||||
if g_act == MISSING:
|
if g_act == MISSING:
|
||||||
return 0
|
return 0
|
||||||
|
@ -398,7 +398,7 @@ cdef class Unit:
|
||||||
|
|
||||||
cdef class Out:
|
cdef class Out:
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef bint is_valid(const StateC* st, int label) nogil:
|
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
||||||
cdef int preset_ent_iob = st.B_(0).ent_iob
|
cdef int preset_ent_iob = st.B_(0).ent_iob
|
||||||
if preset_ent_iob == 3:
|
if preset_ent_iob == 3:
|
||||||
return False
|
return False
|
||||||
|
@ -407,15 +407,15 @@ cdef class Out:
|
||||||
return not st.entity_is_open()
|
return not st.entity_is_open()
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef int transition(StateC* st, int label) nogil:
|
cdef int transition(StateC* st, attr_t label) nogil:
|
||||||
st.set_ent_tag(st.B(0), 2, 0)
|
st.set_ent_tag(st.B(0), 2, 0)
|
||||||
st.push()
|
st.push()
|
||||||
st.pop()
|
st.pop()
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
|
cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
|
||||||
cdef int g_act = gold.ner[s.B(0)].move
|
cdef int g_act = gold.ner[s.B(0)].move
|
||||||
cdef int g_tag = gold.ner[s.B(0)].label
|
cdef attr_t g_tag = gold.ner[s.B(0)].label
|
||||||
|
|
||||||
if g_act == MISSING or g_act == ISNT:
|
if g_act == MISSING or g_act == ISNT:
|
||||||
return 0
|
return 0
|
||||||
|
|
|
@ -428,7 +428,7 @@ cdef class Parser:
|
||||||
|
|
||||||
cuda_stream = get_cuda_stream()
|
cuda_stream = get_cuda_stream()
|
||||||
|
|
||||||
states, golds, max_length = self._init_gold_batch(docs, golds)
|
states, golds, max_steps = self._init_gold_batch(docs, golds)
|
||||||
state2vec, vec2scores = self.get_batch_model(len(states), tokvecs, cuda_stream,
|
state2vec, vec2scores = self.get_batch_model(len(states), tokvecs, cuda_stream,
|
||||||
0.0)
|
0.0)
|
||||||
todo = [(s, g) for (s, g) in zip(states, golds)
|
todo = [(s, g) for (s, g) in zip(states, golds)
|
||||||
|
@ -439,6 +439,7 @@ cdef class Parser:
|
||||||
backprops = []
|
backprops = []
|
||||||
d_tokvecs = state2vec.ops.allocate(tokvecs.shape)
|
d_tokvecs = state2vec.ops.allocate(tokvecs.shape)
|
||||||
cdef float loss = 0.
|
cdef float loss = 0.
|
||||||
|
n_steps = 0
|
||||||
while todo:
|
while todo:
|
||||||
states, golds = zip(*todo)
|
states, golds = zip(*todo)
|
||||||
|
|
||||||
|
@ -450,7 +451,7 @@ cdef class Parser:
|
||||||
scores, bp_scores = vec2scores.begin_update(vector, drop=drop)
|
scores, bp_scores = vec2scores.begin_update(vector, drop=drop)
|
||||||
|
|
||||||
d_scores = self.get_batch_loss(states, golds, scores)
|
d_scores = self.get_batch_loss(states, golds, scores)
|
||||||
d_vector = bp_scores(d_scores, sgd=sgd)
|
d_vector = bp_scores(d_scores / d_scores.shape[0], sgd=sgd)
|
||||||
if drop != 0:
|
if drop != 0:
|
||||||
d_vector *= mask
|
d_vector *= mask
|
||||||
|
|
||||||
|
@ -468,7 +469,8 @@ cdef class Parser:
|
||||||
todo = [st for st in todo if not st[0].is_final()]
|
todo = [st for st in todo if not st[0].is_final()]
|
||||||
if losses is not None:
|
if losses is not None:
|
||||||
losses[self.name] += (d_scores**2).sum()
|
losses[self.name] += (d_scores**2).sum()
|
||||||
if len(backprops) >= (max_length * 2):
|
n_steps += 1
|
||||||
|
if n_steps >= max_steps:
|
||||||
break
|
break
|
||||||
self._make_updates(d_tokvecs,
|
self._make_updates(d_tokvecs,
|
||||||
backprops, sgd, cuda_stream)
|
backprops, sgd, cuda_stream)
|
||||||
|
@ -483,7 +485,8 @@ cdef class Parser:
|
||||||
StateClass state
|
StateClass state
|
||||||
Transition action
|
Transition action
|
||||||
whole_states = self.moves.init_batch(whole_docs)
|
whole_states = self.moves.init_batch(whole_docs)
|
||||||
max_length = max(5, min(20, min([len(doc) for doc in whole_docs])))
|
max_length = max(5, min(50, min([len(doc) for doc in whole_docs])))
|
||||||
|
max_moves = 0
|
||||||
states = []
|
states = []
|
||||||
golds = []
|
golds = []
|
||||||
for doc, state, gold in zip(whole_docs, whole_states, whole_golds):
|
for doc, state, gold in zip(whole_docs, whole_states, whole_golds):
|
||||||
|
@ -494,16 +497,20 @@ cdef class Parser:
|
||||||
start = 0
|
start = 0
|
||||||
while start < len(doc):
|
while start < len(doc):
|
||||||
state = state.copy()
|
state = state.copy()
|
||||||
|
n_moves = 0
|
||||||
while state.B(0) < start and not state.is_final():
|
while state.B(0) < start and not state.is_final():
|
||||||
action = self.moves.c[oracle_actions.pop(0)]
|
action = self.moves.c[oracle_actions.pop(0)]
|
||||||
action.do(state.c, action.label)
|
action.do(state.c, action.label)
|
||||||
|
n_moves += 1
|
||||||
has_gold = self.moves.has_gold(gold, start=start,
|
has_gold = self.moves.has_gold(gold, start=start,
|
||||||
end=start+max_length)
|
end=start+max_length)
|
||||||
if not state.is_final() and has_gold:
|
if not state.is_final() and has_gold:
|
||||||
states.append(state)
|
states.append(state)
|
||||||
golds.append(gold)
|
golds.append(gold)
|
||||||
|
max_moves = max(max_moves, n_moves)
|
||||||
start += min(max_length, len(doc)-start)
|
start += min(max_length, len(doc)-start)
|
||||||
return states, golds, max_length
|
max_moves = max(max_moves, len(oracle_actions))
|
||||||
|
return states, golds, max_moves
|
||||||
|
|
||||||
def _make_updates(self, d_tokvecs, backprops, sgd, cuda_stream=None):
|
def _make_updates(self, d_tokvecs, backprops, sgd, cuda_stream=None):
|
||||||
# Tells CUDA to block, so our async copies complete.
|
# Tells CUDA to block, so our async copies complete.
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
from cymem.cymem cimport Pool
|
from cymem.cymem cimport Pool
|
||||||
from thinc.typedefs cimport weight_t
|
from thinc.typedefs cimport weight_t
|
||||||
|
|
||||||
|
from ..typedefs cimport attr_t
|
||||||
from ..structs cimport TokenC
|
from ..structs cimport TokenC
|
||||||
from ..gold cimport GoldParse
|
from ..gold cimport GoldParse
|
||||||
from ..gold cimport GoldParseC
|
from ..gold cimport GoldParseC
|
||||||
|
@ -13,20 +14,22 @@ from ._state cimport StateC
|
||||||
cdef struct Transition:
|
cdef struct Transition:
|
||||||
int clas
|
int clas
|
||||||
int move
|
int move
|
||||||
int label
|
attr_t label
|
||||||
|
|
||||||
weight_t score
|
weight_t score
|
||||||
|
|
||||||
bint (*is_valid)(const StateC* state, int label) nogil
|
bint (*is_valid)(const StateC* state, attr_t label) nogil
|
||||||
weight_t (*get_cost)(StateClass state, const GoldParseC* gold, int label) nogil
|
weight_t (*get_cost)(StateClass state, const GoldParseC* gold, attr_t label) nogil
|
||||||
int (*do)(StateC* state, int label) nogil
|
int (*do)(StateC* state, attr_t label) nogil
|
||||||
|
|
||||||
|
|
||||||
ctypedef weight_t (*get_cost_func_t)(StateClass state, const GoldParseC* gold, int label) nogil
|
ctypedef weight_t (*get_cost_func_t)(StateClass state, const GoldParseC* gold,
|
||||||
|
attr_tlabel) nogil
|
||||||
ctypedef weight_t (*move_cost_func_t)(StateClass state, const GoldParseC* gold) nogil
|
ctypedef weight_t (*move_cost_func_t)(StateClass state, const GoldParseC* gold) nogil
|
||||||
ctypedef weight_t (*label_cost_func_t)(StateClass state, const GoldParseC* gold, int label) nogil
|
ctypedef weight_t (*label_cost_func_t)(StateClass state, const GoldParseC*
|
||||||
|
gold, attr_t label) nogil
|
||||||
|
|
||||||
ctypedef int (*do_func_t)(StateC* state, int label) nogil
|
ctypedef int (*do_func_t)(StateC* state, attr_t label) nogil
|
||||||
|
|
||||||
ctypedef void* (*init_state_t)(Pool mem, int length, void* tokens) except NULL
|
ctypedef void* (*init_state_t)(Pool mem, int length, void* tokens) except NULL
|
||||||
|
|
||||||
|
@ -36,7 +39,7 @@ cdef class TransitionSystem:
|
||||||
cdef Transition* c
|
cdef Transition* c
|
||||||
cdef readonly int n_moves
|
cdef readonly int n_moves
|
||||||
cdef int _size
|
cdef int _size
|
||||||
cdef public int root_label
|
cdef public attr_t root_label
|
||||||
cdef public freqs
|
cdef public freqs
|
||||||
cdef init_state_t init_beam_state
|
cdef init_state_t init_beam_state
|
||||||
|
|
||||||
|
@ -45,7 +48,7 @@ cdef class TransitionSystem:
|
||||||
|
|
||||||
cdef Transition lookup_transition(self, object name) except *
|
cdef Transition lookup_transition(self, object name) except *
|
||||||
|
|
||||||
cdef Transition init_transition(self, int clas, int move, int label) except *
|
cdef Transition init_transition(self, int clas, int move, attr_t label) except *
|
||||||
|
|
||||||
cdef int set_valid(self, int* output, const StateC* st) nogil
|
cdef int set_valid(self, int* output, const StateC* st) nogil
|
||||||
|
|
||||||
|
|
|
@ -99,7 +99,7 @@ cdef class TransitionSystem:
|
||||||
cdef Transition lookup_transition(self, object name) except *:
|
cdef Transition lookup_transition(self, object name) except *:
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
cdef Transition init_transition(self, int clas, int move, int label) except *:
|
cdef Transition init_transition(self, int clas, int move, attr_t label) except *:
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
def is_valid(self, StateClass stcls, move_name):
|
def is_valid(self, StateClass stcls, move_name):
|
||||||
|
|
|
@ -204,6 +204,7 @@ def test_doc_api_right_edge(en_tokenizer):
|
||||||
assert doc[6].right_edge.text == ','
|
assert doc[6].right_edge.text == ','
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.xfail
|
||||||
@pytest.mark.parametrize('text,vectors', [
|
@pytest.mark.parametrize('text,vectors', [
|
||||||
("apple orange pear", ["apple -1 -1 -1", "orange -1 -1 0", "pear -1 0 -1"])
|
("apple orange pear", ["apple -1 -1 -1", "orange -1 -1 0", "pear -1 0 -1"])
|
||||||
])
|
])
|
||||||
|
|
|
@ -20,7 +20,7 @@ def test_doc_noun_chunks_not_nested(en_tokenizer):
|
||||||
tokens.from_array(
|
tokens.from_array(
|
||||||
[HEAD, DEP],
|
[HEAD, DEP],
|
||||||
numpy.asarray([[1, nsubj], [0, root], [4, amod], [3, nmod], [-1, cc],
|
numpy.asarray([[1, nsubj], [0, root], [4, amod], [3, nmod], [-1, cc],
|
||||||
[-2, conj], [-5, dobj]], dtype='int32'))
|
[-2, conj], [-5, dobj]], dtype='uint64'))
|
||||||
tokens.noun_chunks_iterator = english_noun_chunks
|
tokens.noun_chunks_iterator = english_noun_chunks
|
||||||
word_occurred = {}
|
word_occurred = {}
|
||||||
for chunk in tokens.noun_chunks:
|
for chunk in tokens.noun_chunks:
|
||||||
|
|
|
@ -68,6 +68,7 @@ def test_doc_token_api_is_properties(en_vocab):
|
||||||
assert doc[5].like_email
|
assert doc[5].like_email
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.xfail
|
||||||
@pytest.mark.parametrize('text,vectors', [
|
@pytest.mark.parametrize('text,vectors', [
|
||||||
("apples oranges ldskbjls", ["apples -1 -1 -1", "oranges -1 -1 0"])
|
("apples oranges ldskbjls", ["apples -1 -1 -1", "oranges -1 -1 0"])
|
||||||
])
|
])
|
||||||
|
|
|
@ -15,7 +15,9 @@ def test_issue615(en_tokenizer):
|
||||||
# Get Span objects
|
# Get Span objects
|
||||||
spans = [(ent_id, ent_id, doc[start : end]) for ent_id, start, end in matches]
|
spans = [(ent_id, ent_id, doc[start : end]) for ent_id, start, end in matches]
|
||||||
for ent_id, label, span in spans:
|
for ent_id, label, span in spans:
|
||||||
span.merge('NNP' if label else span.root.tag_, span.text, doc.vocab.strings[label])
|
span.merge(tag='NNP' if label else span.root.tag_, lemma=span.text,
|
||||||
|
label=label)
|
||||||
|
doc.ents = doc.ents + ((label, span.start, span.end),)
|
||||||
|
|
||||||
text = "The golf club is broken"
|
text = "The golf club is broken"
|
||||||
pattern = [{'ORTH': "golf"}, {'ORTH': "club"}]
|
pattern = [{'ORTH': "golf"}, {'ORTH': "club"}]
|
||||||
|
@ -25,6 +27,7 @@ def test_issue615(en_tokenizer):
|
||||||
matcher = Matcher(doc.vocab)
|
matcher = Matcher(doc.vocab)
|
||||||
matcher.add(label, merge_phrases, pattern)
|
matcher.add(label, merge_phrases, pattern)
|
||||||
match = matcher(doc)
|
match = matcher(doc)
|
||||||
|
print(match)
|
||||||
entities = list(doc.ents)
|
entities = list(doc.ents)
|
||||||
|
|
||||||
assert entities != [] #assertion 1
|
assert entities != [] #assertion 1
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
# coding: utf-8
|
# coding: utf-8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
word2vec_str = """, -0.046107 -0.035951 -0.560418
|
word2vec_str = """, -0.046107 -0.035951 -0.560418
|
||||||
|
@ -8,6 +9,7 @@ de -0.648927 -0.400976 -0.527124
|
||||||
\u00A0 -1.499184 -0.184280 -0.598371"""
|
\u00A0 -1.499184 -0.184280 -0.598371"""
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.xfail
|
||||||
def test_issue834(en_vocab, text_file):
|
def test_issue834(en_vocab, text_file):
|
||||||
"""Test that no-break space (U+00A0) is detected as space by the load_vectors function."""
|
"""Test that no-break space (U+00A0) is detected as space by the load_vectors function."""
|
||||||
text_file.write(word2vec_str)
|
text_file.write(word2vec_str)
|
||||||
|
|
|
@ -7,6 +7,7 @@ from __future__ import unicode_literals
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.xfail
|
||||||
@pytest.mark.parametrize('text', [["a", "b", "c"]])
|
@pytest.mark.parametrize('text', [["a", "b", "c"]])
|
||||||
def test_stringstore_freeze_oov(stringstore, text):
|
def test_stringstore_freeze_oov(stringstore, text):
|
||||||
assert stringstore[text[0]] == 1
|
assert stringstore[text[0]] == 1
|
||||||
|
|
|
@ -8,69 +8,65 @@ import pytest
|
||||||
|
|
||||||
@pytest.mark.parametrize('text1,text2,text3', [(b'Hello', b'goodbye', b'hello')])
|
@pytest.mark.parametrize('text1,text2,text3', [(b'Hello', b'goodbye', b'hello')])
|
||||||
def test_stringstore_save_bytes(stringstore, text1, text2, text3):
|
def test_stringstore_save_bytes(stringstore, text1, text2, text3):
|
||||||
i = stringstore[text1]
|
key = stringstore.add(text1)
|
||||||
assert i == 1
|
assert stringstore[text1] == key
|
||||||
assert stringstore[text1] == 1
|
assert stringstore[text2] != key
|
||||||
assert stringstore[text2] != i
|
assert stringstore[text3] != key
|
||||||
assert stringstore[text3] != i
|
|
||||||
assert i == 1
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('text1,text2,text3', [('Hello', 'goodbye', 'hello')])
|
@pytest.mark.parametrize('text1,text2,text3', [('Hello', 'goodbye', 'hello')])
|
||||||
def test_stringstore_save_unicode(stringstore, text1, text2, text3):
|
def test_stringstore_save_unicode(stringstore, text1, text2, text3):
|
||||||
i = stringstore[text1]
|
key = stringstore.add(text1)
|
||||||
assert i == 1
|
assert stringstore[text1] == key
|
||||||
assert stringstore[text1] == 1
|
assert stringstore[text2] != key
|
||||||
assert stringstore[text2] != i
|
assert stringstore[text3] != key
|
||||||
assert stringstore[text3] != i
|
|
||||||
assert i == 1
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('text', [b'A'])
|
@pytest.mark.parametrize('text', [b'A'])
|
||||||
def test_stringstore_retrieve_id(stringstore, text):
|
def test_stringstore_retrieve_id(stringstore, text):
|
||||||
i = stringstore[text]
|
key = stringstore.add(text)
|
||||||
assert stringstore.size == 1
|
assert len(stringstore) == 1
|
||||||
assert stringstore[1] == text.decode('utf8')
|
assert stringstore[key] == text.decode('utf8')
|
||||||
with pytest.raises(IndexError):
|
with pytest.raises(KeyError):
|
||||||
stringstore[2]
|
stringstore[20000]
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('text1,text2', [(b'0123456789', b'A')])
|
@pytest.mark.parametrize('text1,text2', [(b'0123456789', b'A')])
|
||||||
def test_stringstore_med_string(stringstore, text1, text2):
|
def test_stringstore_med_string(stringstore, text1, text2):
|
||||||
store = stringstore[text1]
|
store = stringstore.add(text1)
|
||||||
assert stringstore[store] == text1.decode('utf8')
|
assert stringstore[store] == text1.decode('utf8')
|
||||||
dummy = stringstore[text2]
|
dummy = stringstore.add(text2)
|
||||||
assert stringstore[text1] == store
|
assert stringstore[text1] == store
|
||||||
|
|
||||||
|
|
||||||
def test_stringstore_long_string(stringstore):
|
def test_stringstore_long_string(stringstore):
|
||||||
text = "INFORMATIVE](http://www.google.com/search?as_q=RedditMonkey&hl=en&num=50&btnG=Google+Search&as_epq=&as_oq=&as_eq=&lr=&as_ft=i&as_filetype=&as_qdr=all&as_nlo=&as_nhi=&as_occt=any&as_dt=i&as_sitesearch=&as_rights=&safe=off"
|
text = "INFORMATIVE](http://www.google.com/search?as_q=RedditMonkey&hl=en&num=50&btnG=Google+Search&as_epq=&as_oq=&as_eq=&lr=&as_ft=i&as_filetype=&as_qdr=all&as_nlo=&as_nhi=&as_occt=any&as_dt=i&as_sitesearch=&as_rights=&safe=off"
|
||||||
store = stringstore[text]
|
store = stringstore.add(text)
|
||||||
assert stringstore[store] == text
|
assert stringstore[store] == text
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('factor', [254, 255, 256])
|
@pytest.mark.parametrize('factor', [254, 255, 256])
|
||||||
def test_stringstore_multiply(stringstore, factor):
|
def test_stringstore_multiply(stringstore, factor):
|
||||||
text = 'a' * factor
|
text = 'a' * factor
|
||||||
store = stringstore[text]
|
store = stringstore.add(text)
|
||||||
assert stringstore[store] == text
|
assert stringstore[store] == text
|
||||||
|
|
||||||
|
|
||||||
def test_stringstore_massive_strings(stringstore):
|
def test_stringstore_massive_strings(stringstore):
|
||||||
text = 'a' * 511
|
text = 'a' * 511
|
||||||
store = stringstore[text]
|
store = stringstore.add(text)
|
||||||
assert stringstore[store] == text
|
assert stringstore[store] == text
|
||||||
text2 = 'z' * 512
|
text2 = 'z' * 512
|
||||||
store = stringstore[text2]
|
store = stringstore.add(text2)
|
||||||
assert stringstore[store] == text2
|
assert stringstore[store] == text2
|
||||||
text3 = '1' * 513
|
text3 = '1' * 513
|
||||||
store = stringstore[text3]
|
store = stringstore.add(text3)
|
||||||
assert stringstore[store] == text3
|
assert stringstore[store] == text3
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('text', ["qqqqq"])
|
@pytest.mark.parametrize('text', ["qqqqq"])
|
||||||
def test_stringstore_to_bytes(stringstore, text):
|
def test_stringstore_to_bytes(stringstore, text):
|
||||||
store = stringstore[text]
|
store = stringstore.add(text)
|
||||||
serialized = stringstore.to_bytes()
|
serialized = stringstore.to_bytes()
|
||||||
new_stringstore = StringStore().from_bytes(serialized)
|
new_stringstore = StringStore().from_bytes(serialized)
|
||||||
assert new_stringstore[store] == text
|
assert new_stringstore[store] == text
|
||||||
|
|
|
@ -10,8 +10,11 @@ import numpy
|
||||||
def get_doc(vocab, words=[], pos=None, heads=None, deps=None, tags=None, ents=None):
|
def get_doc(vocab, words=[], pos=None, heads=None, deps=None, tags=None, ents=None):
|
||||||
"""Create Doc object from given vocab, words and annotations."""
|
"""Create Doc object from given vocab, words and annotations."""
|
||||||
pos = pos or [''] * len(words)
|
pos = pos or [''] * len(words)
|
||||||
|
tags = tags or [''] * len(words)
|
||||||
heads = heads or [0] * len(words)
|
heads = heads or [0] * len(words)
|
||||||
deps = deps or [''] * len(words)
|
deps = deps or [''] * len(words)
|
||||||
|
for value in (deps+tags+pos):
|
||||||
|
vocab.strings.add(value)
|
||||||
|
|
||||||
doc = Doc(vocab, words=words)
|
doc = Doc(vocab, words=words)
|
||||||
attrs = doc.to_array([POS, HEAD, DEP])
|
attrs = doc.to_array([POS, HEAD, DEP])
|
||||||
|
|
|
@ -16,7 +16,7 @@ def vectors():
|
||||||
def vocab(en_vocab, vectors):
|
def vocab(en_vocab, vectors):
|
||||||
return add_vecs_to_vocab(en_vocab, vectors)
|
return add_vecs_to_vocab(en_vocab, vectors)
|
||||||
|
|
||||||
|
@pytest.mark.xfail
|
||||||
def test_vectors_similarity_LL(vocab, vectors):
|
def test_vectors_similarity_LL(vocab, vectors):
|
||||||
[(word1, vec1), (word2, vec2)] = vectors
|
[(word1, vec1), (word2, vec2)] = vectors
|
||||||
lex1 = vocab[word1]
|
lex1 = vocab[word1]
|
||||||
|
@ -30,6 +30,7 @@ def test_vectors_similarity_LL(vocab, vectors):
|
||||||
assert numpy.isclose(lex2.similarity(lex2), lex1.similarity(lex1))
|
assert numpy.isclose(lex2.similarity(lex2), lex1.similarity(lex1))
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.xfail
|
||||||
def test_vectors_similarity_TT(vocab, vectors):
|
def test_vectors_similarity_TT(vocab, vectors):
|
||||||
[(word1, vec1), (word2, vec2)] = vectors
|
[(word1, vec1), (word2, vec2)] = vectors
|
||||||
doc = get_doc(vocab, words=[word1, word2])
|
doc = get_doc(vocab, words=[word1, word2])
|
||||||
|
@ -42,18 +43,21 @@ def test_vectors_similarity_TT(vocab, vectors):
|
||||||
assert numpy.isclose(doc[1].similarity(doc[0]), doc[0].similarity(doc[1]))
|
assert numpy.isclose(doc[1].similarity(doc[0]), doc[0].similarity(doc[1]))
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.xfail
|
||||||
def test_vectors_similarity_TD(vocab, vectors):
|
def test_vectors_similarity_TD(vocab, vectors):
|
||||||
[(word1, vec1), (word2, vec2)] = vectors
|
[(word1, vec1), (word2, vec2)] = vectors
|
||||||
doc = get_doc(vocab, words=[word1, word2])
|
doc = get_doc(vocab, words=[word1, word2])
|
||||||
assert doc.similarity(doc[0]) == doc[0].similarity(doc)
|
assert doc.similarity(doc[0]) == doc[0].similarity(doc)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.xfail
|
||||||
def test_vectors_similarity_DS(vocab, vectors):
|
def test_vectors_similarity_DS(vocab, vectors):
|
||||||
[(word1, vec1), (word2, vec2)] = vectors
|
[(word1, vec1), (word2, vec2)] = vectors
|
||||||
doc = get_doc(vocab, words=[word1, word2])
|
doc = get_doc(vocab, words=[word1, word2])
|
||||||
assert doc.similarity(doc[:2]) == doc[:2].similarity(doc)
|
assert doc.similarity(doc[:2]) == doc[:2].similarity(doc)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.xfail
|
||||||
def test_vectors_similarity_TS(vocab, vectors):
|
def test_vectors_similarity_TS(vocab, vectors):
|
||||||
[(word1, vec1), (word2, vec2)] = vectors
|
[(word1, vec1), (word2, vec2)] = vectors
|
||||||
doc = get_doc(vocab, words=[word1, word2])
|
doc = get_doc(vocab, words=[word1, word2])
|
||||||
|
|
|
@ -22,6 +22,7 @@ def tokenizer_v(vocab):
|
||||||
return Tokenizer(vocab, {}, None, None, None)
|
return Tokenizer(vocab, {}, None, None, None)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.xfail
|
||||||
@pytest.mark.parametrize('text', ["apple and orange"])
|
@pytest.mark.parametrize('text', ["apple and orange"])
|
||||||
def test_vectors_token_vector(tokenizer_v, vectors, text):
|
def test_vectors_token_vector(tokenizer_v, vectors, text):
|
||||||
doc = tokenizer_v(text)
|
doc = tokenizer_v(text)
|
||||||
|
@ -29,6 +30,7 @@ def test_vectors_token_vector(tokenizer_v, vectors, text):
|
||||||
assert vectors[1] == (doc[2].text, list(doc[2].vector))
|
assert vectors[1] == (doc[2].text, list(doc[2].vector))
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.xfail
|
||||||
@pytest.mark.parametrize('text', ["apple", "orange"])
|
@pytest.mark.parametrize('text', ["apple", "orange"])
|
||||||
def test_vectors_lexeme_vector(vocab, text):
|
def test_vectors_lexeme_vector(vocab, text):
|
||||||
lex = vocab[text]
|
lex = vocab[text]
|
||||||
|
@ -36,6 +38,7 @@ def test_vectors_lexeme_vector(vocab, text):
|
||||||
assert lex.vector_norm
|
assert lex.vector_norm
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.xfail
|
||||||
@pytest.mark.parametrize('text', [["apple", "and", "orange"]])
|
@pytest.mark.parametrize('text', [["apple", "and", "orange"]])
|
||||||
def test_vectors_doc_vector(vocab, text):
|
def test_vectors_doc_vector(vocab, text):
|
||||||
doc = get_doc(vocab, text)
|
doc = get_doc(vocab, text)
|
||||||
|
@ -43,6 +46,7 @@ def test_vectors_doc_vector(vocab, text):
|
||||||
assert doc.vector_norm
|
assert doc.vector_norm
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.xfail
|
||||||
@pytest.mark.parametrize('text', [["apple", "and", "orange"]])
|
@pytest.mark.parametrize('text', [["apple", "and", "orange"]])
|
||||||
def test_vectors_span_vector(vocab, text):
|
def test_vectors_span_vector(vocab, text):
|
||||||
span = get_doc(vocab, text)[0:2]
|
span = get_doc(vocab, text)[0:2]
|
||||||
|
@ -50,6 +54,7 @@ def test_vectors_span_vector(vocab, text):
|
||||||
assert span.vector_norm
|
assert span.vector_norm
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.xfail
|
||||||
@pytest.mark.parametrize('text', ["apple orange"])
|
@pytest.mark.parametrize('text', ["apple orange"])
|
||||||
def test_vectors_token_token_similarity(tokenizer_v, text):
|
def test_vectors_token_token_similarity(tokenizer_v, text):
|
||||||
doc = tokenizer_v(text)
|
doc = tokenizer_v(text)
|
||||||
|
@ -57,6 +62,7 @@ def test_vectors_token_token_similarity(tokenizer_v, text):
|
||||||
assert 0.0 < doc[0].similarity(doc[1]) < 1.0
|
assert 0.0 < doc[0].similarity(doc[1]) < 1.0
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.xfail
|
||||||
@pytest.mark.parametrize('text1,text2', [("apple", "orange")])
|
@pytest.mark.parametrize('text1,text2', [("apple", "orange")])
|
||||||
def test_vectors_token_lexeme_similarity(tokenizer_v, vocab, text1, text2):
|
def test_vectors_token_lexeme_similarity(tokenizer_v, vocab, text1, text2):
|
||||||
token = tokenizer_v(text1)
|
token = tokenizer_v(text1)
|
||||||
|
@ -65,6 +71,7 @@ def test_vectors_token_lexeme_similarity(tokenizer_v, vocab, text1, text2):
|
||||||
assert 0.0 < token.similarity(lex) < 1.0
|
assert 0.0 < token.similarity(lex) < 1.0
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.xfail
|
||||||
@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
|
@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
|
||||||
def test_vectors_token_span_similarity(vocab, text):
|
def test_vectors_token_span_similarity(vocab, text):
|
||||||
doc = get_doc(vocab, text)
|
doc = get_doc(vocab, text)
|
||||||
|
@ -72,6 +79,7 @@ def test_vectors_token_span_similarity(vocab, text):
|
||||||
assert 0.0 < doc[0].similarity(doc[1:3]) < 1.0
|
assert 0.0 < doc[0].similarity(doc[1:3]) < 1.0
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.xfail
|
||||||
@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
|
@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
|
||||||
def test_vectors_token_doc_similarity(vocab, text):
|
def test_vectors_token_doc_similarity(vocab, text):
|
||||||
doc = get_doc(vocab, text)
|
doc = get_doc(vocab, text)
|
||||||
|
@ -79,6 +87,7 @@ def test_vectors_token_doc_similarity(vocab, text):
|
||||||
assert 0.0 < doc[0].similarity(doc) < 1.0
|
assert 0.0 < doc[0].similarity(doc) < 1.0
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.xfail
|
||||||
@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
|
@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
|
||||||
def test_vectors_lexeme_span_similarity(vocab, text):
|
def test_vectors_lexeme_span_similarity(vocab, text):
|
||||||
doc = get_doc(vocab, text)
|
doc = get_doc(vocab, text)
|
||||||
|
@ -87,6 +96,7 @@ def test_vectors_lexeme_span_similarity(vocab, text):
|
||||||
assert 0.0 < doc.similarity(doc[1:3]) < 1.0
|
assert 0.0 < doc.similarity(doc[1:3]) < 1.0
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.xfail
|
||||||
@pytest.mark.parametrize('text1,text2', [("apple", "orange")])
|
@pytest.mark.parametrize('text1,text2', [("apple", "orange")])
|
||||||
def test_vectors_lexeme_lexeme_similarity(vocab, text1, text2):
|
def test_vectors_lexeme_lexeme_similarity(vocab, text1, text2):
|
||||||
lex1 = vocab[text1]
|
lex1 = vocab[text1]
|
||||||
|
@ -95,6 +105,7 @@ def test_vectors_lexeme_lexeme_similarity(vocab, text1, text2):
|
||||||
assert 0.0 < lex1.similarity(lex2) < 1.0
|
assert 0.0 < lex1.similarity(lex2) < 1.0
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.xfail
|
||||||
@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
|
@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
|
||||||
def test_vectors_lexeme_doc_similarity(vocab, text):
|
def test_vectors_lexeme_doc_similarity(vocab, text):
|
||||||
doc = get_doc(vocab, text)
|
doc = get_doc(vocab, text)
|
||||||
|
@ -103,6 +114,7 @@ def test_vectors_lexeme_doc_similarity(vocab, text):
|
||||||
assert 0.0 < lex.similarity(doc) < 1.0
|
assert 0.0 < lex.similarity(doc) < 1.0
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.xfail
|
||||||
@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
|
@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
|
||||||
def test_vectors_span_span_similarity(vocab, text):
|
def test_vectors_span_span_similarity(vocab, text):
|
||||||
doc = get_doc(vocab, text)
|
doc = get_doc(vocab, text)
|
||||||
|
@ -110,6 +122,7 @@ def test_vectors_span_span_similarity(vocab, text):
|
||||||
assert 0.0 < doc[0:2].similarity(doc[1:3]) < 1.0
|
assert 0.0 < doc[0:2].similarity(doc[1:3]) < 1.0
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.xfail
|
||||||
@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
|
@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
|
||||||
def test_vectors_span_doc_similarity(vocab, text):
|
def test_vectors_span_doc_similarity(vocab, text):
|
||||||
doc = get_doc(vocab, text)
|
doc = get_doc(vocab, text)
|
||||||
|
@ -117,6 +130,7 @@ def test_vectors_span_doc_similarity(vocab, text):
|
||||||
assert 0.0 < doc[0:2].similarity(doc) < 1.0
|
assert 0.0 < doc[0:2].similarity(doc) < 1.0
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.xfail
|
||||||
@pytest.mark.parametrize('text1,text2', [
|
@pytest.mark.parametrize('text1,text2', [
|
||||||
(["apple", "and", "apple", "pie"], ["orange", "juice"])])
|
(["apple", "and", "apple", "pie"], ["orange", "juice"])])
|
||||||
def test_vectors_doc_doc_similarity(vocab, text1, text2):
|
def test_vectors_doc_doc_similarity(vocab, text1, text2):
|
||||||
|
|
|
@ -5,6 +5,7 @@ import numpy
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.xfail
|
||||||
@pytest.mark.parametrize('text', ["Hello"])
|
@pytest.mark.parametrize('text', ["Hello"])
|
||||||
def test_vocab_add_vector(en_vocab, text):
|
def test_vocab_add_vector(en_vocab, text):
|
||||||
en_vocab.resize_vectors(10)
|
en_vocab.resize_vectors(10)
|
||||||
|
|
|
@ -11,7 +11,6 @@ import struct
|
||||||
import dill
|
import dill
|
||||||
|
|
||||||
from libc.string cimport memcpy, memset
|
from libc.string cimport memcpy, memset
|
||||||
from libc.stdint cimport uint32_t
|
|
||||||
from libc.math cimport sqrt
|
from libc.math cimport sqrt
|
||||||
|
|
||||||
from .span cimport Span
|
from .span cimport Span
|
||||||
|
@ -21,6 +20,7 @@ from .token cimport Token
|
||||||
from .printers import parse_tree
|
from .printers import parse_tree
|
||||||
from ..lexeme cimport Lexeme, EMPTY_LEXEME
|
from ..lexeme cimport Lexeme, EMPTY_LEXEME
|
||||||
from ..typedefs cimport attr_t, flags_t
|
from ..typedefs cimport attr_t, flags_t
|
||||||
|
from ..attrs import intify_attrs
|
||||||
from ..attrs cimport attr_id_t
|
from ..attrs cimport attr_id_t
|
||||||
from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
|
from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
|
||||||
from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE
|
from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE
|
||||||
|
@ -494,8 +494,8 @@ cdef class Doc:
|
||||||
cdef np.ndarray[attr_t, ndim=2] output
|
cdef np.ndarray[attr_t, ndim=2] output
|
||||||
# Make an array from the attributes --- otherwise our inner loop is Python
|
# Make an array from the attributes --- otherwise our inner loop is Python
|
||||||
# dict iteration.
|
# dict iteration.
|
||||||
cdef np.ndarray[attr_t, ndim=1] attr_ids = numpy.asarray(py_attr_ids, dtype=numpy.int32)
|
cdef np.ndarray[attr_t, ndim=1] attr_ids = numpy.asarray(py_attr_ids, dtype=numpy.uint64)
|
||||||
output = numpy.ndarray(shape=(self.length, len(attr_ids)), dtype=numpy.int32)
|
output = numpy.ndarray(shape=(self.length, len(attr_ids)), dtype=numpy.uint64)
|
||||||
for i in range(self.length):
|
for i in range(self.length):
|
||||||
for j, feature in enumerate(attr_ids):
|
for j, feature in enumerate(attr_ids):
|
||||||
output[i, j] = get_token_attr(&self.c[i], feature)
|
output[i, j] = get_token_attr(&self.c[i], feature)
|
||||||
|
@ -640,7 +640,7 @@ cdef class Doc:
|
||||||
"""
|
"""
|
||||||
if self.length != 0:
|
if self.length != 0:
|
||||||
raise ValueError("Cannot load into non-empty Doc")
|
raise ValueError("Cannot load into non-empty Doc")
|
||||||
cdef int[:, :] attrs
|
cdef attr_t[:, :] attrs
|
||||||
cdef int i, start, end, has_space
|
cdef int i, start, end, has_space
|
||||||
fields = dill.loads(data)
|
fields = dill.loads(data)
|
||||||
text, attrs = fields[:2]
|
text, attrs = fields[:2]
|
||||||
|
@ -679,17 +679,15 @@ cdef class Doc:
|
||||||
if len(args) == 3:
|
if len(args) == 3:
|
||||||
# TODO: Warn deprecation
|
# TODO: Warn deprecation
|
||||||
tag, lemma, ent_type = args
|
tag, lemma, ent_type = args
|
||||||
attributes[TAG] = self.vocab.strings[tag]
|
attributes[TAG] = tag
|
||||||
attributes[LEMMA] = self.vocab.strings[lemma]
|
attributes[LEMMA] = lemma
|
||||||
attributes[ENT_TYPE] = self.vocab.strings[ent_type]
|
attributes[ENT_TYPE] = ent_type
|
||||||
elif not args:
|
elif not args:
|
||||||
# TODO: This code makes little sense overall. We're still
|
|
||||||
# ignoring most of the attributes?
|
|
||||||
if "label" in attributes and 'ent_type' not in attributes:
|
if "label" in attributes and 'ent_type' not in attributes:
|
||||||
if type(attributes["label"]) == int:
|
if type(attributes["label"]) == int:
|
||||||
attributes[ENT_TYPE] = attributes["label"]
|
attributes[ENT_TYPE] = attributes["label"]
|
||||||
else:
|
else:
|
||||||
attributes[ENT_TYPE] = self.vocab.strings[attributes["label"]]
|
attributes[ENT_TYPE] = self.vocab.strings.add(attributes["label"])
|
||||||
if 'ent_type' in attributes:
|
if 'ent_type' in attributes:
|
||||||
attributes[ENT_TYPE] = attributes['ent_type']
|
attributes[ENT_TYPE] = attributes['ent_type']
|
||||||
elif args:
|
elif args:
|
||||||
|
@ -699,6 +697,12 @@ cdef class Doc:
|
||||||
"Arguments supplied:\n%s\n"
|
"Arguments supplied:\n%s\n"
|
||||||
"Keyword arguments:%s\n" % (len(args), repr(args), repr(attributes)))
|
"Keyword arguments:%s\n" % (len(args), repr(args), repr(attributes)))
|
||||||
|
|
||||||
|
# More deprecated attribute handling =/
|
||||||
|
if 'label' in attributes:
|
||||||
|
attributes['ent_type'] = attributes.pop('label')
|
||||||
|
|
||||||
|
attributes = intify_attrs(attributes, strings_map=self.vocab.strings)
|
||||||
|
|
||||||
cdef int start = token_by_start(self.c, self.length, start_idx)
|
cdef int start = token_by_start(self.c, self.length, start_idx)
|
||||||
if start == -1:
|
if start == -1:
|
||||||
return None
|
return None
|
||||||
|
@ -708,13 +712,6 @@ cdef class Doc:
|
||||||
# Currently we have the token index, we want the range-end index
|
# Currently we have the token index, we want the range-end index
|
||||||
end += 1
|
end += 1
|
||||||
cdef Span span = self[start:end]
|
cdef Span span = self[start:end]
|
||||||
tag = self.vocab.strings[attributes.get(TAG, span.root.tag)]
|
|
||||||
lemma = self.vocab.strings[attributes.get(LEMMA, span.root.lemma)]
|
|
||||||
ent_type = self.vocab.strings[attributes.get(ENT_TYPE, span.root.ent_type)]
|
|
||||||
ent_id = attributes.get('ent_id', span.root.ent_id)
|
|
||||||
if isinstance(ent_id, basestring):
|
|
||||||
ent_id = self.vocab.strings[ent_id]
|
|
||||||
|
|
||||||
# Get LexemeC for newly merged token
|
# Get LexemeC for newly merged token
|
||||||
new_orth = ''.join([t.text_with_ws for t in span])
|
new_orth = ''.join([t.text_with_ws for t in span])
|
||||||
if span[-1].whitespace_:
|
if span[-1].whitespace_:
|
||||||
|
@ -723,18 +720,11 @@ cdef class Doc:
|
||||||
# House the new merged token where it starts
|
# House the new merged token where it starts
|
||||||
cdef TokenC* token = &self.c[start]
|
cdef TokenC* token = &self.c[start]
|
||||||
token.spacy = self.c[end-1].spacy
|
token.spacy = self.c[end-1].spacy
|
||||||
if tag in self.vocab.morphology.tag_map:
|
for attr_name, attr_value in attributes.items():
|
||||||
self.vocab.morphology.assign_tag(token, tag)
|
if attr_name == TAG:
|
||||||
|
self.vocab.morphology.assign_tag(token, attr_value)
|
||||||
else:
|
else:
|
||||||
token.tag = self.vocab.strings[tag]
|
Token.set_struct_attr(token, attr_name, attr_value)
|
||||||
token.lemma = self.vocab.strings[lemma]
|
|
||||||
if ent_type == 'O':
|
|
||||||
token.ent_iob = 2
|
|
||||||
token.ent_type = 0
|
|
||||||
else:
|
|
||||||
token.ent_iob = 3
|
|
||||||
token.ent_type = self.vocab.strings[ent_type]
|
|
||||||
token.ent_id = ent_id
|
|
||||||
# Begin by setting all the head indices to absolute token positions
|
# Begin by setting all the head indices to absolute token positions
|
||||||
# This is easier to work with for now than the offsets
|
# This is easier to work with for now than the offsets
|
||||||
# Before thinking of something simpler, beware the case where a dependency
|
# Before thinking of something simpler, beware the case where a dependency
|
||||||
|
|
|
@ -21,14 +21,14 @@ from .. import about
|
||||||
|
|
||||||
cdef class Span:
|
cdef class Span:
|
||||||
"""A slice from a Doc object."""
|
"""A slice from a Doc object."""
|
||||||
def __cinit__(self, Doc doc, int start, int end, int label=0, vector=None,
|
def __cinit__(self, Doc doc, int start, int end, attr_t label=0, vector=None,
|
||||||
vector_norm=None):
|
vector_norm=None):
|
||||||
"""Create a `Span` object from the slice `doc[start : end]`.
|
"""Create a `Span` object from the slice `doc[start : end]`.
|
||||||
|
|
||||||
doc (Doc): The parent document.
|
doc (Doc): The parent document.
|
||||||
start (int): The index of the first token of the span.
|
start (int): The index of the first token of the span.
|
||||||
end (int): The index of the first token after the span.
|
end (int): The index of the first token after the span.
|
||||||
label (int): A label to attach to the Span, e.g. for named entities.
|
label (uint64): A label to attach to the Span, e.g. for named entities.
|
||||||
vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span.
|
vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span.
|
||||||
RETURNS (Span): The newly constructed object.
|
RETURNS (Span): The newly constructed object.
|
||||||
"""
|
"""
|
||||||
|
@ -377,7 +377,7 @@ cdef class Span:
|
||||||
property ent_id:
|
property ent_id:
|
||||||
"""An (integer) entity ID. Usually assigned by patterns in the `Matcher`.
|
"""An (integer) entity ID. Usually assigned by patterns in the `Matcher`.
|
||||||
|
|
||||||
RETURNS (int): The entity ID.
|
RETURNS (uint64): The entity ID.
|
||||||
"""
|
"""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.root.ent_id
|
return self.root.ent_id
|
||||||
|
|
|
@ -202,11 +202,11 @@ cdef class Token:
|
||||||
property lemma:
|
property lemma:
|
||||||
"""Base form of the word, with no inflectional suffixes.
|
"""Base form of the word, with no inflectional suffixes.
|
||||||
|
|
||||||
RETURNS (int): Token lemma.
|
RETURNS (uint64): Token lemma.
|
||||||
"""
|
"""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.c.lemma
|
return self.c.lemma
|
||||||
def __set__(self, int lemma):
|
def __set__(self, attr_t lemma):
|
||||||
self.c.lemma = lemma
|
self.c.lemma = lemma
|
||||||
|
|
||||||
property pos:
|
property pos:
|
||||||
|
@ -216,13 +216,13 @@ cdef class Token:
|
||||||
property tag:
|
property tag:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.c.tag
|
return self.c.tag
|
||||||
def __set__(self, int tag):
|
def __set__(self, attr_t tag):
|
||||||
self.vocab.morphology.assign_tag(self.c, tag)
|
self.vocab.morphology.assign_tag(self.c, tag)
|
||||||
|
|
||||||
property dep:
|
property dep:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.c.dep
|
return self.c.dep
|
||||||
def __set__(self, int label):
|
def __set__(self, attr_t label):
|
||||||
self.c.dep = label
|
self.c.dep = label
|
||||||
|
|
||||||
property has_vector:
|
property has_vector:
|
||||||
|
@ -234,12 +234,7 @@ cdef class Token:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
if 'has_vector' in self.doc.user_token_hooks:
|
if 'has_vector' in self.doc.user_token_hooks:
|
||||||
return self.doc.user_token_hooks['has_vector'](self)
|
return self.doc.user_token_hooks['has_vector'](self)
|
||||||
cdef int i
|
return self.vocab.has_vector(self.lex.c.orth)
|
||||||
for i in range(self.vocab.vectors_length):
|
|
||||||
if self.c.lex.vector[i] != 0:
|
|
||||||
return True
|
|
||||||
else:
|
|
||||||
return False
|
|
||||||
|
|
||||||
property vector:
|
property vector:
|
||||||
"""A real-valued meaning representation.
|
"""A real-valued meaning representation.
|
||||||
|
@ -250,16 +245,7 @@ cdef class Token:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
if 'vector' in self.doc.user_token_hooks:
|
if 'vector' in self.doc.user_token_hooks:
|
||||||
return self.doc.user_token_hooks['vector'](self)
|
return self.doc.user_token_hooks['vector'](self)
|
||||||
cdef int length = self.vocab.vectors_length
|
return self.vocab.get_vector(self.c.lex.orth)
|
||||||
if length == 0:
|
|
||||||
raise ValueError(
|
|
||||||
"Word vectors set to length 0. This may be because you "
|
|
||||||
"don't have a model installed or loaded, or because your "
|
|
||||||
"model doesn't include word vectors. For more info, see "
|
|
||||||
"the documentation: \n%s\n" % about.__docs_models__
|
|
||||||
)
|
|
||||||
vector_view = <float[:length,]>self.c.lex.vector
|
|
||||||
return numpy.asarray(vector_view)
|
|
||||||
|
|
||||||
property vector_norm:
|
property vector_norm:
|
||||||
"""The L2 norm of the token's vector representation.
|
"""The L2 norm of the token's vector representation.
|
||||||
|
@ -269,7 +255,8 @@ cdef class Token:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
if 'vector_norm' in self.doc.user_token_hooks:
|
if 'vector_norm' in self.doc.user_token_hooks:
|
||||||
return self.doc.user_token_hooks['vector_norm'](self)
|
return self.doc.user_token_hooks['vector_norm'](self)
|
||||||
return self.c.lex.l2_norm
|
vector = self.vector
|
||||||
|
return numpy.sqrt((vector ** 2).sum())
|
||||||
|
|
||||||
property n_lefts:
|
property n_lefts:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
|
@ -516,16 +503,18 @@ cdef class Token:
|
||||||
property ent_type:
|
property ent_type:
|
||||||
"""Named entity type.
|
"""Named entity type.
|
||||||
|
|
||||||
RETURNS (int): Named entity type.
|
RETURNS (uint64): Named entity type.
|
||||||
"""
|
"""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.c.ent_type
|
return self.c.ent_type
|
||||||
|
def __set__(self, ent_type):
|
||||||
|
self.c.ent_type = ent_type
|
||||||
|
|
||||||
property ent_iob:
|
property ent_iob:
|
||||||
"""IOB code of named entity tag. `1="I", 2="O", 3="B"`. 0 means no tag
|
"""IOB code of named entity tag. `1="I", 2="O", 3="B"`. 0 means no tag
|
||||||
is assigned.
|
is assigned.
|
||||||
|
|
||||||
RETURNS (int): IOB code of named entity tag.
|
RETURNS (uint64): IOB code of named entity tag.
|
||||||
"""
|
"""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.c.ent_iob
|
return self.c.ent_iob
|
||||||
|
@ -537,6 +526,8 @@ cdef class Token:
|
||||||
"""
|
"""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.vocab.strings[self.c.ent_type]
|
return self.vocab.strings[self.c.ent_type]
|
||||||
|
def __set__(self, ent_type):
|
||||||
|
self.c.ent_type = self.vocab.strings.add(ent_type)
|
||||||
|
|
||||||
property ent_iob_:
|
property ent_iob_:
|
||||||
"""IOB code of named entity tag. "B" means the token begins an entity,
|
"""IOB code of named entity tag. "B" means the token begins an entity,
|
||||||
|
@ -553,7 +544,7 @@ cdef class Token:
|
||||||
"""ID of the entity the token is an instance of, if any. Usually
|
"""ID of the entity the token is an instance of, if any. Usually
|
||||||
assigned by patterns in the Matcher.
|
assigned by patterns in the Matcher.
|
||||||
|
|
||||||
RETURNS (int): ID of the entity.
|
RETURNS (uint64): ID of the entity.
|
||||||
"""
|
"""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.c.ent_id
|
return self.c.ent_id
|
||||||
|
@ -571,7 +562,7 @@ cdef class Token:
|
||||||
return self.vocab.strings[self.c.ent_id]
|
return self.vocab.strings[self.c.ent_id]
|
||||||
|
|
||||||
def __set__(self, name):
|
def __set__(self, name):
|
||||||
self.c.ent_id = self.vocab.strings[name]
|
self.c.ent_id = self.vocab.strings.add(name)
|
||||||
|
|
||||||
property whitespace_:
|
property whitespace_:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
|
@ -613,7 +604,7 @@ cdef class Token:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.vocab.strings[self.c.lemma]
|
return self.vocab.strings[self.c.lemma]
|
||||||
def __set__(self, unicode lemma_):
|
def __set__(self, unicode lemma_):
|
||||||
self.c.lemma = self.vocab.strings[lemma_]
|
self.c.lemma = self.vocab.strings.add(lemma_)
|
||||||
|
|
||||||
property pos_:
|
property pos_:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
|
@ -623,13 +614,13 @@ cdef class Token:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.vocab.strings[self.c.tag]
|
return self.vocab.strings[self.c.tag]
|
||||||
def __set__(self, tag):
|
def __set__(self, tag):
|
||||||
self.tag = self.vocab.strings[tag]
|
self.tag = self.vocab.strings.add(tag)
|
||||||
|
|
||||||
property dep_:
|
property dep_:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.vocab.strings[self.c.dep]
|
return self.vocab.strings[self.c.dep]
|
||||||
def __set__(self, unicode label):
|
def __set__(self, unicode label):
|
||||||
self.c.dep = self.vocab.strings[label]
|
self.c.dep = self.vocab.strings.add(label)
|
||||||
|
|
||||||
property is_oov:
|
property is_oov:
|
||||||
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_OOV)
|
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_OOV)
|
||||||
|
|
|
@ -4,7 +4,7 @@ from libc.stdint cimport uint8_t
|
||||||
|
|
||||||
ctypedef uint64_t hash_t
|
ctypedef uint64_t hash_t
|
||||||
ctypedef char* utf8_t
|
ctypedef char* utf8_t
|
||||||
ctypedef int32_t attr_t
|
ctypedef uint64_t attr_t
|
||||||
ctypedef uint64_t flags_t
|
ctypedef uint64_t flags_t
|
||||||
ctypedef uint16_t len_t
|
ctypedef uint16_t len_t
|
||||||
ctypedef uint16_t tag_t
|
ctypedef uint16_t tag_t
|
||||||
|
|
133
spacy/util.py
|
@ -78,27 +78,86 @@ def ensure_path(path):
|
||||||
return path
|
return path
|
||||||
|
|
||||||
|
|
||||||
def resolve_model_path(name):
|
def load_model(name):
|
||||||
"""Resolve a model name or string to a model path.
|
"""Load a model from a shortcut link, package or data path.
|
||||||
|
|
||||||
name (unicode): Package name, shortcut link or model path.
|
name (unicode): Package name, shortcut link or model path.
|
||||||
RETURNS (Path): Path to model data directory.
|
RETURNS (Language): `Language` class with the loaded model.
|
||||||
"""
|
"""
|
||||||
data_path = get_data_path()
|
data_path = get_data_path()
|
||||||
if not data_path or not data_path.exists():
|
if not data_path or not data_path.exists():
|
||||||
raise IOError("Can't find spaCy data path: %s" % path2str(data_path))
|
raise IOError("Can't find spaCy data path: %s" % path2str(data_path))
|
||||||
if isinstance(name, basestring_):
|
if isinstance(name, basestring_):
|
||||||
if (data_path / name).exists(): # in data dir or shortcut link
|
if (data_path / name).exists(): # in data dir or shortcut
|
||||||
return (data_path / name)
|
return load_model_from_path(data_path / name)
|
||||||
if is_package(name): # installed as a package
|
if is_package(name): # installed as package
|
||||||
return get_model_package_path(name)
|
return load_model_from_pkg(name)
|
||||||
if Path(name).exists(): # path to model
|
if Path(name).exists(): # path to model data directory
|
||||||
return Path(name)
|
return load_data_from_path(Path(name))
|
||||||
elif hasattr(name, 'exists'): # Path or Path-like object
|
elif hasattr(name, 'exists'): # Path or Path-like to model data
|
||||||
return name
|
return load_data_from_path(name)
|
||||||
raise IOError("Can't find model '%s'" % name)
|
raise IOError("Can't find model '%s'" % name)
|
||||||
|
|
||||||
|
|
||||||
|
def load_model_from_init_py(init_file):
|
||||||
|
"""Helper function to use in the `load()` method of a model package's
|
||||||
|
__init__.py.
|
||||||
|
|
||||||
|
init_file (unicode): Path to model's __init__.py, i.e. `__file__`.
|
||||||
|
RETURNS (Language): `Language` class with loaded model.
|
||||||
|
"""
|
||||||
|
model_path = Path(init_file).parent
|
||||||
|
return load_data_from_path(model_path, package=True)
|
||||||
|
|
||||||
|
|
||||||
|
def load_model_from_path(model_path):
|
||||||
|
"""Import and load a model package from its file path.
|
||||||
|
|
||||||
|
path (unicode or Path): Path to package directory.
|
||||||
|
RETURNS (Language): `Language` class with loaded model.
|
||||||
|
"""
|
||||||
|
model_path = ensure_path(model_path)
|
||||||
|
spec = importlib.util.spec_from_file_location('model', model_path)
|
||||||
|
module = importlib.util.module_from_spec(spec)
|
||||||
|
spec.loader.exec_module(module)
|
||||||
|
return module.load()
|
||||||
|
|
||||||
|
|
||||||
|
def load_model_from_pkg(name):
|
||||||
|
"""Import and load a model package.
|
||||||
|
|
||||||
|
name (unicode): Name of model package installed via pip.
|
||||||
|
RETURNS (Language): `Language` class with loaded model.
|
||||||
|
"""
|
||||||
|
module = importlib.import_module(name)
|
||||||
|
return module.load()
|
||||||
|
|
||||||
|
|
||||||
|
def load_data_from_path(model_path, package=False):
|
||||||
|
"""Initialie a `Language` class with a loaded model from a model data path.
|
||||||
|
|
||||||
|
model_path (unicode or Path): Path to model data directory.
|
||||||
|
package (bool): Does the path point to the parent package directory?
|
||||||
|
RETURNS (Language): `Language` class with loaded model.
|
||||||
|
"""
|
||||||
|
model_path = ensure_path(model_path)
|
||||||
|
meta_path = model_path / 'meta.json'
|
||||||
|
if not meta_path.is_file():
|
||||||
|
raise IOError("Could not read meta.json from %s" % location)
|
||||||
|
meta = read_json(location)
|
||||||
|
for setting in ['lang', 'name', 'version']:
|
||||||
|
if setting not in meta:
|
||||||
|
raise IOError('No %s setting found in model meta.json' % setting)
|
||||||
|
if package:
|
||||||
|
model_data_path = '%s_%s-%s' % (meta['lang'], meta['name'], meta['version'])
|
||||||
|
model_path = model_path / model_data_path
|
||||||
|
if not model_path.exists():
|
||||||
|
raise ValueError("Can't find model directory: %s" % path2str(model_path))
|
||||||
|
cls = get_lang_class(meta['lang'])
|
||||||
|
nlp = cls(pipeline=meta.get('pipeline', True))
|
||||||
|
return nlp.from_disk(model_path)
|
||||||
|
|
||||||
|
|
||||||
def is_package(name):
|
def is_package(name):
|
||||||
"""Check if string maps to a package installed via pip.
|
"""Check if string maps to a package installed via pip.
|
||||||
|
|
||||||
|
@ -112,36 +171,16 @@ def is_package(name):
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
def get_model_package_path(package_name):
|
def get_package_path(name):
|
||||||
"""Get path to a model package installed via pip.
|
"""Get the path to an installed package.
|
||||||
|
|
||||||
package_name (unicode): Name of installed package.
|
name (unicode): Package name.
|
||||||
RETURNS (Path): Path to model data directory.
|
RETURNS (Path): Path to installed package.
|
||||||
"""
|
"""
|
||||||
# Here we're importing the module just to find it. This is worryingly
|
# Here we're importing the module just to find it. This is worryingly
|
||||||
# indirect, but it's otherwise very difficult to find the package.
|
# indirect, but it's otherwise very difficult to find the package.
|
||||||
# Python's installation and import rules are very complicated.
|
|
||||||
pkg = importlib.import_module(package_name)
|
pkg = importlib.import_module(package_name)
|
||||||
package_path = Path(pkg.__file__).parent.parent
|
return Path(pkg.__file__).parent
|
||||||
meta = parse_package_meta(package_path / package_name)
|
|
||||||
model_name = '%s-%s' % (package_name, meta['version'])
|
|
||||||
return package_path / package_name / model_name
|
|
||||||
|
|
||||||
|
|
||||||
def parse_package_meta(package_path, require=True):
|
|
||||||
"""Check if a meta.json exists in a package and return its contents.
|
|
||||||
|
|
||||||
package_path (Path): Path to model package directory.
|
|
||||||
require (bool): If True, raise error if no meta.json is found.
|
|
||||||
RETURNS (dict or None): Model meta.json data or None.
|
|
||||||
"""
|
|
||||||
location = package_path / 'meta.json'
|
|
||||||
if location.is_file():
|
|
||||||
return read_json(location)
|
|
||||||
elif require:
|
|
||||||
raise IOError("Could not read meta.json from %s" % location)
|
|
||||||
else:
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
def is_in_jupyter():
|
def is_in_jupyter():
|
||||||
|
@ -177,10 +216,13 @@ def get_async(stream, numpy_array):
|
||||||
|
|
||||||
def itershuffle(iterable, bufsize=1000):
|
def itershuffle(iterable, bufsize=1000):
|
||||||
"""Shuffle an iterator. This works by holding `bufsize` items back
|
"""Shuffle an iterator. This works by holding `bufsize` items back
|
||||||
and yielding them sometime later. Obviously, this is not unbiased --
|
and yielding them sometime later. Obviously, this is not unbiased –
|
||||||
but should be good enough for batching. Larger bufsize means less bias.
|
but should be good enough for batching. Larger bufsize means less bias.
|
||||||
|
|
||||||
From https://gist.github.com/andres-erbsen/1307752
|
From https://gist.github.com/andres-erbsen/1307752
|
||||||
|
|
||||||
|
iterable (iterable): Iterator to shuffle.
|
||||||
|
bufsize (int): Items to hold back.
|
||||||
|
YIELDS (iterable): The shuffled iterator.
|
||||||
"""
|
"""
|
||||||
iterable = iter(iterable)
|
iterable = iter(iterable)
|
||||||
buf = []
|
buf = []
|
||||||
|
@ -315,17 +357,16 @@ def normalize_slice(length, start, stop, step=None):
|
||||||
|
|
||||||
|
|
||||||
def compounding(start, stop, compound):
|
def compounding(start, stop, compound):
|
||||||
'''Yield an infinite series of compounding values. Each time the
|
"""Yield an infinite series of compounding values. Each time the
|
||||||
generator is called, a value is produced by multiplying the previous
|
generator is called, a value is produced by multiplying the previous
|
||||||
value by the compound rate.
|
value by the compound rate.
|
||||||
|
|
||||||
EXAMPLE
|
EXAMPLE:
|
||||||
|
|
||||||
>>> sizes = compounding(1., 10., 1.5)
|
>>> sizes = compounding(1., 10., 1.5)
|
||||||
>>> assert next(sizes) == 1.
|
>>> assert next(sizes) == 1.
|
||||||
>>> assert next(sizes) == 1 * 1.5
|
>>> assert next(sizes) == 1 * 1.5
|
||||||
>>> assert next(sizes) == 1.5 * 1.5
|
>>> assert next(sizes) == 1.5 * 1.5
|
||||||
'''
|
"""
|
||||||
def clip(value):
|
def clip(value):
|
||||||
return max(value, stop) if (start>stop) else min(value, stop)
|
return max(value, stop) if (start>stop) else min(value, stop)
|
||||||
curr = float(start)
|
curr = float(start)
|
||||||
|
@ -335,7 +376,7 @@ def compounding(start, stop, compound):
|
||||||
|
|
||||||
|
|
||||||
def decaying(start, stop, decay):
|
def decaying(start, stop, decay):
|
||||||
'''Yield an infinite series of linearly decaying values.'''
|
"""Yield an infinite series of linearly decaying values."""
|
||||||
def clip(value):
|
def clip(value):
|
||||||
return max(value, stop) if (start>stop) else min(value, stop)
|
return max(value, stop) if (start>stop) else min(value, stop)
|
||||||
nr_upd = 1.
|
nr_upd = 1.
|
||||||
|
@ -344,12 +385,6 @@ def decaying(start, stop, decay):
|
||||||
nr_upd += 1
|
nr_upd += 1
|
||||||
|
|
||||||
|
|
||||||
def check_renamed_kwargs(renamed, kwargs):
|
|
||||||
for old, new in renamed.items():
|
|
||||||
if old in kwargs:
|
|
||||||
raise TypeError("Keyword argument %s now renamed to %s" % (old, new))
|
|
||||||
|
|
||||||
|
|
||||||
def read_json(location):
|
def read_json(location):
|
||||||
"""Open and load JSON from file.
|
"""Open and load JSON from file.
|
||||||
|
|
||||||
|
|
232
spacy/vocab.pyx
|
@ -26,15 +26,6 @@ from . import attrs
|
||||||
from . import symbols
|
from . import symbols
|
||||||
|
|
||||||
|
|
||||||
DEF MAX_VEC_SIZE = 100000
|
|
||||||
|
|
||||||
|
|
||||||
cdef float[MAX_VEC_SIZE] EMPTY_VEC
|
|
||||||
memset(EMPTY_VEC, 0, sizeof(EMPTY_VEC))
|
|
||||||
memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
|
|
||||||
EMPTY_LEXEME.vector = EMPTY_VEC
|
|
||||||
|
|
||||||
|
|
||||||
cdef class Vocab:
|
cdef class Vocab:
|
||||||
"""A look-up table that allows you to access `Lexeme` objects. The `Vocab`
|
"""A look-up table that allows you to access `Lexeme` objects. The `Vocab`
|
||||||
instance also provides access to the `StringStore`, and owns underlying
|
instance also provides access to the `StringStore`, and owns underlying
|
||||||
|
@ -53,8 +44,6 @@ cdef class Vocab:
|
||||||
vice versa.
|
vice versa.
|
||||||
RETURNS (Vocab): The newly constructed vocab object.
|
RETURNS (Vocab): The newly constructed vocab object.
|
||||||
"""
|
"""
|
||||||
util.check_renamed_kwargs({'get_lex_attr': 'lex_attr_getters'}, deprecated_kwargs)
|
|
||||||
|
|
||||||
lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {}
|
lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {}
|
||||||
tag_map = tag_map if tag_map is not None else {}
|
tag_map = tag_map if tag_map is not None else {}
|
||||||
if lemmatizer in (None, True, False):
|
if lemmatizer in (None, True, False):
|
||||||
|
@ -66,7 +55,7 @@ cdef class Vocab:
|
||||||
self.strings = StringStore()
|
self.strings = StringStore()
|
||||||
if strings:
|
if strings:
|
||||||
for string in strings:
|
for string in strings:
|
||||||
self.strings[string]
|
self.strings.add(string)
|
||||||
# Load strings in a special order, so that we have an onset number for
|
# Load strings in a special order, so that we have an onset number for
|
||||||
# the vocabulary. This way, when words are added in order, the orth ID
|
# the vocabulary. This way, when words are added in order, the orth ID
|
||||||
# is the frequency rank of the word, plus a certain offset. The structural
|
# is the frequency rank of the word, plus a certain offset. The structural
|
||||||
|
@ -77,7 +66,7 @@ cdef class Vocab:
|
||||||
# Need to rethink this.
|
# Need to rethink this.
|
||||||
for name in symbols.NAMES + list(sorted(tag_map.keys())):
|
for name in symbols.NAMES + list(sorted(tag_map.keys())):
|
||||||
if name:
|
if name:
|
||||||
_ = self.strings[name]
|
self.strings.add(name)
|
||||||
self.lex_attr_getters = lex_attr_getters
|
self.lex_attr_getters = lex_attr_getters
|
||||||
self.morphology = Morphology(self.strings, tag_map, lemmatizer)
|
self.morphology = Morphology(self.strings, tag_map, lemmatizer)
|
||||||
|
|
||||||
|
@ -176,15 +165,14 @@ cdef class Vocab:
|
||||||
mem = self.mem
|
mem = self.mem
|
||||||
cdef bint is_oov = mem is not self.mem
|
cdef bint is_oov = mem is not self.mem
|
||||||
lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1)
|
lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1)
|
||||||
lex.orth = self.strings[string]
|
lex.orth = self.strings.add(string)
|
||||||
lex.length = len(string)
|
lex.length = len(string)
|
||||||
lex.id = self.length
|
lex.id = self.length
|
||||||
lex.vector = <float*>mem.alloc(self.vectors_length, sizeof(float))
|
|
||||||
if self.lex_attr_getters is not None:
|
if self.lex_attr_getters is not None:
|
||||||
for attr, func in self.lex_attr_getters.items():
|
for attr, func in self.lex_attr_getters.items():
|
||||||
value = func(string)
|
value = func(string)
|
||||||
if isinstance(value, unicode):
|
if isinstance(value, unicode):
|
||||||
value = self.strings[value]
|
value = self.strings.add(value)
|
||||||
if attr == PROB:
|
if attr == PROB:
|
||||||
lex.prob = value
|
lex.prob = value
|
||||||
elif value is not None:
|
elif value is not None:
|
||||||
|
@ -239,7 +227,7 @@ cdef class Vocab:
|
||||||
"""
|
"""
|
||||||
cdef attr_t orth
|
cdef attr_t orth
|
||||||
if type(id_or_string) == unicode:
|
if type(id_or_string) == unicode:
|
||||||
orth = self.strings[id_or_string]
|
orth = self.strings.add(id_or_string)
|
||||||
else:
|
else:
|
||||||
orth = id_or_string
|
orth = id_or_string
|
||||||
return Lexeme(self, orth)
|
return Lexeme(self, orth)
|
||||||
|
@ -258,6 +246,26 @@ cdef class Vocab:
|
||||||
Token.set_struct_attr(token, attr_id, value)
|
Token.set_struct_attr(token, attr_id, value)
|
||||||
return tokens
|
return tokens
|
||||||
|
|
||||||
|
def get_vector(self, orth):
|
||||||
|
"""Retrieve a vector for a word in the vocabulary.
|
||||||
|
|
||||||
|
Words can be looked up by string or int ID.
|
||||||
|
|
||||||
|
RETURNS:
|
||||||
|
A word vector. Size and shape determed by the
|
||||||
|
vocab.vectors instance. Usually, a numpy ndarray
|
||||||
|
of shape (300,) and dtype float32.
|
||||||
|
|
||||||
|
RAISES: If no vectors data is loaded, ValueError is raised.
|
||||||
|
"""
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
def has_vector(self, orth):
|
||||||
|
"""Check whether a word has a vector. Returns False if no
|
||||||
|
vectors have been loaded. Words can be looked up by string
|
||||||
|
or int ID."""
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
def to_disk(self, path):
|
def to_disk(self, path):
|
||||||
"""Save the current state to a directory.
|
"""Save the current state to a directory.
|
||||||
|
|
||||||
|
@ -271,9 +279,6 @@ cdef class Vocab:
|
||||||
with strings_loc.open('w', encoding='utf8') as file_:
|
with strings_loc.open('w', encoding='utf8') as file_:
|
||||||
self.strings.dump(file_)
|
self.strings.dump(file_)
|
||||||
|
|
||||||
# TODO: pickle
|
|
||||||
# self.dump(path / 'lexemes.bin')
|
|
||||||
|
|
||||||
def from_disk(self, path):
|
def from_disk(self, path):
|
||||||
"""Loads state from a directory. Modifies the object in place and
|
"""Loads state from a directory. Modifies the object in place and
|
||||||
returns it.
|
returns it.
|
||||||
|
@ -286,7 +291,7 @@ cdef class Vocab:
|
||||||
with (path / 'vocab' / 'strings.json').open('r', encoding='utf8') as file_:
|
with (path / 'vocab' / 'strings.json').open('r', encoding='utf8') as file_:
|
||||||
strings_list = ujson.load(file_)
|
strings_list = ujson.load(file_)
|
||||||
for string in strings_list:
|
for string in strings_list:
|
||||||
self.strings[string]
|
self.strings.add(string)
|
||||||
self.load_lexemes(path / 'lexemes.bin')
|
self.load_lexemes(path / 'lexemes.bin')
|
||||||
|
|
||||||
def to_bytes(self, **exclude):
|
def to_bytes(self, **exclude):
|
||||||
|
@ -346,7 +351,6 @@ cdef class Vocab:
|
||||||
lex_data.data[j] = bytes_ptr[i+j]
|
lex_data.data[j] = bytes_ptr[i+j]
|
||||||
Lexeme.c_from_bytes(lexeme, lex_data)
|
Lexeme.c_from_bytes(lexeme, lex_data)
|
||||||
|
|
||||||
lexeme.vector = EMPTY_VEC
|
|
||||||
py_str = self.strings[lexeme.orth]
|
py_str = self.strings[lexeme.orth]
|
||||||
assert self.strings[py_str] == lexeme.orth, (py_str, lexeme.orth)
|
assert self.strings[py_str] == lexeme.orth, (py_str, lexeme.orth)
|
||||||
key = hash_string(py_str)
|
key = hash_string(py_str)
|
||||||
|
@ -354,172 +358,6 @@ cdef class Vocab:
|
||||||
self._by_orth.set(lexeme.orth, lexeme)
|
self._by_orth.set(lexeme.orth, lexeme)
|
||||||
self.length += 1
|
self.length += 1
|
||||||
|
|
||||||
# Deprecated --- delete these once stable
|
|
||||||
|
|
||||||
def dump_vectors(self, out_loc):
|
|
||||||
"""Save the word vectors to a binary file.
|
|
||||||
|
|
||||||
loc (Path): The path to save to.
|
|
||||||
"""
|
|
||||||
cdef int32_t vec_len = self.vectors_length
|
|
||||||
cdef int32_t word_len
|
|
||||||
cdef bytes word_str
|
|
||||||
cdef char* chars
|
|
||||||
|
|
||||||
cdef Lexeme lexeme
|
|
||||||
cdef CFile out_file = CFile(out_loc, 'wb')
|
|
||||||
for lexeme in self:
|
|
||||||
word_str = lexeme.orth_.encode('utf8')
|
|
||||||
vec = lexeme.c.vector
|
|
||||||
word_len = len(word_str)
|
|
||||||
|
|
||||||
out_file.write_from(&word_len, 1, sizeof(word_len))
|
|
||||||
out_file.write_from(&vec_len, 1, sizeof(vec_len))
|
|
||||||
|
|
||||||
chars = <char*>word_str
|
|
||||||
out_file.write_from(chars, word_len, sizeof(char))
|
|
||||||
out_file.write_from(vec, vec_len, sizeof(float))
|
|
||||||
out_file.close()
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def load_vectors(self, file_):
|
|
||||||
"""Load vectors from a text-based file.
|
|
||||||
|
|
||||||
file_ (buffer): The file to read from. Entries should be separated by
|
|
||||||
newlines, and each entry should be whitespace delimited. The first value of the entry
|
|
||||||
should be the word string, and subsequent entries should be the values of the
|
|
||||||
vector.
|
|
||||||
|
|
||||||
RETURNS (int): The length of the vectors loaded.
|
|
||||||
"""
|
|
||||||
cdef LexemeC* lexeme
|
|
||||||
cdef attr_t orth
|
|
||||||
cdef int32_t vec_len = -1
|
|
||||||
cdef double norm = 0.0
|
|
||||||
|
|
||||||
whitespace_pattern = re.compile(r'\s', re.UNICODE)
|
|
||||||
|
|
||||||
for line_num, line in enumerate(file_):
|
|
||||||
pieces = line.split()
|
|
||||||
word_str = " " if whitespace_pattern.match(line) else pieces.pop(0)
|
|
||||||
if vec_len == -1:
|
|
||||||
vec_len = len(pieces)
|
|
||||||
elif vec_len != len(pieces):
|
|
||||||
raise VectorReadError.mismatched_sizes(file_, line_num,
|
|
||||||
vec_len, len(pieces))
|
|
||||||
orth = self.strings[word_str]
|
|
||||||
lexeme = <LexemeC*><void*>self.get_by_orth(self.mem, orth)
|
|
||||||
lexeme.vector = <float*>self.mem.alloc(vec_len, sizeof(float))
|
|
||||||
for i, val_str in enumerate(pieces):
|
|
||||||
lexeme.vector[i] = float(val_str)
|
|
||||||
norm = 0.0
|
|
||||||
for i in range(vec_len):
|
|
||||||
norm += lexeme.vector[i] * lexeme.vector[i]
|
|
||||||
lexeme.l2_norm = sqrt(norm)
|
|
||||||
self.vectors_length = vec_len
|
|
||||||
return vec_len
|
|
||||||
|
|
||||||
def load_vectors_from_bin_loc(self, loc):
|
|
||||||
"""Load vectors from the location of a binary file.
|
|
||||||
|
|
||||||
loc (unicode): The path of the binary file to load from.
|
|
||||||
|
|
||||||
RETURNS (int): The length of the vectors loaded.
|
|
||||||
"""
|
|
||||||
cdef CFile file_ = CFile(loc, b'rb')
|
|
||||||
cdef int32_t word_len
|
|
||||||
cdef int32_t vec_len = 0
|
|
||||||
cdef int32_t prev_vec_len = 0
|
|
||||||
cdef float* vec
|
|
||||||
cdef Address mem
|
|
||||||
cdef attr_t string_id
|
|
||||||
cdef bytes py_word
|
|
||||||
cdef vector[float*] vectors
|
|
||||||
cdef int line_num = 0
|
|
||||||
cdef Pool tmp_mem = Pool()
|
|
||||||
while True:
|
|
||||||
try:
|
|
||||||
file_.read_into(&word_len, sizeof(word_len), 1)
|
|
||||||
except IOError:
|
|
||||||
break
|
|
||||||
file_.read_into(&vec_len, sizeof(vec_len), 1)
|
|
||||||
if prev_vec_len != 0 and vec_len != prev_vec_len:
|
|
||||||
raise VectorReadError.mismatched_sizes(loc, line_num,
|
|
||||||
vec_len, prev_vec_len)
|
|
||||||
if 0 >= vec_len >= MAX_VEC_SIZE:
|
|
||||||
raise VectorReadError.bad_size(loc, vec_len)
|
|
||||||
|
|
||||||
chars = <char*>file_.alloc_read(tmp_mem, word_len, sizeof(char))
|
|
||||||
vec = <float*>file_.alloc_read(self.mem, vec_len, sizeof(float))
|
|
||||||
|
|
||||||
string_id = self.strings[chars[:word_len]]
|
|
||||||
# Insert words into vocab to add vector.
|
|
||||||
self.get_by_orth(self.mem, string_id)
|
|
||||||
while string_id >= vectors.size():
|
|
||||||
vectors.push_back(EMPTY_VEC)
|
|
||||||
assert vec != NULL
|
|
||||||
vectors[string_id] = vec
|
|
||||||
line_num += 1
|
|
||||||
cdef LexemeC* lex
|
|
||||||
cdef size_t lex_addr
|
|
||||||
cdef double norm = 0.0
|
|
||||||
cdef int i
|
|
||||||
for orth, lex_addr in self._by_orth.items():
|
|
||||||
lex = <LexemeC*>lex_addr
|
|
||||||
if lex.lower < vectors.size():
|
|
||||||
lex.vector = vectors[lex.lower]
|
|
||||||
norm = 0.0
|
|
||||||
for i in range(vec_len):
|
|
||||||
norm += lex.vector[i] * lex.vector[i]
|
|
||||||
lex.l2_norm = sqrt(norm)
|
|
||||||
else:
|
|
||||||
lex.vector = EMPTY_VEC
|
|
||||||
self.vectors_length = vec_len
|
|
||||||
return vec_len
|
|
||||||
|
|
||||||
|
|
||||||
def resize_vectors(self, int new_size):
|
|
||||||
"""Set vectors_length to a new size, and allocate more memory for the
|
|
||||||
`Lexeme` vectors if necessary. The memory will be zeroed.
|
|
||||||
|
|
||||||
new_size (int): The new size of the vectors.
|
|
||||||
"""
|
|
||||||
cdef hash_t key
|
|
||||||
cdef size_t addr
|
|
||||||
if new_size > self.vectors_length:
|
|
||||||
for key, addr in self._by_hash.items():
|
|
||||||
lex = <LexemeC*>addr
|
|
||||||
lex.vector = <float*>self.mem.realloc(lex.vector,
|
|
||||||
new_size * sizeof(lex.vector[0]))
|
|
||||||
self.vectors_length = new_size
|
|
||||||
|
|
||||||
|
|
||||||
def write_binary_vectors(in_loc, out_loc):
|
|
||||||
cdef CFile out_file = CFile(out_loc, 'wb')
|
|
||||||
cdef Address mem
|
|
||||||
cdef int32_t word_len
|
|
||||||
cdef int32_t vec_len
|
|
||||||
cdef char* chars
|
|
||||||
with bz2.BZ2File(in_loc, 'r') as file_:
|
|
||||||
for line in file_:
|
|
||||||
pieces = line.split()
|
|
||||||
word = pieces.pop(0)
|
|
||||||
mem = Address(len(pieces), sizeof(float))
|
|
||||||
vec = <float*>mem.ptr
|
|
||||||
for i, val_str in enumerate(pieces):
|
|
||||||
vec[i] = float(val_str)
|
|
||||||
|
|
||||||
word_len = len(word)
|
|
||||||
vec_len = len(pieces)
|
|
||||||
|
|
||||||
out_file.write_from(&word_len, 1, sizeof(word_len))
|
|
||||||
out_file.write_from(&vec_len, 1, sizeof(vec_len))
|
|
||||||
|
|
||||||
chars = <char*>word
|
|
||||||
out_file.write_from(chars, len(word), sizeof(char))
|
|
||||||
out_file.write_from(vec, vec_len, sizeof(float))
|
|
||||||
|
|
||||||
|
|
||||||
def pickle_vocab(vocab):
|
def pickle_vocab(vocab):
|
||||||
sstore = vocab.strings
|
sstore = vocab.strings
|
||||||
|
@ -567,21 +405,3 @@ class LookupError(Exception):
|
||||||
"ID of orth: {orth_id}".format(
|
"ID of orth: {orth_id}".format(
|
||||||
query=repr(original_string), orth_str=repr(id_string), orth_id=id_)
|
query=repr(original_string), orth_str=repr(id_string), orth_id=id_)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
class VectorReadError(Exception):
|
|
||||||
@classmethod
|
|
||||||
def mismatched_sizes(cls, loc, line_num, prev_size, curr_size):
|
|
||||||
return cls(
|
|
||||||
"Error reading word vectors from %s on line %d.\n"
|
|
||||||
"All vectors must be the same size.\n"
|
|
||||||
"Prev size: %d\n"
|
|
||||||
"Curr size: %d" % (loc, line_num, prev_size, curr_size))
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def bad_size(cls, loc, size):
|
|
||||||
return cls(
|
|
||||||
"Error reading word vectors from %s.\n"
|
|
||||||
"Vector size: %d\n"
|
|
||||||
"Max size: %d\n"
|
|
||||||
"Min size: 1\n" % (loc, size, MAX_VEC_SIZE))
|
|
||||||
|
|
|
@ -1,9 +1,9 @@
|
||||||
<svg class="o-svg" xmlns="http://www.w3.org/2000/svg" viewBox="-1 -1 907 737" width="906" height="746">
|
<svg class="o-svg" xmlns="http://www.w3.org/2000/svg" viewBox="-1 -1 907 737" width="906" height="746">
|
||||||
<style>
|
<style>
|
||||||
.svg__architecture__text-large { fill: #1a1e23; font: 20px "Source Sans Pro" }
|
.svg__architecture__text-large { fill: #1a1e23; font: 20px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
|
||||||
.svg__architecture__text-medium { fill: #1a1e23; font: 17px "Source Sans Pro" }
|
.svg__architecture__text-medium { fill: #1a1e23; font: 17px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
|
||||||
.svg__architecture__text-small { fill: #1a1e23; font: bold 14px "Source Sans Pro" }
|
.svg__architecture__text-small { fill: #1a1e23; font: bold 14px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
|
||||||
.svg__architecture__text-code { fill: #1a1e23; font: 600 12px "Source Code Pro" }
|
.svg__architecture__text-code { fill: #1a1e23; font: 600 12px "Source Code Pro", Monaco, "Courier New", monospace }
|
||||||
</style>
|
</style>
|
||||||
<ellipse cx="404" cy="203" fill="#dae8fc" stroke="#09a3d5" stroke-width="2" rx="74.8" ry="49.8"/>
|
<ellipse cx="404" cy="203" fill="#dae8fc" stroke="#09a3d5" stroke-width="2" rx="74.8" ry="49.8"/>
|
||||||
<text class="svg__architecture__text-large" transform="translate(362.5 206.5)" width="81" height="40">Language</text>
|
<text class="svg__architecture__text-large" transform="translate(362.5 206.5)" width="81" height="40">Language</text>
|
||||||
|
|
Before Width: | Height: | Size: 14 KiB After Width: | Height: | Size: 14 KiB |
|
@ -1,8 +1,8 @@
|
||||||
<svg class="o-svg" xmlns="http://www.w3.org/2000/svg" width="931" height="456" viewBox="-1 -1 932 480" preserveAspectRatio="xMinYMin meet">
|
<svg class="o-svg" xmlns="http://www.w3.org/2000/svg" width="931" height="456" viewBox="-1 -1 932 480" preserveAspectRatio="xMinYMin meet">
|
||||||
<style>
|
<style>
|
||||||
.svg__langdata__text-large { fill: #1a1e23; font: 20px "Source Sans Pro" }
|
.svg__langdata__text-large { fill: #1a1e23; font: 20px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
|
||||||
.svg__langdata__text-small { fill: #1a1e23; font: bold 16px "Source Sans Pro" }
|
.svg__langdata__text-small { fill: #1a1e23; font: bold 16px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
|
||||||
.svg__langdata__text-tiny { fill: #1a1e23; font: bold 16px "Source Sans Pro" }
|
.svg__langdata__text-tiny { fill: #1a1e23; font: bold 16px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
|
||||||
</style>
|
</style>
|
||||||
<path fill="none" stroke="#b85450" stroke-width="3" stroke-miterlimit="10" d="M610 404h-69.8" stroke-dasharray="1 6" stroke-linecap="round"/>
|
<path fill="none" stroke="#b85450" stroke-width="3" stroke-miterlimit="10" d="M610 404h-69.8" stroke-dasharray="1 6" stroke-linecap="round"/>
|
||||||
<path fill="#b85450" stroke="#b85450" stroke-width="2" stroke-miterlimit="10" d="M534.2 404l8-4-2 4 2 4z"/>
|
<path fill="#b85450" stroke="#b85450" stroke-width="2" stroke-miterlimit="10" d="M534.2 404l8-4-2 4 2 4z"/>
|
||||||
|
|
Before Width: | Height: | Size: 9.0 KiB After Width: | Height: | Size: 9.1 KiB |
|
@ -1,8 +1,8 @@
|
||||||
<svg class="o-svg" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 923 200" width="923" height="200">
|
<svg class="o-svg" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 923 200" width="923" height="200">
|
||||||
<style>
|
<style>
|
||||||
.svg__pipeline__text { fill: #1a1e23; font: 20px "Source Sans Pro" }
|
.svg__pipeline__text { fill: #1a1e23; font: 20px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
|
||||||
.svg__pipeline__text-small { fill: #1a1e23; font: bold 18px "Source Sans Pro" }
|
.svg__pipeline__text-small { fill: #1a1e23; font: bold 18px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
|
||||||
.svg__pipeline__text-code { fill: #1a1e23; font: 600 16px "Source Code Pro" }
|
.svg__pipeline__text-code { fill: #1a1e23; font: 600 16px "Source Code Pro", Monaco, "Courier New", monospace }
|
||||||
</style>
|
</style>
|
||||||
<rect width="601" height="127" x="159" y="21" fill="none" stroke="#09a3d5" stroke-width="3" rx="19.1" stroke-dasharray="3 6" ry="19.1"/>
|
<rect width="601" height="127" x="159" y="21" fill="none" stroke="#09a3d5" stroke-width="3" rx="19.1" stroke-dasharray="3 6" ry="19.1"/>
|
||||||
<path fill="#e1d5e7" stroke="#9673a6" stroke-width="2" d="M801 55h120v60H801z"/>
|
<path fill="#e1d5e7" stroke="#9673a6" stroke-width="2" d="M801 55h120v60H801z"/>
|
||||||
|
|
Before Width: | Height: | Size: 3.1 KiB After Width: | Height: | Size: 3.2 KiB |
123
website/assets/img/docs/tokenization.svg
Normal file
|
@ -0,0 +1,123 @@
|
||||||
|
<svg class="o-svg" xmlns="http://www.w3.org/2000/svg" width="600" height="380" viewBox="-20 -10 550 400">
|
||||||
|
<style>
|
||||||
|
.svg__tokenization__text { fill: #1a1e23; font: 18px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
|
||||||
|
.svg__tokenization__text-small { fill: #fff; font: 600 13px "Source Code Pro", Monaco, "Courier New", monospace }
|
||||||
|
</style>
|
||||||
|
<path fill="none" stroke="#c00" stroke-width="2" stroke-miterlimit="10" d="M71 39v12H16v11M71 39v12h20v11"/>
|
||||||
|
<path fill="#f8cecc" stroke="#c00" stroke-width="2" d="M1 1h140v38.2H1z"/>
|
||||||
|
<text class="svg__tokenization__text" dy="1em" width="43" height="19" transform="translate(48.5 9.5)">“Let’s</text>
|
||||||
|
<path fill="none" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M175 39v23"/>
|
||||||
|
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M150 1h50v38.2h-50z"/>
|
||||||
|
<text class="svg__tokenization__text" dy="1em" transform="translate(164.5 9.5)" width="19" height="19">go</text>
|
||||||
|
<path fill="none" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M235 39v23"/>
|
||||||
|
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M210 1h50v38.2h-50z"/>
|
||||||
|
<text class="svg__tokenization__text" dy="1em" transform="translate(226.5 9.5)" width="15" height="19">to</text>
|
||||||
|
<path fill="none" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M341 39v23"/>
|
||||||
|
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M270 1h141v38.2H270z"/>
|
||||||
|
<text class="svg__tokenization__text" dy="1em" transform="translate(320.5 9.5)" width="38" height="19">N.Y.!”</text>
|
||||||
|
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M16 100v20"/>
|
||||||
|
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M1 62h30v38.2H1z"/>
|
||||||
|
<text class="svg__tokenization__text" dy="1em" transform="translate(11.5 70.5)" width="7" height="19">“</text>
|
||||||
|
<path fill="none" stroke="#c00" stroke-width="2" stroke-miterlimit="10" d="M91 100v11H66v9M91 100v11h29v9"/>
|
||||||
|
<path fill="#f8cecc" stroke="#c00" stroke-width="2" d="M41 62h100v38.2H41z"/>
|
||||||
|
<text class="svg__tokenization__text" dy="1em" transform="translate(72.5 70.5)" width="35" height="19">Let’s</text>
|
||||||
|
<path fill="none" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M175 100v20"/>
|
||||||
|
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M150 62h50v38.2h-50z"/>
|
||||||
|
<text class="svg__tokenization__text" dy="1em" transform="translate(164.5 70.5)" width="19" height="19">go</text>
|
||||||
|
<path fill="none" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M235 100v20"/>
|
||||||
|
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M210 62h50v38.2h-50z"/>
|
||||||
|
<text class="svg__tokenization__text" dy="1em" transform="translate(226.5 70.5)" width="15" height="19">to</text>
|
||||||
|
<path fill="none" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M341 100v20"/>
|
||||||
|
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M270 62h141v38.2H270z"/>
|
||||||
|
<text class="svg__tokenization__text" dy="1em" transform="translate(320.5 70.5)" width="38" height="19">N.Y.!”</text>
|
||||||
|
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M1 120h30v38H1z"/>
|
||||||
|
<text class="svg__tokenization__text" dy="1em" transform="translate(11.5 128.5)" width="7" height="19">“</text>
|
||||||
|
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M66 158v24"/>
|
||||||
|
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M41 120h50v38H41z"/>
|
||||||
|
<text class="svg__tokenization__text" dy="1em" transform="translate(53.5 128.5)" width="23" height="19">Let</text>
|
||||||
|
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M175 158v24"/>
|
||||||
|
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M150 120h50v38h-50z"/>
|
||||||
|
<text class="svg__tokenization__text" dy="1em" transform="translate(164.5 128.5)" width="19" height="19">go</text>
|
||||||
|
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M235 158v24"/>
|
||||||
|
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M210 120h50v38h-50z"/>
|
||||||
|
<text class="svg__tokenization__text" dy="1em" transform="translate(226.5 128.5)" width="15" height="19">to</text>
|
||||||
|
<path fill="none" stroke="#c00" stroke-width="2" stroke-miterlimit="10" d="M341 158v13h-20v11M341 158v13h55v11"/>
|
||||||
|
<path fill="#f8cecc" stroke="#c00" stroke-width="2" d="M270 120h141v38H270z"/>
|
||||||
|
<text class="svg__tokenization__text" dy="1em" transform="translate(320.5 128.5)" width="38" height="19">N.Y.!”</text>
|
||||||
|
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M120 158v24"/>
|
||||||
|
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M100 120h40v38h-40z"/>
|
||||||
|
<text class="svg__tokenization__text" dy="1em" transform="translate(113.5 128.5)" width="11" height="19">’s</text>
|
||||||
|
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M16 220v23"/>
|
||||||
|
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M1 181.8h30V220H1z"/>
|
||||||
|
<text class="svg__tokenization__text" dy="1em" transform="translate(11.5 190.5)" width="7" height="19">“</text>
|
||||||
|
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M66 220v23"/>
|
||||||
|
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M41 181.8h50V220H41z"/>
|
||||||
|
<text class="svg__tokenization__text" dy="1em" transform="translate(53.5 190.5)" width="23" height="19">Let</text>
|
||||||
|
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M175 220v23"/>
|
||||||
|
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M150 181.8h50V220h-50z"/>
|
||||||
|
<text class="svg__tokenization__text" dy="1em" transform="translate(164.5 190.5)" width="19" height="19">go</text>
|
||||||
|
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M235 220v23"/>
|
||||||
|
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M210 181.8h50V220h-50z"/>
|
||||||
|
<text class="svg__tokenization__text" dy="1em" transform="translate(226.5 190.5)" width="15" height="19">to</text>
|
||||||
|
<path fill="none" stroke="#c00" stroke-width="2" stroke-miterlimit="10" d="M321 220v11h-20v12M321 220v11h34v12"/>
|
||||||
|
<path fill="#f8cecc" stroke="#c00" stroke-width="2" d="M270 181.8h101V220H270z"/>
|
||||||
|
<text class="svg__tokenization__text" dy="1em" transform="translate(304.5 190.5)" width="30" height="19">N.Y.!</text>
|
||||||
|
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M120 220v23"/>
|
||||||
|
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M100 181.8h40V220h-40z"/>
|
||||||
|
<text class="svg__tokenization__text" dy="1em" transform="translate(113.5 190.5)" width="11" height="19">’s</text>
|
||||||
|
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M396 220v23"/>
|
||||||
|
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M381 181.8h30V220h-30z"/>
|
||||||
|
<text class="svg__tokenization__text" dy="1em" transform="translate(391.5 190.5)" width="7" height="19">”</text>
|
||||||
|
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M16 281v23"/>
|
||||||
|
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M1 242.7h30V281H1z"/>
|
||||||
|
<text class="svg__tokenization__text" dy="1em" transform="translate(11.5 251.5)" width="7" height="19">“</text>
|
||||||
|
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M66 281v23"/>
|
||||||
|
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M41 242.7h50V281H41z"/>
|
||||||
|
<text class="svg__tokenization__text" dy="1em" transform="translate(53.5 251.5)" width="23" height="19">Let</text>
|
||||||
|
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M175 281v20-17 20"/>
|
||||||
|
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M150 242.7h50V281h-50z"/>
|
||||||
|
<text class="svg__tokenization__text" dy="1em" transform="translate(164.5 251.5)" width="19" height="19">go</text>
|
||||||
|
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M235 281v23"/>
|
||||||
|
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M210 242.7h50V281h-50z"/>
|
||||||
|
<text class="svg__tokenization__text" dy="1em" transform="translate(226.5 251.5)" width="15" height="19">to</text>
|
||||||
|
<path fill="none" stroke="#c00" stroke-width="2" stroke-miterlimit="10" d="M301 281v23"/>
|
||||||
|
<path fill="#f8cecc" stroke="#b85450" stroke-width="2" d="M270 242.7h61V281h-61z"/>
|
||||||
|
<text class="svg__tokenization__text" dy="1em" transform="translate(286.5 251.5)" width="26" height="19">N.Y.</text>
|
||||||
|
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M120 281v23"/>
|
||||||
|
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M100 242.7h40V281h-40z"/>
|
||||||
|
<text class="svg__tokenization__text" dy="1em" transform="translate(113.5 251.5)" width="11" height="19">’s</text>
|
||||||
|
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M396 281v23"/>
|
||||||
|
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M381 242.7h30V281h-30z"/>
|
||||||
|
<text class="svg__tokenization__text" dy="1em" transform="translate(391.5 251.5)" width="7" height="19">”</text>
|
||||||
|
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M355 281v23"/>
|
||||||
|
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M340 242.7h30V281h-30z"/>
|
||||||
|
<text class="svg__tokenization__text" dy="1em" transform="translate(351.5 251.5)" width="5" height="19">!</text>
|
||||||
|
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M1 304h30v38H1z"/>
|
||||||
|
<text class="svg__tokenization__text" dy="1em" transform="translate(11.5 312.5)" width="7" height="19">“</text>
|
||||||
|
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M41 304h50v38H41z"/>
|
||||||
|
<text class="svg__tokenization__text" dy="1em" transform="translate(53.5 312.5)" width="23" height="19">Let</text>
|
||||||
|
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M150 304h50v38h-50z"/>
|
||||||
|
<text class="svg__tokenization__text" dy="1em" transform="translate(164.5 312.5)" width="19" height="19">go</text>
|
||||||
|
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M210 304h50v38h-50z"/>
|
||||||
|
<text class="svg__tokenization__text" dy="1em" transform="translate(226.5 312.5)" width="15" height="19">to</text>
|
||||||
|
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M270 304h61v38h-61z"/>
|
||||||
|
<text class="svg__tokenization__text" dy="1em" transform="translate(286.5 312.5)" width="26" height="19">N.Y.</text>
|
||||||
|
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M100 304h40v38h-40z"/>
|
||||||
|
<text class="svg__tokenization__text" dy="1em" transform="translate(113.5 312.5)" width="11" height="19">’s</text>
|
||||||
|
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M381 304h30v38h-30z"/>
|
||||||
|
<text class="svg__tokenization__text" dy="1em" transform="translate(391.5 312.5)" width="7" height="19">”</text>
|
||||||
|
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M340 304h30v38h-30z"/>
|
||||||
|
<text class="svg__tokenization__text" dy="1em" transform="translate(351.5 312.5)" width="5" height="19">!</text>
|
||||||
|
<rect width="104" height="19" x="437" y="72" fill="#c00" stroke="#c00" stroke-width="2" rx="2.9" ry="2.9"/>
|
||||||
|
<text class="svg__tokenization__text-small" dy="0.9em" transform="translate(455.5 74.5)" width="65" height="12">EXCEPTION</text>
|
||||||
|
<rect width="104" height="19" x="437" y="11" fill="#c00" stroke="#c00" stroke-width="2" rx="2.9" ry="2.9"/>
|
||||||
|
<text class="svg__tokenization__text-small" dy="0.9em" transform="translate(466.5 13.5)" width="43" height="12">PREFIX</text>
|
||||||
|
<rect width="104" height="19" x="437" y="130" fill="#c00" stroke="#c00" stroke-width="2" rx="2.9" ry="2.9"/>
|
||||||
|
<text class="svg__tokenization__text-small" dy="0.9em" transform="translate(466.5 132.5)" width="43" height="12">SUFFIX</text>
|
||||||
|
<rect width="104" height="19" x="437" y="191" fill="#c00" stroke="#c00" stroke-width="2" rx="2.9" ry="2.9"/>
|
||||||
|
<text class="svg__tokenization__text-small" dy="0.9em" transform="translate(466.5 193.5)" width="43" height="12">SUFFIX</text>
|
||||||
|
<rect width="104" height="19" x="437" y="252" fill="#c00" stroke="#c00" stroke-width="2" rx="2.9" ry="2.9"/>
|
||||||
|
<text class="svg__tokenization__text-small" dy="0.9em" transform="translate(455.5 254.5)" width="65" height="12">EXCEPTION</text>
|
||||||
|
<rect width="104" height="19" x="437" y="313" fill="#82b366" stroke="#82b366" stroke-width="2" rx="2.9" ry="2.9"/>
|
||||||
|
<text class="svg__tokenization__text-small" dy="0.9em" transform="translate(473.5 315.5)" width="29" height="12">DONE</text>
|
||||||
|
</svg>
|
After Width: | Height: | Size: 12 KiB |
|
@ -1,9 +1,9 @@
|
||||||
<svg class="o-svg" xmlns="http://www.w3.org/2000/svg" viewBox="-10 -10 582 365" width="572" height="355">
|
<svg class="o-svg" xmlns="http://www.w3.org/2000/svg" viewBox="-10 -10 582 365" width="572" height="355">
|
||||||
<style>
|
<style>
|
||||||
.svg__vocab__text { fill: #1a1e23; font: 18px "Source Sans Pro" }
|
.svg__vocab__text { fill: #1a1e23; font: 18px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
|
||||||
.svg__vocab__text-large { fill: #fff; font: bold 18px "Source Sans Pro"; text-transform: uppercase }
|
.svg__vocab__text-large { fill: #fff; font: bold 18px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif; text-transform: uppercase }
|
||||||
.svg__vocab__text-box { fill: #fff; font: bold 12px "Source Code Pro" }
|
.svg__vocab__text-box { fill: #fff; font: bold 12px "Source Code Pro", Monaco, "Courier New", monospace }
|
||||||
.svg__vocab__text-code { fill: #1a1e23; font: bold 12px "Source Code Pro" }
|
.svg__vocab__text-code { fill: #1a1e23; font: bold 12px "Source Code Pro", Monaco, "Courier New", monospace }
|
||||||
</style>
|
</style>
|
||||||
<rect width="570" height="88" x="1" y="135" fill="#d5e8d4" stroke="#82b366" stroke-width="2" rx="13.2" ry="13.2"/>
|
<rect width="570" height="88" x="1" y="135" fill="#d5e8d4" stroke="#82b366" stroke-width="2" rx="13.2" ry="13.2"/>
|
||||||
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M444 164h100v40H444z"/>
|
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M444 164h100v40H444z"/>
|
||||||
|
|
Before Width: | Height: | Size: 7.6 KiB After Width: | Height: | Size: 7.8 KiB |
|
@ -158,7 +158,8 @@
|
||||||
|
|
||||||
"binder": {
|
"binder": {
|
||||||
"title": "Binder",
|
"title": "Binder",
|
||||||
"tag": "class"
|
"tag": "class",
|
||||||
|
"source": "spacy/tokens/binder.pyx"
|
||||||
},
|
},
|
||||||
|
|
||||||
"annotation": {
|
"annotation": {
|
||||||
|
|
|
@ -2,7 +2,10 @@
|
||||||
|
|
||||||
include ../../_includes/_mixins
|
include ../../_includes/_mixins
|
||||||
|
|
||||||
p spaCy currently supports the following languages and capabilities:
|
p
|
||||||
|
| spaCy currently provides models for the following languages and
|
||||||
|
| capabilities:
|
||||||
|
|
||||||
|
|
||||||
+aside-code("Download language models", "bash").
|
+aside-code("Download language models", "bash").
|
||||||
python -m spacy download en
|
python -m spacy download en
|
||||||
|
@ -22,12 +25,16 @@ p spaCy currently supports the following languages and capabilities:
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell French #[code fr]
|
+cell French #[code fr]
|
||||||
each icon in [ "pro", "pro", "con", "pro", "con", "pro", "pro", "con" ]
|
each icon in [ "pro", "con", "con", "pro", "con", "pro", "pro", "con" ]
|
||||||
+cell.u-text-center #[+procon(icon)]
|
+cell.u-text-center #[+procon(icon)]
|
||||||
|
|
||||||
+h(2, "available") Available models
|
+row
|
||||||
|
+cell Spanish #[code es]
|
||||||
|
each icon in [ "pro", "pro", "con", "pro", "pro", "pro", "pro", "con" ]
|
||||||
|
+cell.u-text-center #[+procon(icon)]
|
||||||
|
|
||||||
include ../usage/_models-list
|
p
|
||||||
|
+button("/docs/usage/models", true, "primary") See available models
|
||||||
|
|
||||||
+h(2, "alpha-support") Alpha tokenization support
|
+h(2, "alpha-support") Alpha tokenization support
|
||||||
|
|
||||||
|
@ -52,9 +59,35 @@ p
|
||||||
| #[+a("https://github.com/mocobeta/janome") Janome].
|
| #[+a("https://github.com/mocobeta/janome") Janome].
|
||||||
|
|
||||||
+table([ "Language", "Code", "Source" ])
|
+table([ "Language", "Code", "Source" ])
|
||||||
each language, code in { es: "Spanish", it: "Italian", pt: "Portuguese", nl: "Dutch", sv: "Swedish", fi: "Finnish", nb: "Norwegian Bokmål", da: "Danish", hu: "Hungarian", pl: "Polish", bn: "Bengali", he: "Hebrew", zh: "Chinese", ja: "Japanese" }
|
each language, code in { it: "Italian", pt: "Portuguese", nl: "Dutch", sv: "Swedish", fi: "Finnish", nb: "Norwegian Bokmål", da: "Danish", hu: "Hungarian", pl: "Polish", bn: "Bengali", he: "Hebrew", zh: "Chinese", ja: "Japanese" }
|
||||||
+row
|
+row
|
||||||
+cell #{language}
|
+cell #{language}
|
||||||
+cell #[code=code]
|
+cell #[code=code]
|
||||||
+cell
|
+cell
|
||||||
+src(gh("spaCy", "spacy/lang/" + code)) lang/#{code}
|
+src(gh("spaCy", "spacy/lang/" + code)) lang/#{code}
|
||||||
|
|
||||||
|
+h(2, "multi-language") Multi-language support
|
||||||
|
+tag-new(2)
|
||||||
|
|
||||||
|
p
|
||||||
|
| As of v2.0, spaCy supports models trained on more than one language. This
|
||||||
|
| is especially useful for named entity recognition. The language ID used
|
||||||
|
| for multi-language or language-neutral models is #[code xx]. The
|
||||||
|
| language class, a generic subclass containing only the base language data,
|
||||||
|
| can be found in #[+src(gh("spaCy", "spacy/lang/xx")) lang/xx].
|
||||||
|
|
||||||
|
p
|
||||||
|
| To load your model with the neutral, multi-language class, simply set
|
||||||
|
| #[code "language": "xx"] in your
|
||||||
|
| #[+a("/docs/usage/saving-loading#models-generating") model package]'s
|
||||||
|
| meta.json. You can also import the class directly, or call
|
||||||
|
| #[+api("util#get_lang_class") #[code util.get_lang_class()]] for
|
||||||
|
| lazy-loading.
|
||||||
|
|
||||||
|
+code("Standard import").
|
||||||
|
from spacy.lang.xx import MultiLanguage
|
||||||
|
nlp = MultiLanguage()
|
||||||
|
|
||||||
|
+code("With lazy-loading").
|
||||||
|
from spacy.util import get_lang_class
|
||||||
|
nlp = get_lang_class('xx')
|
||||||
|
|
|
@ -11,8 +11,13 @@ p
|
||||||
| the name of an installed
|
| the name of an installed
|
||||||
| #[+a("/docs/usage/saving-loading#generating") model package], a unicode
|
| #[+a("/docs/usage/saving-loading#generating") model package], a unicode
|
||||||
| path or a #[code Path]-like object. spaCy will try resolving the load
|
| path or a #[code Path]-like object. spaCy will try resolving the load
|
||||||
| argument in this order. The #[code Language] class to initialise will be
|
| argument in this order. If a model is loaded from a shortcut link or
|
||||||
| determined based on the model's settings.
|
| package name, spaCy will assume it's a Python package and import it and
|
||||||
|
| call the model's own #[code load()] method. If a model is loaded from a
|
||||||
|
| path, spaCy will assume it's a data directory, read the language and
|
||||||
|
| pipeline settings off the meta.json and initialise the #[code Language]
|
||||||
|
| class. The data will be loaded in via
|
||||||
|
| #[+api("language#from_disk") #[code Language.from_disk()]].
|
||||||
|
|
||||||
+aside-code("Example").
|
+aside-code("Example").
|
||||||
nlp = spacy.load('en') # shortcut link
|
nlp = spacy.load('en') # shortcut link
|
||||||
|
@ -20,7 +25,7 @@ p
|
||||||
nlp = spacy.load('/path/to/en') # unicode path
|
nlp = spacy.load('/path/to/en') # unicode path
|
||||||
nlp = spacy.load(Path('/path/to/en')) # pathlib Path
|
nlp = spacy.load(Path('/path/to/en')) # pathlib Path
|
||||||
|
|
||||||
nlp = spacy.load('en', disable['parser', 'tagger'])
|
nlp = spacy.load('en', disable=['parser', 'tagger'])
|
||||||
|
|
||||||
+table(["Name", "Type", "Description"])
|
+table(["Name", "Type", "Description"])
|
||||||
+row
|
+row
|
||||||
|
|
|
@ -1,12 +1,10 @@
|
||||||
//- 💫 DOCS > API > ANNOTATION SPECS
|
//- 💫 DOCS > API > UTIL
|
||||||
|
|
||||||
include ../../_includes/_mixins
|
include ../../_includes/_mixins
|
||||||
|
|
||||||
p
|
p
|
||||||
| spaCy comes with a small collection of utility functions located in
|
| spaCy comes with a small collection of utility functions located in
|
||||||
| #[+src(gh("spaCy", "spacy/util.py")) spacy/util.py].
|
| #[+src(gh("spaCy", "spacy/util.py")) spacy/util.py].
|
||||||
|
|
||||||
+infobox("Important note")
|
|
||||||
| Because utility functions are mostly intended for
|
| Because utility functions are mostly intended for
|
||||||
| #[strong internal use within spaCy], their behaviour may change with
|
| #[strong internal use within spaCy], their behaviour may change with
|
||||||
| future releases. The functions documented on this page should be safe
|
| future releases. The functions documented on this page should be safe
|
||||||
|
@ -74,15 +72,23 @@ p
|
||||||
+cell #[code Language]
|
+cell #[code Language]
|
||||||
+cell Language class.
|
+cell Language class.
|
||||||
|
|
||||||
+h(2, "resolve_model_path") util.resolve_model_path
|
+h(2, "load_model") util.load_model
|
||||||
+tag function
|
+tag function
|
||||||
+tag-new(2)
|
+tag-new(2)
|
||||||
|
|
||||||
p Resolve a model name or string to a model path.
|
p
|
||||||
|
| Load a model from a shortcut link, package or data path. If called with a
|
||||||
|
| shortcut link or package name, spaCy will assume the model is a Python
|
||||||
|
| package and import and call its #[code load()] method. If called with a
|
||||||
|
| path, spaCy will assume it's a data directory, read the language and
|
||||||
|
| pipeline settings from the meta.json and initialise a #[code Language]
|
||||||
|
| class. The model data will then be loaded in via
|
||||||
|
| #[+api("language#from_disk") #[code Language.from_disk()]].
|
||||||
|
|
||||||
+aside-code("Example").
|
+aside-code("Example").
|
||||||
model_path = util.resolve_model_path('en')
|
nlp = util.load_model('en')
|
||||||
model_path = util.resolve_model_path('/path/to/en')
|
nlp = util.load_model('en_core_web_sm')
|
||||||
|
nlp = util.load_model('/path/to/data')
|
||||||
|
|
||||||
+table(["Name", "Type", "Description"])
|
+table(["Name", "Type", "Description"])
|
||||||
+row
|
+row
|
||||||
|
@ -92,8 +98,33 @@ p Resolve a model name or string to a model path.
|
||||||
|
|
||||||
+footrow
|
+footrow
|
||||||
+cell returns
|
+cell returns
|
||||||
+cell #[code Path]
|
+cell #[code Language]
|
||||||
+cell Path to model data directory.
|
+cell #[code Language] class with the loaded model.
|
||||||
|
|
||||||
|
+h(2, "load_model_from_init_py") util.load_model_from_init_py
|
||||||
|
+tag function
|
||||||
|
+tag-new(2)
|
||||||
|
|
||||||
|
p
|
||||||
|
| A helper function to use in the #[code load()] method of a model package's
|
||||||
|
| #[+src(gh("spacy-dev-resources", "templates/model/en_model_name/__init__.py")) __init__.py].
|
||||||
|
|
||||||
|
+aside-code("Example").
|
||||||
|
from spacy.util import load_model_from_init_py
|
||||||
|
|
||||||
|
def load():
|
||||||
|
return load_model_from_init_py(__file__)
|
||||||
|
|
||||||
|
+table(["Name", "Type", "Description"])
|
||||||
|
+row
|
||||||
|
+cell #[code init_file]
|
||||||
|
+cell unicode
|
||||||
|
+cell Path to model's __init__.py, i.e. #[code __file__].
|
||||||
|
|
||||||
|
+footrow
|
||||||
|
+cell returns
|
||||||
|
+cell #[code Language]
|
||||||
|
+cell #[code Language] class with the loaded model.
|
||||||
|
|
||||||
+h(2, "is_package") util.is_package
|
+h(2, "is_package") util.is_package
|
||||||
+tag function
|
+tag function
|
||||||
|
@ -117,16 +148,18 @@ p
|
||||||
+cell #[code bool]
|
+cell #[code bool]
|
||||||
+cell #[code True] if installed package, #[code False] if not.
|
+cell #[code True] if installed package, #[code False] if not.
|
||||||
|
|
||||||
+h(2, "get_model_package_path") util.get_model_package_path
|
+h(2, "get_package_path") util.get_package_path
|
||||||
+tag function
|
+tag function
|
||||||
|
+tag-new(2)
|
||||||
|
|
||||||
p
|
p
|
||||||
| Get path to a #[+a("/docs/usage/models") model package] installed via pip.
|
| Get path to an installed package. Mainly used to resolve the location of
|
||||||
| Currently imports the package to find it and parse its meta data.
|
| #[+a("/docs/usage/models") model packages]. Currently imports the package
|
||||||
|
| to find its path.
|
||||||
|
|
||||||
+aside-code("Example").
|
+aside-code("Example").
|
||||||
util.get_model_package_path('en_core_web_sm')
|
util.get_package_path('en_core_web_sm')
|
||||||
# /usr/lib/python3.6/site-packages/en_core_web_sm/en_core_web_sm-1.2.0
|
# /usr/lib/python3.6/site-packages/en_core_web_sm
|
||||||
|
|
||||||
+table(["Name", "Type", "Description"])
|
+table(["Name", "Type", "Description"])
|
||||||
+row
|
+row
|
||||||
|
@ -137,37 +170,8 @@ p
|
||||||
+footrow
|
+footrow
|
||||||
+cell returns
|
+cell returns
|
||||||
+cell #[code Path]
|
+cell #[code Path]
|
||||||
+cell Path to model data directory.
|
|
||||||
|
|
||||||
+h(2, "parse_package_meta") util.parse_package_meta
|
|
||||||
+tag function
|
|
||||||
|
|
||||||
p
|
|
||||||
| Check if a #[code meta.json] exists in a model package and return its
|
|
||||||
| contents.
|
|
||||||
|
|
||||||
+aside-code("Example").
|
|
||||||
if util.is_package('en_core_web_sm'):
|
|
||||||
path = util.get_model_package_path('en_core_web_sm')
|
|
||||||
meta = util.parse_package_meta(path, require=True)
|
|
||||||
# {'name': 'core_web_sm', 'lang': 'en', ...}
|
|
||||||
|
|
||||||
+table(["Name", "Type", "Description"])
|
|
||||||
+row
|
|
||||||
+cell #[code package_path]
|
|
||||||
+cell #[code Path]
|
|
||||||
+cell Path to model package directory.
|
+cell Path to model package directory.
|
||||||
|
|
||||||
+row
|
|
||||||
+cell #[code require]
|
|
||||||
+cell #[code bool]
|
|
||||||
+cell If #[code True], raise error if no #[code meta.json] is found.
|
|
||||||
|
|
||||||
+footrow
|
|
||||||
+cell returns
|
|
||||||
+cell dict / #[code None]
|
|
||||||
+cell Model meta data or #[code None].
|
|
||||||
|
|
||||||
+h(2, "is_in_jupyter") util.is_in_jupyter
|
+h(2, "is_in_jupyter") util.is_in_jupyter
|
||||||
+tag function
|
+tag function
|
||||||
+tag-new(2)
|
+tag-new(2)
|
||||||
|
|
|
@ -5,7 +5,7 @@ p
|
||||||
| #[strong how similar they are]. Predicting similarity is useful for
|
| #[strong how similar they are]. Predicting similarity is useful for
|
||||||
| building recommendation systems or flagging duplicates. For example, you
|
| building recommendation systems or flagging duplicates. For example, you
|
||||||
| can suggest a user content that's similar to what they're currently
|
| can suggest a user content that's similar to what they're currently
|
||||||
| looking at, or label a support ticket as a duplicate, if it's very
|
| looking at, or label a support ticket as a duplicate if it's very
|
||||||
| similar to an already existing one.
|
| similar to an already existing one.
|
||||||
|
|
||||||
p
|
p
|
||||||
|
|
|
@ -16,3 +16,47 @@ p
|
||||||
+row
|
+row
|
||||||
for cell in ["Apple", "is", "looking", "at", "buying", "U.K.", "startup", "for", "$", "1", "billion"]
|
for cell in ["Apple", "is", "looking", "at", "buying", "U.K.", "startup", "for", "$", "1", "billion"]
|
||||||
+cell=cell
|
+cell=cell
|
||||||
|
|
||||||
|
p
|
||||||
|
| Fist, the raw text is split on whitespace characters, similar to
|
||||||
|
| #[code text.split(' ')]. Then, the tokenizer processes the text from
|
||||||
|
| left to right. On each substring, it performs two checks:
|
||||||
|
|
||||||
|
+list("numbers")
|
||||||
|
+item
|
||||||
|
| #[strong Does the substring match a tokenizer exception rule?] For
|
||||||
|
| example, "don't" does not contain whitespace, but should be split
|
||||||
|
| into two tokens, "do" and "n't", while "U.K." should always
|
||||||
|
| remain one token.
|
||||||
|
+item
|
||||||
|
| #[strong Can a prefix, suffix or infixes be split off?]. For example
|
||||||
|
| punctuation like commas, periods, hyphens or quotes.
|
||||||
|
|
||||||
|
p
|
||||||
|
| If there's a match, the rule is applied and the tokenizer continues its
|
||||||
|
| loop, starting with the newly split substrings. This way, spaCy can split
|
||||||
|
| #[strong complex, nested tokens] like combinations of abbreviations and
|
||||||
|
| multiple punctuation marks.
|
||||||
|
|
||||||
|
+aside
|
||||||
|
| #[strong Tokenizer exception:] Special-case rule to split a string into
|
||||||
|
| several tokens or prevent a token from being split when punctuation rules
|
||||||
|
| are applied.#[br]
|
||||||
|
| #[strong Prefix:] Character(s) at the beginning, e.g.
|
||||||
|
| #[code $], #[code (], #[code “], #[code ¿].#[br]
|
||||||
|
| #[strong Suffix:] Character(s) at the end, e.g.
|
||||||
|
| #[code km], #[code )], #[code ”], #[code !].#[br]
|
||||||
|
| #[strong Infix:] Character(s) in between, e.g.
|
||||||
|
| #[code -], #[code --], #[code /], #[code …].#[br]
|
||||||
|
|
||||||
|
+image
|
||||||
|
include ../../../assets/img/docs/tokenization.svg
|
||||||
|
.u-text-right
|
||||||
|
+button("/assets/img/docs/tokenization.svg", false, "secondary").u-text-tag View large graphic
|
||||||
|
|
||||||
|
p
|
||||||
|
| While punctuation rules are usually pretty general, tokenizer exceptions
|
||||||
|
| strongly depend on the specifics of the individual language. This is
|
||||||
|
| why each #[+a("/docs/api/language-models") available language] has its
|
||||||
|
| own subclass like #[code English] or #[code German], that loads in lists
|
||||||
|
| of hard-coded data and exception rules.
|
||||||
|
|
|
@ -89,4 +89,6 @@ p
|
||||||
|
|
||||||
p
|
p
|
||||||
| Even though both #[code Doc] objects contain the same words, the internal
|
| Even though both #[code Doc] objects contain the same words, the internal
|
||||||
| integer IDs are very different.
|
| integer IDs are very different. The same applies for all other strings,
|
||||||
|
| like the annotation scheme. To avoid mismatched IDs, spaCy will always
|
||||||
|
| export the vocab if you save a #[code Doc] or #[code nlp] object.
|
||||||
|
|
|
@ -144,7 +144,7 @@ p
|
||||||
+table(["Argument", "Type", "Description"])
|
+table(["Argument", "Type", "Description"])
|
||||||
+row
|
+row
|
||||||
+cell #[code vocab]
|
+cell #[code vocab]
|
||||||
+cell #[coce Vocab]
|
+cell #[code Vocab]
|
||||||
+cell
|
+cell
|
||||||
| Shared data between components, including strings, morphology,
|
| Shared data between components, including strings, morphology,
|
||||||
| vectors etc.
|
| vectors etc.
|
||||||
|
|
|
@ -139,6 +139,8 @@ p
|
||||||
new_doc = Doc(Vocab()).from_disk('/moby_dick.bin')
|
new_doc = Doc(Vocab()).from_disk('/moby_dick.bin')
|
||||||
|
|
||||||
+infobox
|
+infobox
|
||||||
|
| #[strong API:] #[+api("language") #[code Language]],
|
||||||
|
| #[+api("doc") #[code Doc]]
|
||||||
| #[strong Usage:] #[+a("/docs/usage/saving-loading") Saving and loading]
|
| #[strong Usage:] #[+a("/docs/usage/saving-loading") Saving and loading]
|
||||||
|
|
||||||
+h(2, "rule-matcher") Match text with token rules
|
+h(2, "rule-matcher") Match text with token rules
|
||||||
|
|
|
@ -345,7 +345,7 @@ p
|
||||||
| account and check the #[code subtree] for intensifiers like "very", to
|
| account and check the #[code subtree] for intensifiers like "very", to
|
||||||
| increase the sentiment score. At some point, you might also want to train
|
| increase the sentiment score. At some point, you might also want to train
|
||||||
| a sentiment model. However, the approach described in this example is
|
| a sentiment model. However, the approach described in this example is
|
||||||
| very useful for #[strong bootstrapping rules to gather training data].
|
| very useful for #[strong bootstrapping rules to collect training data].
|
||||||
| It's also an incredibly fast way to gather first insights into your data
|
| It's also an incredibly fast way to gather first insights into your data
|
||||||
| – with about 1 million tweets, you'd be looking at a processing time of
|
| – with about 1 million tweets, you'd be looking at a processing time of
|
||||||
| #[strong under 1 minute].
|
| #[strong under 1 minute].
|
||||||
|
|
|
@ -65,7 +65,7 @@ p
|
||||||
| spaCy provides a variety of linguistic annotations to give you insights
|
| spaCy provides a variety of linguistic annotations to give you insights
|
||||||
| into a text's grammatical structure. This includes the word types,
|
| into a text's grammatical structure. This includes the word types,
|
||||||
| i.e. the parts of speech, and how the words are related to each other.
|
| i.e. the parts of speech, and how the words are related to each other.
|
||||||
| For example, if you're analysing text, it makes a #[em huge] difference
|
| For example, if you're analysing text, it makes a huge difference
|
||||||
| whether a noun is the subject of a sentence, or the object – or whether
|
| whether a noun is the subject of a sentence, or the object – or whether
|
||||||
| "google" is used as a verb, or refers to the website or company in a
|
| "google" is used as a verb, or refers to the website or company in a
|
||||||
| specific context.
|
| specific context.
|
||||||
|
@ -94,9 +94,10 @@ p
|
||||||
include _spacy-101/_tokenization
|
include _spacy-101/_tokenization
|
||||||
|
|
||||||
+infobox
|
+infobox
|
||||||
| To learn more about how spaCy's tokenizer and its rules work in detail,
|
| To learn more about how spaCy's tokenization rules work in detail,
|
||||||
| how to #[strong customise] it and how to #[strong add your own tokenizer]
|
| how to #[strong customise and replace] the default tokenizer and how to
|
||||||
| to a processing pipeline, see the usage guide on
|
| #[strong add language-specific data], see the usage guides on
|
||||||
|
| #[+a("/docs/usage/adding-languages") adding languages] and
|
||||||
| #[+a("/docs/usage/customizing-tokenizer") customising the tokenizer].
|
| #[+a("/docs/usage/customizing-tokenizer") customising the tokenizer].
|
||||||
|
|
||||||
+h(3, "annotations-pos-deps") Part-of-speech tags and dependencies
|
+h(3, "annotations-pos-deps") Part-of-speech tags and dependencies
|
||||||
|
@ -118,9 +119,11 @@ include _spacy-101/_named-entities
|
||||||
|
|
||||||
+infobox
|
+infobox
|
||||||
| To learn more about entity recognition in spaCy, how to
|
| To learn more about entity recognition in spaCy, how to
|
||||||
| #[strong add your own entities] to a document and how to train and update
|
| #[strong add your own entities] to a document and how to
|
||||||
| the entity predictions of a model, see the usage guide on
|
| #[strong train and update] the entity predictions of a model, see the
|
||||||
| #[+a("/docs/usage/entity-recognition") named entity recognition].
|
| usage guides on
|
||||||
|
| #[+a("/docs/usage/entity-recognition") named entity recognition] and
|
||||||
|
| #[+a("/docs/usage/training-ner") training the named entity recognizer].
|
||||||
|
|
||||||
+h(2, "vectors-similarity") Word vectors and similarity
|
+h(2, "vectors-similarity") Word vectors and similarity
|
||||||
+tag-model("vectors")
|
+tag-model("vectors")
|
||||||
|
|
|
@ -20,19 +20,18 @@ p
|
||||||
nlp = Language(pipeline=['my_factory', mycomponent])
|
nlp = Language(pipeline=['my_factory', mycomponent])
|
||||||
|
|
||||||
p
|
p
|
||||||
| It's now much easier to customise the pipeline with your own components.
|
| It's now much easier to #[strong customise the pipeline] with your own
|
||||||
| Components are functions that receive a #[code Doc] object, modify and
|
| components, functions that receive a #[code Doc] object, modify and
|
||||||
| return it. If your component is stateful, you'll want to create a new one
|
| return it. If your component is stateful, you can define and register a
|
||||||
| for each pipeline. You can do that by defining and registering a factory
|
| factory which receives the shared #[code Vocab] object and returns a
|
||||||
| which receives the shared #[code Vocab] object and returns a component.
|
| component. spaCy's default components can be added to your pipeline by
|
||||||
|
| using their string IDs. This way, you won't have to worry about finding
|
||||||
p
|
| and implementing them – simply add #[code "tagger"] to the pipeline,
|
||||||
| spaCy's default components – the vectorizer, tagger, parser and entity
|
|
||||||
| recognizer, can be added to your pipeline by using their string IDs.
|
|
||||||
| This way, you won't have to worry about finding and implementing them –
|
|
||||||
| to use the default tagger, simply add #[code "tagger"] to the pipeline,
|
|
||||||
| and spaCy will know what to do.
|
| and spaCy will know what to do.
|
||||||
|
|
||||||
|
+image
|
||||||
|
include ../../assets/img/docs/pipeline.svg
|
||||||
|
|
||||||
+infobox
|
+infobox
|
||||||
| #[strong API:] #[+api("language") #[code Language]]
|
| #[strong API:] #[+api("language") #[code Language]]
|
||||||
| #[strong Usage:] #[+a("/docs/usage/language-processing-pipeline") Processing text]
|
| #[strong Usage:] #[+a("/docs/usage/language-processing-pipeline") Processing text]
|
||||||
|
@ -96,11 +95,10 @@ p
|
||||||
| #[code Language] class, or load a model that initialises one. This allows
|
| #[code Language] class, or load a model that initialises one. This allows
|
||||||
| languages to contain more custom data, e.g. lemmatizer lookup tables, or
|
| languages to contain more custom data, e.g. lemmatizer lookup tables, or
|
||||||
| complex regular expressions. The language data has also been tidied up
|
| complex regular expressions. The language data has also been tidied up
|
||||||
| and simplified. It's now also possible to overwrite the functions that
|
| and simplified. spaCy now also supports simple lookup-based lemmatization.
|
||||||
| compute lexical attributes like #[code like_num], and supply
|
|
||||||
| language-specific syntax iterators, e.g. to determine noun chunks. spaCy
|
+image
|
||||||
| now also supports simple lookup-based lemmatization. The data is stored
|
include ../../assets/img/docs/language_data.svg
|
||||||
| in a dictionary mapping a string to its lemma.
|
|
||||||
|
|
||||||
+infobox
|
+infobox
|
||||||
| #[strong API:] #[+api("language") #[code Language]]
|
| #[strong API:] #[+api("language") #[code Language]]
|
||||||
|
@ -111,13 +109,10 @@ p
|
||||||
|
|
||||||
+aside-code("Example").
|
+aside-code("Example").
|
||||||
from spacy.matcher import Matcher
|
from spacy.matcher import Matcher
|
||||||
from spacy.attrs import LOWER, IS_PUNCT
|
|
||||||
matcher = Matcher(nlp.vocab)
|
matcher = Matcher(nlp.vocab)
|
||||||
matcher.add('HelloWorld', None,
|
matcher.add('HEARTS', None, [{'ORTH': '❤️', 'OP': '+'}])
|
||||||
[{LOWER: 'hello'}, {IS_PUNCT: True}, {LOWER: 'world'}],
|
|
||||||
[{LOWER: 'hello'}, {LOWER: 'world'}])
|
|
||||||
assert len(matcher) == 1
|
assert len(matcher) == 1
|
||||||
assert 'HelloWorld' in matcher
|
assert 'HEARTS' in matcher
|
||||||
|
|
||||||
p
|
p
|
||||||
| Patterns can now be added to the matcher by calling
|
| Patterns can now be added to the matcher by calling
|
||||||
|
@ -157,28 +152,8 @@ p
|
||||||
+cell #[+api("language#to_disk") #[code Language.to_disk]]
|
+cell #[+api("language#to_disk") #[code Language.to_disk]]
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code Tokenizer.load]
|
+cell #[code Language.create_make_doc]
|
||||||
+cell
|
+cell #[+api("language#attributes") #[code Language.tokenizer]]
|
||||||
| #[+api("tokenizer#from_disk") #[code Tokenizer.from_disk]]
|
|
||||||
| #[+api("tokenizer#from_bytes") #[code Tokenizer.from_bytes]]
|
|
||||||
|
|
||||||
+row
|
|
||||||
+cell #[code Tagger.load]
|
|
||||||
+cell
|
|
||||||
| #[+api("tagger#from_disk") #[code Tagger.from_disk]]
|
|
||||||
| #[+api("tagger#from_bytes") #[code Tagger.from_bytes]]
|
|
||||||
|
|
||||||
+row
|
|
||||||
+cell #[code DependencyParser.load]
|
|
||||||
+cell
|
|
||||||
| #[+api("dependencyparser#from_disk") #[code DependencyParser.from_disk]]
|
|
||||||
| #[+api("dependencyparser#from_bytes") #[code DependencyParser.from_bytes]]
|
|
||||||
|
|
||||||
+row
|
|
||||||
+cell #[code EntityRecognizer.load]
|
|
||||||
+cell
|
|
||||||
| #[+api("entityrecognizer#from_disk") #[code EntityRecognizer.from_disk]]
|
|
||||||
| #[+api("entityrecognizer#from_bytes") #[code EntityRecognizer.from_bytes]]
|
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell
|
+cell
|
||||||
|
@ -212,6 +187,28 @@ p
|
||||||
| #[+api("stringstore#to_disk") #[code StringStore.to_disk]]
|
| #[+api("stringstore#to_disk") #[code StringStore.to_disk]]
|
||||||
| #[+api("stringstore#to_bytes") #[code StringStore.to_bytes]]
|
| #[+api("stringstore#to_bytes") #[code StringStore.to_bytes]]
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code Tokenizer.load]
|
||||||
|
+cell -
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code Tagger.load]
|
||||||
|
+cell
|
||||||
|
| #[+api("tagger#from_disk") #[code Tagger.from_disk]]
|
||||||
|
| #[+api("tagger#from_bytes") #[code Tagger.from_bytes]]
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code DependencyParser.load]
|
||||||
|
+cell
|
||||||
|
| #[+api("dependencyparser#from_disk") #[code DependencyParser.from_disk]]
|
||||||
|
| #[+api("dependencyparser#from_bytes") #[code DependencyParser.from_bytes]]
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code EntityRecognizer.load]
|
||||||
|
+cell
|
||||||
|
| #[+api("entityrecognizer#from_disk") #[code EntityRecognizer.from_disk]]
|
||||||
|
| #[+api("entityrecognizer#from_bytes") #[code EntityRecognizer.from_bytes]]
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code Matcher.load]
|
+cell #[code Matcher.load]
|
||||||
+cell -
|
+cell -
|
||||||
|
@ -232,7 +229,7 @@ p
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code Doc.read_bytes]
|
+cell #[code Doc.read_bytes]
|
||||||
+cell
|
+cell #[+api("binder") #[code Binder]]
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code Token.is_ancestor_of]
|
+cell #[code Token.is_ancestor_of]
|
||||||
|
|