WIP on vectors fixes

This commit is contained in:
Matthew Honnibal 2017-10-31 11:22:56 +01:00
commit 9c11ee4a1c
34 changed files with 682 additions and 343 deletions

View File

@ -0,0 +1,21 @@
{"lang": "en", "settings": {"oov_prob": -20.502029418945312}}
{"orth": ".", "id": 1, "lower": ".", "norm": ".", "shape": ".", "prefix": ".", "suffix": ".", "length": 1, "cluster": "8", "prob": -3.0678977966308594, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": true, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": ",", "id": 2, "lower": ",", "norm": ",", "shape": ",", "prefix": ",", "suffix": ",", "length": 1, "cluster": "4", "prob": -3.4549596309661865, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": true, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "the", "id": 3, "lower": "the", "norm": "the", "shape": "xxx", "prefix": "t", "suffix": "the", "length": 3, "cluster": "11", "prob": -3.528766632080078, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "I", "id": 4, "lower": "i", "norm": "I", "shape": "X", "prefix": "I", "suffix": "I", "length": 1, "cluster": "346", "prob": -3.791565179824829, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": true, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "to", "id": 5, "lower": "to", "norm": "to", "shape": "xx", "prefix": "t", "suffix": "to", "length": 2, "cluster": "12", "prob": -3.8560216426849365, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "a", "id": 6, "lower": "a", "norm": "a", "shape": "x", "prefix": "a", "suffix": "a", "length": 1, "cluster": "19", "prob": -3.92978835105896, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "and", "id": 7, "lower": "and", "norm": "and", "shape": "xxx", "prefix": "a", "suffix": "and", "length": 3, "cluster": "20", "prob": -4.113108158111572, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "of", "id": 8, "lower": "of", "norm": "of", "shape": "xx", "prefix": "o", "suffix": "of", "length": 2, "cluster": "28", "prob": -4.27587366104126, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "you", "id": 9, "lower": "you", "norm": "you", "shape": "xxx", "prefix": "y", "suffix": "you", "length": 3, "cluster": "602", "prob": -4.373791217803955, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "it", "id": 10, "lower": "it", "norm": "it", "shape": "xx", "prefix": "i", "suffix": "it", "length": 2, "cluster": "474", "prob": -4.388050079345703, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "is", "id": 11, "lower": "is", "norm": "is", "shape": "xx", "prefix": "i", "suffix": "is", "length": 2, "cluster": "762", "prob": -4.457748889923096, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "that", "id": 12, "lower": "that", "norm": "that", "shape": "xxxx", "prefix": "t", "suffix": "hat", "length": 4, "cluster": "84", "prob": -4.464504718780518, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "\n\n", "id": 0, "lower": "\n\n", "norm": "\n\n", "shape": "\n\n", "prefix": "\n", "suffix": "\n\n", "length": 2, "cluster": "0", "prob": -4.606560707092285, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": true, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "in", "id": 13, "lower": "in", "norm": "in", "shape": "xx", "prefix": "i", "suffix": "in", "length": 2, "cluster": "60", "prob": -4.619071960449219, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "'s", "id": 14, "lower": "'s", "norm": "'s", "shape": "'x", "prefix": "'", "suffix": "'s", "length": 2, "cluster": "52", "prob": -4.830559253692627, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "n't", "id": 15, "lower": "n't", "norm": "n't", "shape": "x'x", "prefix": "n", "suffix": "n't", "length": 3, "cluster": "74", "prob": -4.859938621520996, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "for", "id": 16, "lower": "for", "norm": "for", "shape": "xxx", "prefix": "f", "suffix": "for", "length": 3, "cluster": "508", "prob": -4.8801093101501465, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "\"", "id": 17, "lower": "\"", "norm": "\"", "shape": "\"", "prefix": "\"", "suffix": "\"", "length": 1, "cluster": "0", "prob": -5.02677583694458, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": true, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": true, "is_left_punct": true, "is_right_punct": true}
{"orth": "?", "id": 18, "lower": "?", "norm": "?", "shape": "?", "prefix": "?", "suffix": "?", "length": 1, "cluster": "0", "prob": -5.05924654006958, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": true, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": " ", "id": 0, "lower": " ", "norm": " ", "shape": " ", "prefix": " ", "suffix": " ", "length": 1, "cluster": "0", "prob": -5.129165172576904, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": true, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}

View File

@ -7,7 +7,7 @@ if __name__ == '__main__':
import plac
import sys
from spacy.cli import download, link, info, package, train, convert, model
from spacy.cli import profile, evaluate, validate
from spacy.cli import vocab, profile, evaluate, validate
from spacy.util import prints
commands = {
@ -19,6 +19,7 @@ if __name__ == '__main__':
'convert': convert,
'package': package,
'model': model,
'vocab': vocab,
'profile': profile,
'validate': validate
}

View File

@ -7,4 +7,5 @@ from .train import train
from .evaluate import evaluate
from .convert import convert
from .model import model
from .vocab import make_vocab as vocab
from .validate import validate

View File

@ -17,14 +17,14 @@ numpy.random.seed(0)
@plac.annotations(
model=("Model name or path", "positional", None, str),
data_path=("Location of JSON-formatted evaluation data", "positional",
model=("model name or path", "positional", None, str),
data_path=("location of JSON-formatted evaluation data", "positional",
None, str),
gold_preproc=("Use gold preprocessing", "flag", "G", bool),
gpu_id=("Use GPU", "option", "g", int),
displacy_path=("Directory to output rendered parses as HTML", "option",
gold_preproc=("use gold preprocessing", "flag", "G", bool),
gpu_id=("use GPU", "option", "g", int),
displacy_path=("directory to output rendered parses as HTML", "option",
"dp", str),
displacy_limit=("Limit of parses to render as HTML", "option", "dl", int))
displacy_limit=("limit of parses to render as HTML", "option", "dl", int))
def evaluate(cmd, model, data_path, gpu_id=-1, gold_preproc=False,
displacy_path=None, displacy_limit=25):
"""

View File

@ -16,10 +16,11 @@ from .. import about
input_dir=("directory with model data", "positional", None, str),
output_dir=("output parent directory", "positional", None, str),
meta_path=("path to meta.json", "option", "m", str),
create_meta=("create meta.json, even if one exists in directory", "flag",
"c", bool),
force=("force overwriting of existing folder in output directory", "flag",
"f", bool))
create_meta=("create meta.json, even if one exists in directory if "
"existing meta is found, entries are shown as defaults in "
"the command line prompt", "flag", "c", bool),
force=("force overwriting of existing model directory in output directory",
"flag", "f", bool))
def package(cmd, input_dir, output_dir, meta_path=None, create_meta=False,
force=False):
"""
@ -41,13 +42,13 @@ def package(cmd, input_dir, output_dir, meta_path=None, create_meta=False,
template_manifest = get_template('MANIFEST.in')
template_init = get_template('xx_model_name/__init__.py')
meta_path = meta_path or input_path / 'meta.json'
if not create_meta and meta_path.is_file():
prints(meta_path, title="Reading meta.json from file")
if meta_path.is_file():
meta = util.read_json(meta_path)
if not create_meta: # only print this if user doesn't want to overwrite
prints(meta_path, title="Loaded meta.json from file")
else:
meta = generate_meta(input_dir)
meta = generate_meta(input_dir, meta)
meta = validate_meta(meta, ['lang', 'name', 'version'])
model_name = meta['lang'] + '_' + meta['name']
model_name_v = model_name + '-' + meta['version']
main_path = output_path / model_name_v
@ -82,18 +83,19 @@ def create_file(file_path, contents):
file_path.open('w', encoding='utf-8').write(contents)
def generate_meta(model_path):
meta = {}
settings = [('lang', 'Model language', 'en'),
('name', 'Model name', 'model'),
('version', 'Model version', '0.0.0'),
def generate_meta(model_path, existing_meta):
meta = existing_meta or {}
settings = [('lang', 'Model language', meta.get('lang', 'en')),
('name', 'Model name', meta.get('name', 'model')),
('version', 'Model version', meta.get('version', '0.0.0')),
('spacy_version', 'Required spaCy version',
'>=%s,<3.0.0' % about.__version__),
('description', 'Model description', False),
('author', 'Author', False),
('email', 'Author email', False),
('url', 'Author website', False),
('license', 'License', 'CC BY-NC 3.0')]
('description', 'Model description',
meta.get('description', False)),
('author', 'Author', meta.get('author', False)),
('email', 'Author email', meta.get('email', False)),
('url', 'Author website', meta.get('url', False)),
('license', 'License', meta.get('license', 'CC BY-SA 3.0'))]
nlp = util.load_model_from_path(Path(model_path))
meta['pipeline'] = nlp.pipe_names
meta['vectors'] = {'width': nlp.vocab.vectors_length,

View File

@ -32,6 +32,7 @@ numpy.random.seed(0)
n_sents=("number of sentences", "option", "ns", int),
use_gpu=("Use GPU", "option", "g", int),
vectors=("Model to load vectors from", "option", "v"),
vectors_limit=("Truncate to N vectors (requires -v)", "option", None, int),
no_tagger=("Don't train tagger", "flag", "T", bool),
no_parser=("Don't train parser", "flag", "P", bool),
no_entities=("Don't train NER", "flag", "N", bool),
@ -40,9 +41,9 @@ numpy.random.seed(0)
meta_path=("Optional path to meta.json. All relevant properties will be "
"overwritten.", "option", "m", Path))
def train(cmd, lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
use_gpu=-1, vectors=None, no_tagger=False, no_parser=False,
no_entities=False, gold_preproc=False, version="0.0.0",
meta_path=None):
use_gpu=-1, vectors=None, vectors_limit=None, no_tagger=False,
no_parser=False, no_entities=False, gold_preproc=False,
version="0.0.0", meta_path=None):
"""
Train a model. Expects data in spaCy's JSON format.
"""
@ -95,10 +96,7 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
if vectors:
util.load_model(vectors, vocab=nlp.vocab)
if vectors_limit is not None:
remap = nlp.vocab.prune_vectors(vectors_limit)
print('remap', len(remap))
for key, (value, sim) in remap.items():
print(repr(key), repr(value), sim)
nlp.vocab.prune_vectors(vectors_limit)
for name in pipeline:
nlp.add_pipe(nlp.create_pipe(name), name=name)
optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu)

54
spacy/cli/vocab.py Normal file
View File

@ -0,0 +1,54 @@
# coding: utf8
from __future__ import unicode_literals
import plac
import json
import spacy
import numpy
from pathlib import Path
from ..util import prints, ensure_path
@plac.annotations(
    lang=("model language", "positional", None, str),
    output_dir=("model output directory", "positional", None, Path),
    lexemes_loc=("location of JSONL-formatted lexical data", "positional",
                 None, Path),
    vectors_loc=("optional: location of vectors data, as numpy .npz",
                 "positional", None, str))
def make_vocab(cmd, lang, output_dir, lexemes_loc, vectors_loc=None):
    """Compile a vocabulary from a lexicon jsonl file and word vectors.

    cmd: The command name passed in by the CLI dispatcher (unused here).
    lang (unicode): Language code used to create the blank pipeline.
    output_dir (Path): Directory the compiled model is written to. It is
        created, including missing parents, if it does not exist.
    lexemes_loc (Path): JSONL file with one JSON object per line. An object
        with a 'settings' key updates `nlp.vocab.cfg`; every other object
        describes one lexeme and must have 'orth' and 'id' keys.
    vectors_loc (unicode / Path / None): Optional file holding a 2d array
        of vectors, indexed by the lexeme's rank.
    RETURNS: The `nlp` object whose vocab was populated and saved.
    """
    if not lexemes_loc.exists():
        prints(lexemes_loc, title="Can't find lexical data", exits=1)
    vectors_loc = ensure_path(vectors_loc)
    # Fail early, and with the same kind of message as for the lexicon,
    # instead of raising a bare IOError after the vocab has been built.
    if vectors_loc is not None and not vectors_loc.exists():
        prints(vectors_loc, title="Can't find vectors data", exits=1)
    nlp = spacy.blank(lang)
    # Reset the rank of every pre-existing lexeme so that only entries from
    # the lexicon file end up with a non-zero rank (see the vectors loop).
    for word in nlp.vocab:
        word.rank = 0
    lex_added = 0
    vec_added = 0
    # The lexicon is JSON, so read it as UTF-8 rather than relying on the
    # locale's default encoding.
    with lexemes_loc.open('r', encoding='utf8') as file_:
        for line in file_:
            if not line.strip():
                continue
            attrs = json.loads(line)
            if 'settings' in attrs:
                nlp.vocab.cfg.update(attrs['settings'])
            else:
                lex = nlp.vocab[attrs['orth']]
                lex.set_attrs(**attrs)
                assert lex.rank == attrs['id']
                lex_added += 1
    if vectors_loc is not None:
        # Load inside a context manager so the file handle is closed.
        # NOTE(review): the CLI help text says ".npz", but the code below
        # needs an array with a `.shape`, i.e. what numpy.load returns for
        # a plain ".npy" file -- confirm the intended format.
        with vectors_loc.open('rb') as file_:
            vector_data = numpy.load(file_)
        nlp.vocab.clear_vectors(width=vector_data.shape[1])
        for word in nlp.vocab:
            # Rank 0 is skipped: it marks lexemes not set from the lexicon.
            if word.rank:
                nlp.vocab.vectors.add(word.orth_, row=word.rank,
                                      vector=vector_data[word.rank])
                vec_added += 1
    if not output_dir.exists():
        output_dir.mkdir(parents=True)
    nlp.to_disk(output_dir)
    prints("{} entries, {} vectors".format(lex_added, vec_added), output_dir,
           title="Successfully compiled vocab and vectors, and saved model")
    return nlp

View File

@ -300,5 +300,15 @@ GLOSSARY = {
'MONEY': 'Monetary values, including unit',
'QUANTITY': 'Measurements, as of weight or distance',
'ORDINAL': '"first", "second", etc.',
'CARDINAL': 'Numerals that do not fall under another type'
'CARDINAL': 'Numerals that do not fall under another type',
# Named Entity Recognition
# Wikipedia
# http://www.sciencedirect.com/science/article/pii/S0004370212000276
# https://pdfs.semanticscholar.org/5744/578cc243d92287f47448870bb426c66cc941.pdf
'PER': 'Named person or family.',
'MISC': ('Miscellaneous entities, e.g. events, nationalities, '
'products or works of art'),
}

View File

@ -154,6 +154,8 @@ class Language(object):
self._meta.setdefault('email', '')
self._meta.setdefault('url', '')
self._meta.setdefault('license', '')
self._meta['vectors'] = {'width': self.vocab.vectors_length,
'entries': len(self.vocab.vectors)}
self._meta['pipeline'] = self.pipe_names
return self._meta

View File

@ -13,6 +13,8 @@ from .typedefs cimport attr_t, flags_t
from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
from .attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT, IS_OOV
from .attrs cimport PROB
from .attrs import intify_attrs
from . import about
@ -68,6 +70,19 @@ cdef class Lexeme:
def __hash__(self):
return self.c.orth
def set_attrs(self, **attrs):
    """Set several attributes of this lexeme in one call.

    **attrs: Attribute values keyed by attribute name or ID. The keys are
        normalised with `intify_attrs` before being applied.
    """
    cdef attr_id_t attr_id
    for attr_id, val in intify_attrs(attrs).items():
        if attr_id == PROB:
            # Written straight to the struct field.
            self.c.prob = val
        elif attr_id == CLUSTER:
            self.c.cluster = int(val)
        elif isinstance(val, (int, long)):
            Lexeme.set_struct_attr(self.c, attr_id, val)
        else:
            # Any other value is added to the string store, and the result
            # of `strings.add` is what gets stored on the struct.
            Lexeme.set_struct_attr(self.c, attr_id, self.vocab.strings.add(val))
def set_flag(self, attr_id_t flag_id, bint value):
"""Change the value of a boolean flag.

View File

@ -209,7 +209,7 @@ def test_doc_api_right_edge(en_tokenizer):
def test_doc_api_has_vector():
vocab = Vocab()
vocab.clear_vectors(2)
vocab.vectors.add('kitten', numpy.asarray([0., 2.], dtype='f'))
vocab.vectors.add('kitten', vector=numpy.asarray([0., 2.], dtype='f'))
doc = Doc(vocab, words=['kitten'])
assert doc.has_vector

View File

@ -73,8 +73,8 @@ def test_doc_token_api_is_properties(en_vocab):
def test_doc_token_api_vectors():
vocab = Vocab()
vocab.clear_vectors(2)
vocab.vectors.add('apples', numpy.asarray([0., 2.], dtype='f'))
vocab.vectors.add('oranges', numpy.asarray([0., 1.], dtype='f'))
vocab.vectors.add('apples', vector=numpy.asarray([0., 2.], dtype='f'))
vocab.vectors.add('oranges', vector=numpy.asarray([0., 1.], dtype='f'))
doc = Doc(vocab, words=['apples', 'oranges', 'oov'])
assert doc.has_vector

View File

@ -21,8 +21,10 @@ cdef class Vectors:
Vectors data is kept in the vectors.data attribute, which should be an
instance of numpy.ndarray (for CPU vectors) or cupy.ndarray
(for GPU vectors). `vectors.key2row` is a dictionary mapping word hashes to
rows in the vectors.data table. The array `vectors.keys` keeps the keys in
order, such that `keys[vectors.key2row[key]] == key`.
rows in the vectors.data table.
Multiple keys can be mapped to the same vector, so len(keys) may be greater
(but not smaller) than data.shape[0].
"""
cdef public object data
cdef readonly StringStore strings
@ -101,7 +103,7 @@ cdef class Vectors:
RETURNS (int): The number of vectors in the data.
"""
return self.i
return self._i_vec
def __contains__(self, key):
"""Check whether a key has a vector entry in the table.
@ -113,11 +115,13 @@ cdef class Vectors:
key = self.strings[key]
return key in self.key2row
def add(self, key, vector=None):
"""Add a key to the table, optionally setting a vector value as well.
def add(self, key, *, vector=None, row=None):
"""Add a key to the table. Keys can be mapped to an existing vector
by setting `row`, or a new vector can be added.
key (unicode / int): The key to add.
vector (numpy.ndarray): An optional vector to add.
vector (numpy.ndarray / None): A vector to add for the key.
row (int / None): The row-number of a vector to map the key to.
"""
if isinstance(key, basestring_):
key = self.strings.add(key)
@ -131,8 +135,8 @@ cdef class Vectors:
self.key2row[key] = row
if vector is not None:
self.data[i] = vector
return i
self.data[row] = vector
return row
def items(self):
"""Iterate over `(string key, vector)` pairs, in order.

View File

@ -32,6 +32,7 @@ cdef class Vocab:
cdef readonly int length
cdef public object data_dir
cdef public object lex_attr_getters
cdef public object cfg
cdef const LexemeC* get(self, Pool mem, unicode string) except NULL
cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL

View File

@ -5,6 +5,7 @@ import numpy
import dill
from collections import OrderedDict
from thinc.neural.util import get_array_module
from .lexeme cimport EMPTY_LEXEME
from .lexeme cimport Lexeme
from .strings cimport hash_string
@ -27,7 +28,7 @@ cdef class Vocab:
C-data that is shared between `Doc` objects.
"""
def __init__(self, lex_attr_getters=None, tag_map=None, lemmatizer=None,
strings=tuple(), **deprecated_kwargs):
strings=tuple(), oov_prob=-20., **deprecated_kwargs):
"""Create the vocabulary.
lex_attr_getters (dict): A dictionary mapping attribute IDs to
@ -43,6 +44,7 @@ cdef class Vocab:
tag_map = tag_map if tag_map is not None else {}
if lemmatizer in (None, True, False):
lemmatizer = Lemmatizer({}, {}, {})
self.cfg = {'oov_prob': oov_prob}
self.mem = Pool()
self._by_hash = PreshMap()
self._by_orth = PreshMap()
@ -239,7 +241,7 @@ cdef class Vocab:
def vectors_length(self):
return self.vectors.data.shape[1]
def clear_vectors(self, new_dim=None):
def clear_vectors(self, width=None):
"""Drop the current vector table. Because all vectors must be the same
width, you have to call this to change the size of the vectors.
"""
@ -283,16 +285,14 @@ cdef class Vocab:
keep = xp.ascontiguousarray(keep.T)
neighbours = xp.zeros((toss.shape[0],), dtype='i')
scores = xp.zeros((toss.shape[0],), dtype='f')
for i in range(0, toss.shape[0]//2, batch_size):
for i in range(0, toss.shape[0], batch_size):
batch = toss[i : i+batch_size]
batch /= xp.linalg.norm(batch, axis=1, keepdims=True)+1e-8
sims = xp.dot(batch, keep)
matches = sims.argmax(axis=1)
neighbours[i:i+batch_size] = matches
scores[i:i+batch_size] = sims.max(axis=1)
i2k = {i: key for key, i in self.vectors.key2row.items()}
remap = {}
for lex in list(self):
for lex in self:
# If we're losing the vector for this word, map it to the nearest
# vector we're keeping.
if lex.rank >= nr_row:

View File

@ -41,9 +41,6 @@
- var comps = path.split('#');
- return "top-level#" + comps[0] + '.' + comps[1];
- }
- else if (path.startsWith('cli#')) {
- return "top-level#" + path.split('#')[1];
- }
- return path;
- }

View File

@ -1,244 +0,0 @@
//- 💫 MIXINS > BASE
//- Section
id - [string] anchor assigned to section (used for breadcrumb navigation)
mixin section(id)
section.o-section(id="section-" + id data-section=id)
block
//- Aside wrapper
label - [string] aside label
mixin aside-wrapper(label)
aside.c-aside
.c-aside__content(role="complementary")&attributes(attributes)
if label
h4.u-text-label.u-text-label--dark=label
block
//- SVG from map (uses embedded SVG sprite)
name - [string] SVG symbol id
width - [integer] width in px
height - [integer] height in px (default: same as width)
mixin svg(name, width, height)
svg(aria-hidden="true" viewBox="0 0 #{width} #{height || width}" width=width height=(height || width))&attributes(attributes)
use(xlink:href="#svg_#{name}")
//- Icon
name - [string] icon name (will be used as symbol id: #svg_{name})
width - [integer] icon width (default: 20)
height - [integer] icon height (defaults to width)
mixin icon(name, width, height)
- var width = width || 20
- var height = height || width
+svg(name, width, height).o-icon(style="min-width: #{width}px")&attributes(attributes)
//- Pro/Con/Neutral icon
icon - [string] "yes", "no" or "neutral" (default: "neutral")
size - [integer] icon size (optional)
mixin procon(icon, label, show_label, size)
- var colors = { yes: "green", no: "red", neutral: "subtle" }
span.u-nowrap
+icon(icon, size || 20)(class="u-color-#{colors[icon] || 'subtle'}").o-icon--inline&attributes(attributes)
span.u-text-small(class=show_label ? null : "u-hidden")=(label || icon)
//- Headlines Helper Mixin
level - [integer] 1, 2, 3, 4, or 5
mixin headline(level)
if level == 1
h1.u-heading-1&attributes(attributes)
block
else if level == 2
h2.u-heading-2&attributes(attributes)
block
else if level == 3
h3.u-heading-3&attributes(attributes)
block
else if level == 4
h4.u-heading-4&attributes(attributes)
block
else if level == 5
h5.u-heading-5&attributes(attributes)
block
//- Permalink rendering
id - [string] permalink ID used for link anchor
mixin permalink(id)
if id
a.u-permalink(href="##{id}")
block
else
block
//- Quickstart widget
quickstart.js with manual markup, inspired by PyTorch's "Getting started"
groups - [object] option groups, uses global variable QUICKSTART
headline - [string] optional text to be rendered as widget headline
mixin quickstart(groups, headline, description, hide_results)
.c-quickstart.o-block-small#qs
.c-quickstart__content
if headline
+h(2)=headline
if description
p=description
for group in groups
.c-quickstart__group.u-text-small(data-qs-group=group.id)
if group.title
.c-quickstart__legend=group.title
if group.help
| #[+help(group.help)]
.c-quickstart__fields
for option in group.options
input.c-quickstart__input(class="c-quickstart__input--" + (group.input_style ? group.input_style : group.multiple ? "check" : "radio") type=group.multiple ? "checkbox" : "radio" name=group.id id="qs-#{option.id}" value=option.id checked=option.checked)
label.c-quickstart__label.u-text-tiny(for="qs-#{option.id}")!=option.title
if option.meta
| #[span.c-quickstart__label__meta (#{option.meta})]
if option.help
| #[+help(option.help)]
if hide_results
block
else
pre.c-code-block
code.c-code-block__content.c-quickstart__code(data-qs-results="")
block
//- Quickstart code item
data - [object] Rendering conditions (keyed by option group ID, value: option)
style - [string] modifier ID for line style
mixin qs(data, style)
- args = {}
for value, setting in data
- args['data-qs-' + setting] = value
span.c-quickstart__line(class="c-quickstart__line--#{style || 'bash'}")&attributes(args)
block
//- Terminal-style code window
label - [string] title displayed in top bar of terminal window
mixin terminal(label)
.x-terminal
.x-terminal__icons: span
.u-padding-small.u-text-label.u-text-center=label
+code.x-terminal__code
block
//- Chart.js
id - [string] chart ID, will be assigned as #chart_{id}
mixin chart(id, height)
figure.o-block&attributes(attributes)
canvas(id="chart_#{id}" width="800" height=(height || "400") style="max-width: 100%")
//- Gitter chat button and widget
button - [string] text shown on button
label - [string] title of chat window (default: same as button)
mixin gitter(button, label)
aside.js-gitter.c-chat.is-collapsed(data-title=(label || button))
button.js-gitter-button.c-chat__button.u-text-tag
+icon("chat", 16).o-icon--inline
!=button
//- Badge
image - [string] path to badge image
url - [string] badge link
mixin badge(image, url)
+a(url).u-padding-small.u-hide-link&attributes(attributes)
img.o-badge(src=image alt=url height="20")
//- spaCy logo
mixin logo()
+svg("spacy", 675, 215).o-logo&attributes(attributes)
//- Landing
mixin landing-header()
header.c-landing
.c-landing__wrapper
.c-landing__content
block
mixin landing-banner(headline, label)
.c-landing__banner.u-padding.o-block.u-color-light
+grid.c-landing__banner__content.o-no-block
+grid-col("third")
h3.u-heading.u-heading-1
if label
div
span.u-text-label.u-text-label--light=label
!=headline
+grid-col("two-thirds").c-landing__banner__text
block
mixin landing-logos(title, logos)
.o-content.u-text-center&attributes(attributes)
h3.u-heading.u-text-label.u-color-dark=title
each row, i in logos
- var is_last = i == logos.length - 1
+grid("center").o-inline-list.o-no-block(class=is_last ? "o-no-block" : null)
each details, name in row
+a(details[0]).u-padding-medium
+icon(name, details[1], details[2])
if is_last
block
//- Under construction (temporary)
Marks sections that still need to be completed for the v2.0 release.
mixin under-construction()
+infobox("Under construction", "🚧")
| This section is still being written and will be updated for the v2.0
| release. Is there anything that you think should definitely be mentioned or
| explained here? Any examples you'd like to see? #[strong Let us know]
| on the #[+a(gh("spacy") + "/issues/1105") v2.0 alpha thread] on GitHub!
//- Alpha infobox (temporary)
Added in the templates to notify user that they're visiting the alpha site.
mixin alpha-info()
+infobox("You are viewing the spaCy v2.0.0 alpha docs", "⚠️")
strong This page is part of the alpha documentation for spaCy v2.0.
| It does not reflect the state of the latest stable release.
| Because v2.0 is still under development, the implementation
| may differ from the intended state described here. See the
| #[+a(gh("spaCy") + "/releases/tag/v2.0.0-alpha") release notes]
| for details on how to install and test the new version. To
| read the official docs for spaCy v1.x,
| #[+a("https://spacy.io/docs") go here].

View File

@ -1,7 +1,39 @@
//- 💫 INCLUDES > MIXINS
include _functions
include _mixins-base
//- Section
id - [string] anchor assigned to section (used for breadcrumb navigation)
mixin section(id)
section.o-section(id="section-" + id data-section=id)
block
//- Headlines Helper Mixin
level - [integer] 1, 2, 3, 4, or 5
mixin headline(level)
if level == 1
h1.u-heading-1&attributes(attributes)
block
else if level == 2
h2.u-heading-2&attributes(attributes)
block
else if level == 3
h3.u-heading-3&attributes(attributes)
block
else if level == 4
h4.u-heading-4&attributes(attributes)
block
else if level == 5
h5.u-heading-5&attributes(attributes)
block
//- Headlines
@ -18,6 +50,18 @@ mixin h(level, id, source)
span Source #[+icon("code", 14).o-icon--inline]
//- Permalink rendering
id - [string] permalink ID used for link anchor
mixin permalink(id)
if id
a.u-permalink(href="##{id}")
block
else
block
//- External links
url - [string] link href
trusted - [boolean] if not set / false, rel="noopener nofollow" is added
@ -63,6 +107,18 @@ mixin help(tooltip, icon_size)
+icon("help_o", icon_size || 16).o-icon--inline
//- Aside wrapper
label - [string] aside label
mixin aside-wrapper(label)
aside.c-aside
.c-aside__content(role="complementary")&attributes(attributes)
if label
h4.u-text-label.u-text-label--dark=label
block
//- Aside for text
label - [string] aside title (optional)
@ -112,6 +168,37 @@ mixin infobox-logos(...logos)
| #[+icon(logo[0], logo[1], logo[2]).u-color-dark]
//- SVG from map (uses embedded SVG sprite)
name - [string] SVG symbol id
width - [integer] width in px
height - [integer] height in px (default: same as width)
//- The viewBox is "0 0 {width} {height || width}" and the symbol is
//- referenced via <use> as "#svg_{name}". The element is marked
//- aria-hidden; attributes passed to the mixin call land on the svg tag.
mixin svg(name, width, height)
svg(aria-hidden="true" viewBox="0 0 #{width} #{height || width}" width=width height=(height || width))&attributes(attributes)
use(xlink:href="#svg_#{name}")
//- Icon
name - [string] icon name (will be used as symbol id: #svg_{name})
width - [integer] icon width (default: 20)
height - [integer] icon height (defaults to width)
//- Resolves the defaults, then delegates to +svg with class o-icon. An
//- inline min-width equal to the resolved width is set on the svg.
mixin icon(name, width, height)
- var width = width || 20
- var height = height || width
+svg(name, width, height).o-icon(style="min-width: #{width}px")&attributes(attributes)
//- Pro/Con/Neutral icon
icon - [string] "pro", "con" or "neutral" (default: "neutral")
size - [integer] icon size (optional)
//- label - [string] text for the label span (default: the icon name)
//- show_label - [boolean] if falsy, the label span gets the u-hidden class
//- The icon size falls back to 20. The color class is looked up by icon
//- name and falls back to "subtle" for names missing from the map.
//- NOTE(review): the color map is keyed by "yes" / "no" / "neutral", while
//- the description above lists "pro" / "con" / "neutral" - confirm which
//- names callers actually pass.
mixin procon(icon, label, show_label, size)
- var colors = { yes: "green", no: "red", neutral: "subtle" }
span.u-nowrap
+icon(icon, size || 20)(class="u-color-#{colors[icon] || 'subtle'}").o-icon--inline&attributes(attributes)
span.u-text-small(class=show_label ? null : "u-hidden")=(label || icon)
//- Link button
url - [string] link href
@ -238,6 +325,14 @@ mixin graphic(original)
+button(original, false, "secondary", "small") View large graphic
//- Chart.js
id - [string] chart ID, will be assigned as #chart_{id}
//- height - canvas height attribute (default: "400")
//- The canvas width attribute is fixed at "800" with max-width: 100%.
//- Attributes passed to the mixin call are applied to the figure.
mixin chart(id, height)
figure.o-block&attributes(attributes)
canvas(id="chart_#{id}" width="800" height=(height || "400") style="max-width: 100%")
//- Labels
mixin label()
@ -353,8 +448,8 @@ mixin grid(...style)
width - [string] "quarter", "third", "half", "two-thirds", "three-quarters"
see $grid in assets/css/_variables.sass
mixin grid-col(width)
.o-grid__col(class="o-grid__col--#{width}")&attributes(attributes)
mixin grid-col(...style)
.o-grid__col(class=prefixArgs(style, "o-grid__col"))&attributes(attributes)
block
@ -445,3 +540,137 @@ mixin annotation-row(annots, style)
else
+cell=cell
block
//- spaCy logo
//- Renders the "spacy" SVG symbol at 675x215 with class o-logo; attributes
//- passed to the mixin call are forwarded to +svg.
mixin logo()
+svg("spacy", 675, 215).o-logo&attributes(attributes)
//- Gitter chat button and widget
button - [string] text shown on button
label - [string] title of chat window (default: same as button)
//- The aside is rendered with the is-collapsed class and carries the title
//- in data-title. The button text is written with != (unescaped), so any
//- markup in it is output verbatim.
mixin gitter(button, label)
aside.js-gitter.c-chat.is-collapsed(data-title=(label || button))
button.js-gitter-button.c-chat__button.u-text-tag
+icon("chat", 16).o-icon--inline
!=button
//- Badge
image - [string] path to badge image
url - [string] badge link
//- The url doubles as the image's alt text; the image height is fixed at
//- "20". Attributes passed to the mixin call are applied to the link.
mixin badge(image, url)
+a(url).u-padding-small.u-hide-link&attributes(attributes)
img.o-badge(src=image alt=url height="20")
//- Quickstart widget
quickstart.js with manual markup, inspired by PyTorch's "Getting started"
groups - [object] option groups, uses global variable QUICKSTART
headline - [string] optional text to be rendered as widget headline
//- description - [string] optional text rendered as a paragraph
//- hide_results - [boolean] if truthy, the block content is rendered
//- directly instead of inside the pre/code element
//- carrying data-qs-results
//- Each group renders one input per option: a checkbox if group.multiple is
//- set, otherwise a radio button. group.input_style, if set, overrides the
//- "check" / "radio" modifier of the input's class. Option titles are
//- written with != (unescaped). Group and option help texts are rendered
//- via +help.
mixin quickstart(groups, headline, description, hide_results)
.c-quickstart.o-block-small#qs
.c-quickstart__content
if headline
+h(2)=headline
if description
p=description
for group in groups
.c-quickstart__group.u-text-small(data-qs-group=group.id)
if group.title
.c-quickstart__legend=group.title
if group.help
| #[+help(group.help)]
.c-quickstart__fields
for option in group.options
input.c-quickstart__input(class="c-quickstart__input--" + (group.input_style ? group.input_style : group.multiple ? "check" : "radio") type=group.multiple ? "checkbox" : "radio" name=group.id id="qs-#{option.id}" value=option.id checked=option.checked)
label.c-quickstart__label.u-text-tiny(for="qs-#{option.id}")!=option.title
if option.meta
| #[span.c-quickstart__label__meta (#{option.meta})]
if option.help
| #[+help(option.help)]
if hide_results
block
else
pre.c-code-block
code.c-code-block__content.c-quickstart__code(data-qs-results="")
block
//- Quickstart code item
data - [object] Rendering conditions (keyed by option group ID, value: option)
style - [string] modifier ID for line style
//- Every key/value pair in data becomes a data-qs-{key}="{value}" attribute
//- on the line span. The style modifier defaults to "bash".
mixin qs(data, style)
- args = {}
for value, setting in data
- args['data-qs-' + setting] = value
span.c-quickstart__line(class="c-quickstart__line--#{style || 'bash'}")&attributes(args)
block
//- Terminal-style code window
label - [string] title displayed in top bar of terminal window
//- The block content is rendered through +code with the extra class
//- x-terminal__code.
mixin terminal(label)
.x-terminal
.x-terminal__icons: span
.u-padding-small.u-text-label.u-text-center=label
+code.x-terminal__code
block
//- Landing
//- Landing page header: the block content is rendered inside the
//- header.c-landing / .c-landing__wrapper / .c-landing__content elements.
mixin landing-header()
header.c-landing
.c-landing__wrapper
.c-landing__content
block
//- Landing page banner
//- headline - banner headline, written with != (unescaped)
//- label - optional label text, only rendered if given
//- The headline goes into a "third" grid column; the block content goes
//- into the "two-thirds" column (.c-landing__banner__text).
mixin landing-banner(headline, label)
.c-landing__banner.u-padding.o-block.u-color-light
+grid.c-landing__banner__content.o-no-block
+grid-col("third")
h3.u-heading.u-heading-1
if label
div
span.u-text-label.u-text-label--light=label
!=headline
+grid-col("two-thirds").c-landing__banner__text
block
//- Landing page logo rows
//- title - heading text shown above the logos
//- logos - collection of rows; each row maps an icon name to a details
//- list: details[0] is used as the link URL, details[1] and
//- details[2] are passed to +icon as width and height
//- One centered grid is rendered per row. The block content is only
//- rendered for the last row. Attributes passed to the mixin call are
//- applied to the outer .o-content element.
//- NOTE(review): o-no-block is set on every row's grid and then added again
//- for the last row - confirm whether it was meant for the last row only.
mixin landing-logos(title, logos)
.o-content.u-text-center&attributes(attributes)
h3.u-heading.u-text-label.u-color-dark=title
each row, i in logos
- var is_last = i == logos.length - 1
+grid("center").o-inline-list.o-no-block(class=is_last ? "o-no-block" : null)
each details, name in row
+a(details[0]).u-padding-medium
+icon(name, details[1], details[2])
if is_last
block
//- Under construction (temporary)
Marks sections that still need to be completed for the v2.0 release.
//- Takes no arguments. Renders an infobox titled "Under construction" that
//- links to the v2.0 alpha thread (issue 1105) on GitHub.
mixin under-construction()
+infobox("Under construction", "🚧")
| This section is still being written and will be updated for the v2.0
| release. Is there anything that you think should definitely be mentioned
| or explained here? Any examples you'd like to see? #[strong Let us know]
| on the #[+a(gh("spacy") + "/issues/1105") v2.0 alpha thread] on GitHub!

View File

@ -25,9 +25,6 @@ main.o-main.o-main--sidebar.o-main--aside
+button(gh("spacy", source), false, "secondary", "small").u-nowrap
| Source #[+icon("code", 14)]
//-if ALPHA
//- +alpha-info
if IS_MODELS
include _page_models
else

View File

@ -62,6 +62,9 @@ svg(style="position: absolute; visibility: hidden; width: 0; height: 0;" width="
symbol#svg_explosion(viewBox="0 0 500 500")
path(fill="currentColor" d="M111.7 74.9L91.2 93.1l9.1 10.2 17.8-15.8 7.4 8.4-17.8 15.8 10.1 11.4 20.6-18.2 7.7 8.7-30.4 26.9-41.9-47.3 30.3-26.9 7.6 8.6zM190.8 59.6L219 84.3l-14.4 4.5-20.4-18.2-6.4 26.6-14.4 4.5 8.9-36.4-26.9-24.1 14.3-4.5L179 54.2l5.7-25.2 14.3-4.5-8.2 35.1zM250.1 21.2l27.1 3.4c6.1.8 10.8 3.1 14 7.2 3.2 4.1 4.5 9.2 3.7 15.5-.8 6.3-3.2 11-7.4 14.1-4.1 3.1-9.2 4.3-15.3 3.5L258 63.2l-2.8 22.3-13-1.6 7.9-62.7zm11.5 13l-2.2 17.5 12.6 1.6c5.1.6 9.1-2 9.8-7.6.7-5.6-2.5-9.2-7.6-9.9l-12.6-1.6zM329.1 95.4l23.8 13.8-5.8 10L312 98.8l31.8-54.6 11.3 6.6-26 44.6zM440.5 145c-1.3 8.4-5.9 15.4-13.9 21.1s-16.2 7.7-24.6 6.1c-8.4-1.6-15.3-6.3-20.8-14.1-5.5-7.9-7.6-16-6.4-24.4 1.3-8.5 6-15.5 14-21.1 8-5.6 16.2-7.7 24.5-6 8.4 1.6 15.4 6.3 20.9 14.2 5.5 7.6 7.6 15.7 6.3 24.2zM412 119c-5.1-.8-10.3.6-15.6 4.4-5.2 3.7-8.4 8.1-9.4 13.2-1 5.2.2 10.1 3.5 14.8 3.4 4.8 7.5 7.5 12.7 8.2 5.2.8 10.4-.7 15.6-4.4 5.3-3.7 8.4-8.1 9.4-13.2 1.1-5.1-.1-9.9-3.4-14.7-3.4-4.8-7.6-7.6-12.8-8.3zM471.5 237.9c-2.8 4.8-7.1 7.6-13 8.7l-2.6-13.1c5.3-.9 8.1-5 7.2-11-.9-5.8-4.3-8.8-8.9-8.2-2.3.3-3.7 1.4-4.5 3.3-.7 1.9-1.4 5.2-1.7 10.1-.8 7.5-2.2 13.1-4.3 16.9-2.1 3.9-5.7 6.2-10.9 7-6.3.9-11.3-.5-15.2-4.4-3.9-3.8-6.3-9-7.3-15.7-1.1-7.4-.2-13.7 2.6-18.8 2.8-5.1 7.4-8.2 13.7-9.2l2.6 13c-5.6 1.1-8.7 6.6-7.7 13.4 1 6.6 3.9 9.5 8.6 8.8 4.4-.7 5.7-4.5 6.7-14.1.3-3.5.7-6.2 1.1-8.4.4-2.2 1.2-4.4 2.2-6.8 2.1-4.7 6-7.2 11.8-8.1 5.4-.8 10.3.4 14.5 3.7 4.2 3.3 6.9 8.5 8 15.6.9 6.9-.1 12.6-2.9 17.3zM408.6 293.5l2.4-12.9 62 11.7-2.4 12.9-62-11.7zM419.6 396.9c-8.3 2-16.5.3-24.8-5-8.2-5.3-13.2-12.1-14.9-20.5-1.6-8.4.1-16.6 5.3-24.6 5.2-8.1 11.9-13.1 20.2-15.1 8.4-1.9 16.6-.3 24.9 5 8.2 5.3 13.2 12.1 14.8 20.5 1.7 8.4 0 16.6-5.2 24.7-5.2 8-12 13-20.3 15zm13.4-36.3c-1.2-5.1-4.5-9.3-9.9-12.8s-10.6-4.7-15.8-3.7-9.3 4-12.4 8.9-4.1 9.8-2.8 14.8c1.2 5.1 4.5 9.3 9.9 12.8 5.5 3.5 10.7 4.8 15.8 3.7 5.1-.9 9.2-3.8 12.3-8.7s4.1-9.9 2.9-15zM303.6 416.5l9.6-5.4 43.3 20.4-19.2-34 11.4-6.4 31 55-9.6 5.4-43.4-20.5 19.2 34.1-11.3 
6.4-31-55zM238.2 468.8c-49 0-96.9-17.4-134.8-49-38.3-32-64-76.7-72.5-125.9-2-11.9-3.1-24-3.1-35.9 0-36.5 9.6-72.6 27.9-104.4 2.1-3.6 6.7-4.9 10.3-2.8 3.6 2.1 4.9 6.7 2.8 10.3-16.9 29.5-25.9 63.1-25.9 96.9 0 11.1 1 22.3 2.9 33.4 7.9 45.7 31.8 87.2 67.3 116.9 35.2 29.3 79.6 45.5 125.1 45.5 11.1 0 22.3-1 33.4-2.9 4.1-.7 8 2 8.7 6.1.7 4.1-2 8-6.1 8.7-11.9 2-24 3.1-36 3.1z")
symbol#svg_prodigy(viewBox="0 0 538.5 157.6")
path(fill="currentColor" d="M70.6 48.6c7 7.3 10.5 17.1 10.5 29.2S77.7 99.7 70.6 107c-6.9 7.3-15.9 11.1-27 11.1-9.4 0-16.8-2.7-21.7-8.2v44.8H0V39h20.7v8.1c4.8-6.4 12.4-9.6 22.9-9.6 11.1 0 20.1 3.7 27 11.1zM21.9 76v3.6c0 12.1 7.3 19.8 18.3 19.8 11.2 0 18.7-7.9 18.7-21.6s-7.5-21.6-18.7-21.6c-11 0-18.3 7.7-18.3 19.8zM133.8 59.4c-12.6 0-20.5 7-20.5 17.8v39.3h-22V39h21.1v8.8c4-6.4 11.2-9.6 21.3-9.6v21.2zM209.5 107.1c-7.6 7.3-17.5 11.1-29.5 11.1s-21.9-3.8-29.7-11.1c-7.6-7.5-11.5-17.2-11.5-29.2 0-12.1 3.9-21.9 11.5-29.2 7.8-7.3 17.7-11.1 29.7-11.1s21.9 3.8 29.5 11.1c7.8 7.3 11.7 17.1 11.7 29.2 0 11.9-3.9 21.7-11.7 29.2zM180 56.2c-5.7 0-10.3 1.9-13.8 5.8-3.5 3.8-5.2 9-5.2 15.7 0 6.7 1.8 12 5.2 15.7 3.4 3.8 8.1 5.7 13.8 5.7s10.3-1.9 13.8-5.7 5.2-9 5.2-15.7c0-6.8-1.8-12-5.2-15.7-3.5-3.8-8.1-5.8-13.8-5.8zM313 116.5h-20.5v-7.9c-4.4 5.5-12.7 9.6-23.1 9.6-10.9 0-19.9-3.8-27-11.1C235.5 99.7 232 90 232 77.8s3.5-21.9 10.3-29.2c7-7.3 16-11.1 27-11.1 9.7 0 17.1 2.7 21.9 8.2V0H313v116.5zm-58.8-38.7c0 13.6 7.5 21.4 18.7 21.4 10.9 0 18.3-7.3 18.3-19.8V76c0-12.2-7.3-19.8-18.3-19.8-11.2 0-18.7 8-18.7 21.6zM354.1 13.6c0 3.6-1.3 6.8-3.9 9.3-5 4.9-13.6 4.9-18.6 0-8.4-7.5-1.6-23.1 9.3-22.5 7.4 0 13.2 5.9 13.2 13.2zm-2.2 102.9H330V39h21.9v77.5zM425.1 47.1V39h20.5v80.4c0 11.2-3.6 20.1-10.6 26.8-7 6.7-16.6 10-28.5 10-23.4 0-36.9-11.4-39.9-29.8l21.7-.8c1 7.6 7.6 12 17.4 12 11.2 0 18.1-5.8 18.1-16.6v-11.1c-5.1 5.5-12.5 8.2-21.9 8.2-10.9 0-19.9-3.8-27-11.1-6.9-7.3-10.3-17.1-10.3-29.2s3.5-21.9 10.3-29.2c7-7.3 16-11.1 27-11.1 10.7 0 18.4 3.1 23.2 9.6zm-38.3 30.7c0 13.6 7.5 21.6 18.7 21.6 11 0 18.3-7.6 18.3-19.8V76c0-12.2-7.3-19.8-18.3-19.8-11.2 0-18.7 8-18.7 21.6zM488.8 154.8H465l19.8-45.1L454.5 39h24.1l17.8 46.2L514.2 39h24.3l-49.7 115.8z")
//- Machine learning & NLP libraries

View File

@ -1,5 +1,10 @@
//- 💫 DOCS > API > ANNOTATION > NAMED ENTITIES
p
| Models trained on the
| #[+a("https://catalog.ldc.upenn.edu/ldc2013t19") OntoNotes 5] corpus
| support the following entity types:
+table(["Type", "Description"])
+row
+cell #[code PERSON]
@ -45,9 +50,6 @@
+cell #[code LANGUAGE]
+cell Any named language.
p The following values are also annotated in a style similar to names:
+table([ "Type", "Description" ])
+row
+cell #[code DATE]
+cell Absolute or relative dates or periods.
@ -75,3 +77,33 @@ p The following values are also annotated in a style similar to names:
+row
+cell #[code CARDINAL]
+cell Numerals that do not fall under another type.
+h(4, "ner-wikipedia-scheme") Wikipedia scheme
p
| Models trained on the Wikipedia corpus
| (#[+a("http://www.sciencedirect.com/science/article/pii/S0004370212000276") Nothman et al., 2013])
| use a less fine-grained NER annotation scheme and recognise the
| following entities:
+table(["Type", "Description"])
+row
+cell #[code PER]
+cell Named person or family.
+row
+cell #[code LOC]
+cell
| Name of politically or geographically defined location (cities,
| provinces, countries, international regions, bodies of water,
| mountains).
+row
+cell #[code ORG]
+cell Named corporate, governmental, or other organizational entity.
+row
+cell #[code MISC]
+cell
| Miscellaneous entities, e.g. events, nationalities, products or
| works of art.

View File

@ -1,5 +1,7 @@
//- 💫 DOCS > API > ANNOTATION > TRAINING
+h(3, "json-input") JSON input format for training
p
| spaCy takes training data in JSON format. The built-in
| #[+api("cli#convert") #[code convert]] command helps you convert the
@ -46,3 +48,57 @@ p
| Treebank:
+github("spacy", "examples/training/training-data.json", false, false, "json")
+h(3, "vocab-jsonl") Lexical data for vocabulary
+tag-new(2)
p
| To populate a model's vocabulary, you can use the
| #[+api("cli#vocab") #[code spacy vocab]] command and load in a
| #[+a("https://jsonlines.readthedocs.io/en/latest/") newline-delimited JSON]
| (JSONL) file containing one lexical entry per line. The first line
| defines the language and vocabulary settings. All other lines are
| expected to be JSON objects describing an individual lexeme. The lexical
| attributes will then be set as attributes on spaCy's
| #[+api("lexeme#attributes") #[code Lexeme]] object. The #[code vocab]
| command outputs a ready-to-use spaCy model with a #[code Vocab]
| containing the lexical data.
+code("First line").
{"lang": "en", "settings": {"oov_prob": -20.502029418945312}}
+code("Entry structure").
{
"orth": string,
"id": int,
"lower": string,
"norm": string,
"shape": string,
"prefix": string,
"suffix": string,
"length": int,
"cluster": string,
"prob": float,
"is_alpha": bool,
"is_ascii": bool,
"is_digit": bool,
"is_lower": bool,
"is_punct": bool,
"is_space": bool,
"is_title": bool,
"is_upper": bool,
"like_url": bool,
"like_num": bool,
"like_email": bool,
"is_stop": bool,
"is_oov": bool,
"is_quote": bool,
"is_left_punct": bool,
"is_right_punct": bool
}
p
| Here's an example of the 20 most frequent lexemes in the English
| training data:
+github("spacy", "examples/training/vocab-data.jsonl", false, false, "json")

View File

@ -3,8 +3,10 @@
"Overview": {
"Architecture": "./",
"Annotation Specs": "annotation",
"Command Line": "cli",
"Functions": "top-level"
},
"Containers": {
"Doc": "doc",
"Token": "token",
@ -45,14 +47,19 @@
}
},
"cli": {
"title": "Command Line Interface",
"teaser": "Download, train and package models, and debug spaCy.",
"source": "spacy/cli"
},
"top-level": {
"title": "Top-level Functions",
"menu": {
"spacy": "spacy",
"displacy": "displacy",
"Utility Functions": "util",
"Compatibility": "compat",
"Command Line": "cli"
"Compatibility": "compat"
}
},
@ -213,7 +220,7 @@
"Lemmatization": "lemmatization",
"Dependencies": "dependency-parsing",
"Named Entities": "named-entities",
"Training Data": "training"
"Models & Training": "training"
}
}
}

View File

@ -85,7 +85,9 @@ p
+row
+cell #[code name]
+cell unicode
+cell ISO code of the language class to load.
+cell
| #[+a("https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes") ISO code]
| of the language class to load.
+row
+cell #[code disable]

View File

@ -99,6 +99,6 @@ p This document describes the target annotations spaCy is trained to predict.
include _annotation/_biluo
+section("training")
+h(2, "json-input") JSON input format for training
+h(2, "training") Models and training data
include _annotation/_training

View File

@ -1,4 +1,6 @@
//- 💫 DOCS > API > TOP-LEVEL > COMMAND LINE INTERFACE
//- 💫 DOCS > API > COMMAND LINE INTERFACE
include ../_includes/_mixins
p
| As of v1.7.0, spaCy comes with new command line helpers to download and
@ -34,6 +36,13 @@ p
+cell flag
+cell Show help message and available arguments.
+row("foot")
+cell creates
+cell directory, symlink
+cell
| The installed model package in your #[code site-packages]
| directory and a shortcut link as a symlink in #[code spacy/data].
+aside("Downloading best practices")
| The #[code download] command is mostly intended as a convenient,
| interactive wrapper – it performs compatibility checks and prints
@ -86,6 +95,13 @@ p
+cell flag
+cell Show help message and available arguments.
+row("foot")
+cell creates
+cell symlink
+cell
| A shortcut link of the given name as a symlink in
| #[code spacy/data].
+h(3, "info") Info
p
@ -113,6 +129,11 @@ p
+cell flag
+cell Show help message and available arguments.
+row("foot")
+cell prints
+cell #[code stdout]
+cell Information about your spaCy installation.
+h(3, "validate") Validate
+tag-new(2)
@ -129,6 +150,12 @@ p
+code(false, "bash", "$").
spacy validate
+table(["Argument", "Type", "Description"])
+row("foot")
+cell prints
+cell #[code stdout]
+cell Details about the compatibility of your installed models.
+h(3, "convert") Convert
p
@ -172,6 +199,11 @@ p
+cell flag
+cell Show help message and available arguments.
+row("foot")
+cell creates
+cell JSON
+cell Data in spaCy's #[+a("/api/annotation#json-input") JSON format].
p The following converters are available:
+table(["ID", "Description"])
@ -286,6 +318,11 @@ p
+cell flag
+cell Show help message and available arguments.
+row("foot")
+cell creates
+cell model, pickle
+cell A spaCy model on each epoch, and a final #[code .pickle] file.
+h(4, "train-hyperparams") Environment variables for hyperparameters
+tag-new(2)
@ -395,6 +432,50 @@ p
+cell Gradient L2 norm constraint.
+cell #[code 1.0]
+h(3, "vocab") Vocab
+tag-new(2)
p
| Compile a vocabulary from a
| #[+a("/api/annotation#vocab-jsonl") lexicon JSONL] file and optional
| word vectors. Will save out a valid spaCy model that you can load via
| #[+api("spacy#load") #[code spacy.load]] or package using the
| #[+api("cli#package") #[code package]] command.
+code(false, "bash", "$").
spacy vocab [lang] [output_dir] [lexemes_loc] [vectors_loc]
+table(["Argument", "Type", "Description"])
+row
+cell #[code lang]
+cell positional
+cell
| Model language
| #[+a("https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes") ISO code],
| e.g. #[code en].
+row
+cell #[code output_dir]
+cell positional
+cell Model output directory. Will be created if it doesn't exist.
+row
+cell #[code lexemes_loc]
+cell positional
+cell
| Location of lexical data in spaCy's
| #[+a("/api/annotation#vocab-jsonl") JSONL format].
+row
+cell #[code vectors_loc]
+cell positional
+cell Optional location of vectors data as numpy #[code .npz] file.
+row("foot")
+cell creates
+cell model
+cell A spaCy model containing the vocab and vectors.
+h(3, "evaluate") Evaluate
+tag-new(2)
@ -447,22 +528,36 @@ p
+cell flag
+cell Use gold preprocessing.
+row("foot")
+cell prints / creates
+cell #[code stdout], HTML
+cell Training results and optional displaCy visualizations.
+h(3, "package") Package
p
| Generate a #[+a("/usage/training#models-generating") model Python package]
| from an existing model data directory. All data files are copied over.
| If the path to a meta.json is supplied, or a meta.json is found in the
| input directory, this file is used. Otherwise, the data can be entered
| directly from the command line. The required file templates are downloaded
| from #[+src(gh("spacy-dev-resources", "templates/model")) GitHub] to make
| If the path to a #[code meta.json] is supplied, or a #[code meta.json] is
| found in the input directory, this file is used. Otherwise, the data can
| be entered directly from the command line. The required file templates
| are downloaded from
| #[+src(gh("spacy-dev-resources", "templates/model")) GitHub] to make
| sure you're always using the latest versions. This means you need to be
| connected to the internet to use this command.
| connected to the internet to use this command. After packaging, you
| can run #[code python setup.py sdist] from the newly created directory
| to turn your model into an installable archive file.
+code(false, "bash", "$", false, false, true).
spacy package [input_dir] [output_dir] [--meta-path] [--create-meta] [--force]
+aside-code("Example", "bash").
spacy package /input /output
cd /output/en_model-0.0.0
python setup.py sdist
pip install dist/en_model-0.0.0.tar.gz
+table(["Argument", "Type", "Description"])
+row
+cell #[code input_dir]
@ -477,15 +572,16 @@ p
+row
+cell #[code --meta-path], #[code -m]
+cell option
+cell #[+tag-new(2)] Path to meta.json file (optional).
+cell #[+tag-new(2)] Path to #[code meta.json] file (optional).
+row
+cell #[code --create-meta], #[code -c]
+cell flag
+cell
| #[+tag-new(2)] Create a meta.json file on the command line, even
| if one already exists in the directory.
| #[+tag-new(2)] Create a #[code meta.json] file on the command
| line, even if one already exists in the directory. If an
| existing file is found, its entries will be shown as the defaults
| in the command line prompt.
+row
+cell #[code --force], #[code -f]
+cell flag
@ -495,3 +591,8 @@ p
+cell #[code --help], #[code -h]
+cell flag
+cell Show help message and available arguments.
+row("foot")
+cell creates
+cell directory
+cell A Python package containing the spaCy model.

View File

@ -18,7 +18,3 @@ include ../_includes/_mixins
+section("compat")
+h(2, "compat", "spacy/compat.py") Compatibility functions
include _top-level/_compat
+section("cli", "spacy/cli")
+h(2, "cli") Command line
include _top-level/_cli

View File

@ -162,7 +162,7 @@ p
+cell int
+cell The integer ID by which the flag value can be checked.
+h(2, "add_flag") Vocab.clear_vectors
+h(2, "clear_vectors") Vocab.clear_vectors
+tag method
+tag-new(2)
@ -181,7 +181,50 @@ p
| Number of dimensions of the new vectors. If #[code None], size
| is not changed.
+h(2, "add_flag") Vocab.get_vector
+h(2, "prune_vectors") Vocab.prune_vectors
+tag method
+tag-new(2)
p
| Reduce the current vector table to #[code nr_row] unique entries. Words
| mapped to the discarded vectors will be remapped to the closest vector
| among those remaining. For example, suppose the original table had
| vectors for the words:
| #[code.u-break ['sat', 'cat', 'feline', 'reclined']]. If we prune the
| vector table to two rows, we would discard the vectors for "feline"
| and "reclined". These words would then be remapped to the closest
| remaining vector so "feline" would have the same vector as "cat",
| and "reclined" would have the same vector as "sat". The similarities are
| judged by cosine. The original vectors may be large, so the cosines are
| calculated in minibatches, to reduce memory usage.
+aside-code("Example").
nlp.vocab.prune_vectors(10000)
assert len(nlp.vocab.vectors) &lt;= 10000
+table(["Name", "Type", "Description"])
+row
+cell #[code nr_row]
+cell int
+cell The number of rows to keep in the vector table.
+row
+cell #[code batch_size]
+cell int
+cell
| Batch of vectors for calculating the similarities. Larger batch
| sizes might be faster, while temporarily requiring more memory.
+row("foot")
+cell returns
+cell dict
+cell
| A dictionary keyed by removed words mapped to
| #[code (string, score)] tuples, where #[code string] is the entry
| the removed word was mapped to, and #[code score] the similarity
| score between the two words.
+h(2, "get_vector") Vocab.get_vector
+tag method
+tag-new(2)
@ -206,7 +249,7 @@ p
| A word vector. Size and shape are determined by the
| #[code Vocab.vectors] instance.
+h(2, "add_flag") Vocab.set_vector
+h(2, "set_vector") Vocab.set_vector
+tag method
+tag-new(2)
@ -228,7 +271,7 @@ p
+cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
+cell The vector to set.
+h(2, "add_flag") Vocab.has_vector
+h(2, "has_vector") Vocab.has_vector
+tag method
+tag-new(2)

View File

@ -48,6 +48,9 @@
flex: 0 0 100%
flex-flow: column wrap
&.o-grid__col--no-gutter
margin-top: 0
// Fix overflow issue in old browsers
& > *

View File

@ -8,7 +8,7 @@
align-items: center
display: flex
justify-content: space-between
flex-flow: row wrap
flex-flow: row nowrap
padding: 0 2rem 0 1rem
z-index: 30
width: 100%

View File

@ -51,6 +51,7 @@
@include scroll-shadow-base($color-front)
display: inline-block
overflow-x: auto
overflow-y: hidden
width: auto
-webkit-overflow-scrolling: touch

View File

@ -3,7 +3,7 @@
+h(2, "changelog") Changelog
+button(gh("spacy") + "/releases", false, "secondary", "small").u-float-right.u-nowrap View releases
div(data-tpl="changelog" data-tpl-key="error")
div(data-tpl="changelog" data-tpl-key="error" style="display: none")
+infobox
| Unable to load changelog from GitHub. Please see the
| #[+a(gh("spacy") + "/releases") releases page] instead.

View File

@ -76,6 +76,16 @@ p
("Google rebrands its business apps", [(0, 6, "ORG")]),
("look what i found on google! 😂", [(21, 27, "PRODUCT")])]
+infobox("Tip: Try the Prodigy annotation tool")
+infobox-logos(["prodigy", 100, 29, "https://prodi.gy"])
| If you need to label a lot of data, check out
| #[+a("https://prodi.gy", true) Prodigy], a new, active learning-powered
| annotation tool we've developed. Prodigy is fast and extensible, and
| comes with a modern #[strong web application] that helps you collect
| training data faster. It integrates seamlessly with spaCy, pre-selects
| the #[strong most relevant examples] for annotation, and lets you
| train and evaluate ready-to-use spaCy models.
+h(3, "annotations") Training with annotations
p
@ -180,9 +190,10 @@ p
+cell #[code optimizer]
+cell Callable to update the model's weights.
+infobox
| For the #[strong full example and more details], see the usage guide on
| #[+a("/usage/training#ner") training the named entity recognizer],
| or the runnable
| #[+src(gh("spaCy", "examples/training/train_ner.py")) training script]
| on GitHub.
p
| Instead of writing your own training loop, you can also use the
| built-in #[+api("cli#train") #[code train]] command, which expects data
| in spaCy's #[+a("/api/annotation#json-input") JSON format]. On each epoch,
| a model will be saved out to the directory. After training, you can
| use the #[+api("cli#package") #[code package]] command to generate an
| installable Python package from your model.

View File

@ -190,7 +190,3 @@ p
+item
| #[strong Test] the model to make sure the parser works as expected.
+h(3, "training-json") JSON format for training
include ../../api/_annotation/_training