WIP on vectors fixes

This commit is contained in:
Matthew Honnibal 2017-10-31 11:22:56 +01:00
commit 9c11ee4a1c
34 changed files with 682 additions and 343 deletions

View File

@ -0,0 +1,21 @@
{"lang": "en", "settings": {"oov_prob": -20.502029418945312}}
{"orth": ".", "id": 1, "lower": ".", "norm": ".", "shape": ".", "prefix": ".", "suffix": ".", "length": 1, "cluster": "8", "prob": -3.0678977966308594, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": true, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": ",", "id": 2, "lower": ",", "norm": ",", "shape": ",", "prefix": ",", "suffix": ",", "length": 1, "cluster": "4", "prob": -3.4549596309661865, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": true, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "the", "id": 3, "lower": "the", "norm": "the", "shape": "xxx", "prefix": "t", "suffix": "the", "length": 3, "cluster": "11", "prob": -3.528766632080078, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "I", "id": 4, "lower": "i", "norm": "I", "shape": "X", "prefix": "I", "suffix": "I", "length": 1, "cluster": "346", "prob": -3.791565179824829, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": true, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "to", "id": 5, "lower": "to", "norm": "to", "shape": "xx", "prefix": "t", "suffix": "to", "length": 2, "cluster": "12", "prob": -3.8560216426849365, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "a", "id": 6, "lower": "a", "norm": "a", "shape": "x", "prefix": "a", "suffix": "a", "length": 1, "cluster": "19", "prob": -3.92978835105896, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "and", "id": 7, "lower": "and", "norm": "and", "shape": "xxx", "prefix": "a", "suffix": "and", "length": 3, "cluster": "20", "prob": -4.113108158111572, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "of", "id": 8, "lower": "of", "norm": "of", "shape": "xx", "prefix": "o", "suffix": "of", "length": 2, "cluster": "28", "prob": -4.27587366104126, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "you", "id": 9, "lower": "you", "norm": "you", "shape": "xxx", "prefix": "y", "suffix": "you", "length": 3, "cluster": "602", "prob": -4.373791217803955, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "it", "id": 10, "lower": "it", "norm": "it", "shape": "xx", "prefix": "i", "suffix": "it", "length": 2, "cluster": "474", "prob": -4.388050079345703, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "is", "id": 11, "lower": "is", "norm": "is", "shape": "xx", "prefix": "i", "suffix": "is", "length": 2, "cluster": "762", "prob": -4.457748889923096, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "that", "id": 12, "lower": "that", "norm": "that", "shape": "xxxx", "prefix": "t", "suffix": "hat", "length": 4, "cluster": "84", "prob": -4.464504718780518, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "\n\n", "id": 0, "lower": "\n\n", "norm": "\n\n", "shape": "\n\n", "prefix": "\n", "suffix": "\n\n", "length": 2, "cluster": "0", "prob": -4.606560707092285, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": true, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "in", "id": 13, "lower": "in", "norm": "in", "shape": "xx", "prefix": "i", "suffix": "in", "length": 2, "cluster": "60", "prob": -4.619071960449219, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "'s", "id": 14, "lower": "'s", "norm": "'s", "shape": "'x", "prefix": "'", "suffix": "'s", "length": 2, "cluster": "52", "prob": -4.830559253692627, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "n't", "id": 15, "lower": "n't", "norm": "n't", "shape": "x'x", "prefix": "n", "suffix": "n't", "length": 3, "cluster": "74", "prob": -4.859938621520996, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "for", "id": 16, "lower": "for", "norm": "for", "shape": "xxx", "prefix": "f", "suffix": "for", "length": 3, "cluster": "508", "prob": -4.8801093101501465, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "\"", "id": 17, "lower": "\"", "norm": "\"", "shape": "\"", "prefix": "\"", "suffix": "\"", "length": 1, "cluster": "0", "prob": -5.02677583694458, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": true, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": true, "is_left_punct": true, "is_right_punct": true}
{"orth": "?", "id": 18, "lower": "?", "norm": "?", "shape": "?", "prefix": "?", "suffix": "?", "length": 1, "cluster": "0", "prob": -5.05924654006958, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": true, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": " ", "id": 0, "lower": " ", "norm": " ", "shape": " ", "prefix": " ", "suffix": " ", "length": 1, "cluster": "0", "prob": -5.129165172576904, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": true, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}

View File

@ -7,7 +7,7 @@ if __name__ == '__main__':
import plac import plac
import sys import sys
from spacy.cli import download, link, info, package, train, convert, model from spacy.cli import download, link, info, package, train, convert, model
from spacy.cli import profile, evaluate, validate from spacy.cli import vocab, profile, evaluate, validate
from spacy.util import prints from spacy.util import prints
commands = { commands = {
@ -19,6 +19,7 @@ if __name__ == '__main__':
'convert': convert, 'convert': convert,
'package': package, 'package': package,
'model': model, 'model': model,
'vocab': vocab,
'profile': profile, 'profile': profile,
'validate': validate 'validate': validate
} }

View File

@ -7,4 +7,5 @@ from .train import train
from .evaluate import evaluate from .evaluate import evaluate
from .convert import convert from .convert import convert
from .model import model from .model import model
from .vocab import make_vocab as vocab
from .validate import validate from .validate import validate

View File

@ -17,14 +17,14 @@ numpy.random.seed(0)
@plac.annotations( @plac.annotations(
model=("Model name or path", "positional", None, str), model=("model name or path", "positional", None, str),
data_path=("Location of JSON-formatted evaluation data", "positional", data_path=("location of JSON-formatted evaluation data", "positional",
None, str), None, str),
gold_preproc=("Use gold preprocessing", "flag", "G", bool), gold_preproc=("use gold preprocessing", "flag", "G", bool),
gpu_id=("Use GPU", "option", "g", int), gpu_id=("use GPU", "option", "g", int),
displacy_path=("Directory to output rendered parses as HTML", "option", displacy_path=("directory to output rendered parses as HTML", "option",
"dp", str), "dp", str),
displacy_limit=("Limit of parses to render as HTML", "option", "dl", int)) displacy_limit=("limit of parses to render as HTML", "option", "dl", int))
def evaluate(cmd, model, data_path, gpu_id=-1, gold_preproc=False, def evaluate(cmd, model, data_path, gpu_id=-1, gold_preproc=False,
displacy_path=None, displacy_limit=25): displacy_path=None, displacy_limit=25):
""" """

View File

@ -16,10 +16,11 @@ from .. import about
input_dir=("directory with model data", "positional", None, str), input_dir=("directory with model data", "positional", None, str),
output_dir=("output parent directory", "positional", None, str), output_dir=("output parent directory", "positional", None, str),
meta_path=("path to meta.json", "option", "m", str), meta_path=("path to meta.json", "option", "m", str),
create_meta=("create meta.json, even if one exists in directory", "flag", create_meta=("create meta.json, even if one exists in directory if "
"c", bool), "existing meta is found, entries are shown as defaults in "
force=("force overwriting of existing folder in output directory", "flag", "the command line prompt", "flag", "c", bool),
"f", bool)) force=("force overwriting of existing model directory in output directory",
"flag", "f", bool))
def package(cmd, input_dir, output_dir, meta_path=None, create_meta=False, def package(cmd, input_dir, output_dir, meta_path=None, create_meta=False,
force=False): force=False):
""" """
@ -41,13 +42,13 @@ def package(cmd, input_dir, output_dir, meta_path=None, create_meta=False,
template_manifest = get_template('MANIFEST.in') template_manifest = get_template('MANIFEST.in')
template_init = get_template('xx_model_name/__init__.py') template_init = get_template('xx_model_name/__init__.py')
meta_path = meta_path or input_path / 'meta.json' meta_path = meta_path or input_path / 'meta.json'
if not create_meta and meta_path.is_file(): if meta_path.is_file():
prints(meta_path, title="Reading meta.json from file")
meta = util.read_json(meta_path) meta = util.read_json(meta_path)
else: if not create_meta: # only print this if user doesn't want to overwrite
meta = generate_meta(input_dir) prints(meta_path, title="Loaded meta.json from file")
else:
meta = generate_meta(input_dir, meta)
meta = validate_meta(meta, ['lang', 'name', 'version']) meta = validate_meta(meta, ['lang', 'name', 'version'])
model_name = meta['lang'] + '_' + meta['name'] model_name = meta['lang'] + '_' + meta['name']
model_name_v = model_name + '-' + meta['version'] model_name_v = model_name + '-' + meta['version']
main_path = output_path / model_name_v main_path = output_path / model_name_v
@ -82,18 +83,19 @@ def create_file(file_path, contents):
file_path.open('w', encoding='utf-8').write(contents) file_path.open('w', encoding='utf-8').write(contents)
def generate_meta(model_path): def generate_meta(model_path, existing_meta):
meta = {} meta = existing_meta or {}
settings = [('lang', 'Model language', 'en'), settings = [('lang', 'Model language', meta.get('lang', 'en')),
('name', 'Model name', 'model'), ('name', 'Model name', meta.get('name', 'model')),
('version', 'Model version', '0.0.0'), ('version', 'Model version', meta.get('version', '0.0.0')),
('spacy_version', 'Required spaCy version', ('spacy_version', 'Required spaCy version',
'>=%s,<3.0.0' % about.__version__), '>=%s,<3.0.0' % about.__version__),
('description', 'Model description', False), ('description', 'Model description',
('author', 'Author', False), meta.get('description', False)),
('email', 'Author email', False), ('author', 'Author', meta.get('author', False)),
('url', 'Author website', False), ('email', 'Author email', meta.get('email', False)),
('license', 'License', 'CC BY-NC 3.0')] ('url', 'Author website', meta.get('url', False)),
('license', 'License', meta.get('license', 'CC BY-SA 3.0'))]
nlp = util.load_model_from_path(Path(model_path)) nlp = util.load_model_from_path(Path(model_path))
meta['pipeline'] = nlp.pipe_names meta['pipeline'] = nlp.pipe_names
meta['vectors'] = {'width': nlp.vocab.vectors_length, meta['vectors'] = {'width': nlp.vocab.vectors_length,

View File

@ -32,6 +32,7 @@ numpy.random.seed(0)
n_sents=("number of sentences", "option", "ns", int), n_sents=("number of sentences", "option", "ns", int),
use_gpu=("Use GPU", "option", "g", int), use_gpu=("Use GPU", "option", "g", int),
vectors=("Model to load vectors from", "option", "v"), vectors=("Model to load vectors from", "option", "v"),
vectors_limit=("Truncate to N vectors (requires -v)", "option", None, int),
no_tagger=("Don't train tagger", "flag", "T", bool), no_tagger=("Don't train tagger", "flag", "T", bool),
no_parser=("Don't train parser", "flag", "P", bool), no_parser=("Don't train parser", "flag", "P", bool),
no_entities=("Don't train NER", "flag", "N", bool), no_entities=("Don't train NER", "flag", "N", bool),
@ -40,9 +41,9 @@ numpy.random.seed(0)
meta_path=("Optional path to meta.json. All relevant properties will be " meta_path=("Optional path to meta.json. All relevant properties will be "
"overwritten.", "option", "m", Path)) "overwritten.", "option", "m", Path))
def train(cmd, lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0, def train(cmd, lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
use_gpu=-1, vectors=None, no_tagger=False, no_parser=False, use_gpu=-1, vectors=None, vectors_limit=None, no_tagger=False,
no_entities=False, gold_preproc=False, version="0.0.0", no_parser=False, no_entities=False, gold_preproc=False,
meta_path=None): version="0.0.0", meta_path=None):
""" """
Train a model. Expects data in spaCy's JSON format. Train a model. Expects data in spaCy's JSON format.
""" """
@ -95,10 +96,7 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
if vectors: if vectors:
util.load_model(vectors, vocab=nlp.vocab) util.load_model(vectors, vocab=nlp.vocab)
if vectors_limit is not None: if vectors_limit is not None:
remap = nlp.vocab.prune_vectors(vectors_limit) nlp.vocab.prune_vectors(vectors_limit)
print('remap', len(remap))
for key, (value, sim) in remap.items():
print(repr(key), repr(value), sim)
for name in pipeline: for name in pipeline:
nlp.add_pipe(nlp.create_pipe(name), name=name) nlp.add_pipe(nlp.create_pipe(name), name=name)
optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu) optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu)

54
spacy/cli/vocab.py Normal file
View File

@ -0,0 +1,54 @@
# coding: utf8
from __future__ import unicode_literals
import plac
import json
import spacy
import numpy
from pathlib import Path
from ..util import prints, ensure_path
@plac.annotations(
lang=("model language", "positional", None, str),
output_dir=("model output directory", "positional", None, Path),
lexemes_loc=("location of JSONL-formatted lexical data", "positional",
None, Path),
vectors_loc=("optional: location of vectors data, as numpy .npz",
"positional", None, str))
def make_vocab(cmd, lang, output_dir, lexemes_loc, vectors_loc=None):
"""Compile a vocabulary from a lexicon jsonl file and word vectors."""
if not lexemes_loc.exists():
prints(lexemes_loc, title="Can't find lexical data", exits=1)
vectors_loc = ensure_path(vectors_loc)
nlp = spacy.blank(lang)
for word in nlp.vocab:
word.rank = 0
lex_added = 0
vec_added = 0
with lexemes_loc.open() as file_:
for line in file_:
if line.strip():
attrs = json.loads(line)
if 'settings' in attrs:
nlp.vocab.cfg.update(attrs['settings'])
else:
lex = nlp.vocab[attrs['orth']]
lex.set_attrs(**attrs)
assert lex.rank == attrs['id']
lex_added += 1
if vectors_loc is not None:
vector_data = numpy.load(open(vectors_loc, 'rb'))
nlp.vocab.clear_vectors(width=vector_data.shape[1])
for word in nlp.vocab:
if word.rank:
nlp.vocab.vectors.add(word.orth_, row=word.rank,
vector=vector_data[word.rank])
vec_added += 1
if not output_dir.exists():
output_dir.mkdir()
nlp.to_disk(output_dir)
prints("{} entries, {} vectors".format(lex_added, vec_added), output_dir,
title="Sucessfully compiled vocab and vectors, and saved model")
return nlp

View File

@ -300,5 +300,15 @@ GLOSSARY = {
'MONEY': 'Monetary values, including unit', 'MONEY': 'Monetary values, including unit',
'QUANTITY': 'Measurements, as of weight or distance', 'QUANTITY': 'Measurements, as of weight or distance',
'ORDINAL': '"first", "second", etc.', 'ORDINAL': '"first", "second", etc.',
'CARDINAL': 'Numerals that do not fall under another type' 'CARDINAL': 'Numerals that do not fall under another type',
# Named Entity Recognition
# Wikipedia
# http://www.sciencedirect.com/science/article/pii/S0004370212000276
# https://pdfs.semanticscholar.org/5744/578cc243d92287f47448870bb426c66cc941.pdf
'PER': 'Named person or family.',
'MISC': ('Miscellaneous entities, e.g. events, nationalities, '
'products or works of art'),
} }

View File

@ -154,6 +154,8 @@ class Language(object):
self._meta.setdefault('email', '') self._meta.setdefault('email', '')
self._meta.setdefault('url', '') self._meta.setdefault('url', '')
self._meta.setdefault('license', '') self._meta.setdefault('license', '')
self._meta['vectors'] = {'width': self.vocab.vectors_length,
'entries': len(self.vocab.vectors)}
self._meta['pipeline'] = self.pipe_names self._meta['pipeline'] = self.pipe_names
return self._meta return self._meta

View File

@ -13,6 +13,8 @@ from .typedefs cimport attr_t, flags_t
from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
from .attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT, IS_OOV from .attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT, IS_OOV
from .attrs cimport PROB
from .attrs import intify_attrs
from . import about from . import about
@ -68,6 +70,19 @@ cdef class Lexeme:
def __hash__(self): def __hash__(self):
return self.c.orth return self.c.orth
def set_attrs(self, **attrs):
cdef attr_id_t attr
attrs = intify_attrs(attrs)
for attr, value in attrs.items():
if attr == PROB:
self.c.prob = value
elif attr == CLUSTER:
self.c.cluster = int(value)
elif isinstance(value, int) or isinstance(value, long):
Lexeme.set_struct_attr(self.c, attr, value)
else:
Lexeme.set_struct_attr(self.c, attr, self.vocab.strings.add(value))
def set_flag(self, attr_id_t flag_id, bint value): def set_flag(self, attr_id_t flag_id, bint value):
"""Change the value of a boolean flag. """Change the value of a boolean flag.

View File

@ -209,7 +209,7 @@ def test_doc_api_right_edge(en_tokenizer):
def test_doc_api_has_vector(): def test_doc_api_has_vector():
vocab = Vocab() vocab = Vocab()
vocab.clear_vectors(2) vocab.clear_vectors(2)
vocab.vectors.add('kitten', numpy.asarray([0., 2.], dtype='f')) vocab.vectors.add('kitten', vector=numpy.asarray([0., 2.], dtype='f'))
doc = Doc(vocab, words=['kitten']) doc = Doc(vocab, words=['kitten'])
assert doc.has_vector assert doc.has_vector

View File

@ -73,8 +73,8 @@ def test_doc_token_api_is_properties(en_vocab):
def test_doc_token_api_vectors(): def test_doc_token_api_vectors():
vocab = Vocab() vocab = Vocab()
vocab.clear_vectors(2) vocab.clear_vectors(2)
vocab.vectors.add('apples', numpy.asarray([0., 2.], dtype='f')) vocab.vectors.add('apples', vector=numpy.asarray([0., 2.], dtype='f'))
vocab.vectors.add('oranges', numpy.asarray([0., 1.], dtype='f')) vocab.vectors.add('oranges', vector=numpy.asarray([0., 1.], dtype='f'))
doc = Doc(vocab, words=['apples', 'oranges', 'oov']) doc = Doc(vocab, words=['apples', 'oranges', 'oov'])
assert doc.has_vector assert doc.has_vector

View File

@ -21,8 +21,10 @@ cdef class Vectors:
Vectors data is kept in the vectors.data attribute, which should be an Vectors data is kept in the vectors.data attribute, which should be an
instance of numpy.ndarray (for CPU vectors) or cupy.ndarray instance of numpy.ndarray (for CPU vectors) or cupy.ndarray
(for GPU vectors). `vectors.key2row` is a dictionary mapping word hashes to (for GPU vectors). `vectors.key2row` is a dictionary mapping word hashes to
rows in the vectors.data table. The array `vectors.keys` keeps the keys in rows in the vectors.data table.
order, such that `keys[vectors.key2row[key]] == key`.
Multiple keys can be mapped to the same vector, so len(keys) may be greater
(but not smaller) than data.shape[0].
""" """
cdef public object data cdef public object data
cdef readonly StringStore strings cdef readonly StringStore strings
@ -101,7 +103,7 @@ cdef class Vectors:
RETURNS (int): The number of vectors in the data. RETURNS (int): The number of vectors in the data.
""" """
return self.i return self._i_vec
def __contains__(self, key): def __contains__(self, key):
"""Check whether a key has a vector entry in the table. """Check whether a key has a vector entry in the table.
@ -113,11 +115,13 @@ cdef class Vectors:
key = self.strings[key] key = self.strings[key]
return key in self.key2row return key in self.key2row
def add(self, key, vector=None): def add(self, key, *, vector=None, row=None):
"""Add a key to the table, optionally setting a vector value as well. """Add a key to the table. Keys can be mapped to an existing vector
by setting `row`, or a new vector can be added.
key (unicode / int): The key to add. key (unicode / int): The key to add.
vector (numpy.ndarray): An optional vector to add. vector (numpy.ndarray / None): A vector to add for the key.
row (int / None): The row-number of a vector to map the key to.
""" """
if isinstance(key, basestring_): if isinstance(key, basestring_):
key = self.strings.add(key) key = self.strings.add(key)
@ -131,8 +135,8 @@ cdef class Vectors:
self.key2row[key] = row self.key2row[key] = row
if vector is not None: if vector is not None:
self.data[i] = vector self.data[row] = vector
return i return row
def items(self): def items(self):
"""Iterate over `(string key, vector)` pairs, in order. """Iterate over `(string key, vector)` pairs, in order.

View File

@ -32,6 +32,7 @@ cdef class Vocab:
cdef readonly int length cdef readonly int length
cdef public object data_dir cdef public object data_dir
cdef public object lex_attr_getters cdef public object lex_attr_getters
cdef public object cfg
cdef const LexemeC* get(self, Pool mem, unicode string) except NULL cdef const LexemeC* get(self, Pool mem, unicode string) except NULL
cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL

View File

@ -5,6 +5,7 @@ import numpy
import dill import dill
from collections import OrderedDict from collections import OrderedDict
from thinc.neural.util import get_array_module
from .lexeme cimport EMPTY_LEXEME from .lexeme cimport EMPTY_LEXEME
from .lexeme cimport Lexeme from .lexeme cimport Lexeme
from .strings cimport hash_string from .strings cimport hash_string
@ -27,7 +28,7 @@ cdef class Vocab:
C-data that is shared between `Doc` objects. C-data that is shared between `Doc` objects.
""" """
def __init__(self, lex_attr_getters=None, tag_map=None, lemmatizer=None, def __init__(self, lex_attr_getters=None, tag_map=None, lemmatizer=None,
strings=tuple(), **deprecated_kwargs): strings=tuple(), oov_prob=-20., **deprecated_kwargs):
"""Create the vocabulary. """Create the vocabulary.
lex_attr_getters (dict): A dictionary mapping attribute IDs to lex_attr_getters (dict): A dictionary mapping attribute IDs to
@ -43,6 +44,7 @@ cdef class Vocab:
tag_map = tag_map if tag_map is not None else {} tag_map = tag_map if tag_map is not None else {}
if lemmatizer in (None, True, False): if lemmatizer in (None, True, False):
lemmatizer = Lemmatizer({}, {}, {}) lemmatizer = Lemmatizer({}, {}, {})
self.cfg = {'oov_prob': oov_prob}
self.mem = Pool() self.mem = Pool()
self._by_hash = PreshMap() self._by_hash = PreshMap()
self._by_orth = PreshMap() self._by_orth = PreshMap()
@ -239,7 +241,7 @@ cdef class Vocab:
def vectors_length(self): def vectors_length(self):
return self.vectors.data.shape[1] return self.vectors.data.shape[1]
def clear_vectors(self, new_dim=None): def clear_vectors(self, width=None):
"""Drop the current vector table. Because all vectors must be the same """Drop the current vector table. Because all vectors must be the same
width, you have to call this to change the size of the vectors. width, you have to call this to change the size of the vectors.
""" """
@ -283,16 +285,14 @@ cdef class Vocab:
keep = xp.ascontiguousarray(keep.T) keep = xp.ascontiguousarray(keep.T)
neighbours = xp.zeros((toss.shape[0],), dtype='i') neighbours = xp.zeros((toss.shape[0],), dtype='i')
scores = xp.zeros((toss.shape[0],), dtype='f') scores = xp.zeros((toss.shape[0],), dtype='f')
for i in range(0, toss.shape[0]//2, batch_size): for i in range(0, toss.shape[0], batch_size):
batch = toss[i : i+batch_size] batch = toss[i : i+batch_size]
batch /= xp.linalg.norm(batch, axis=1, keepdims=True)+1e-8 batch /= xp.linalg.norm(batch, axis=1, keepdims=True)+1e-8
sims = xp.dot(batch, keep) sims = xp.dot(batch, keep)
matches = sims.argmax(axis=1) matches = sims.argmax(axis=1)
neighbours[i:i+batch_size] = matches neighbours[i:i+batch_size] = matches
scores[i:i+batch_size] = sims.max(axis=1) scores[i:i+batch_size] = sims.max(axis=1)
i2k = {i: key for key, i in self.vectors.key2row.items()} for lex in self:
remap = {}
for lex in list(self):
# If we're losing the vector for this word, map it to the nearest # If we're losing the vector for this word, map it to the nearest
# vector we're keeping. # vector we're keeping.
if lex.rank >= nr_row: if lex.rank >= nr_row:

View File

@ -41,9 +41,6 @@
- var comps = path.split('#'); - var comps = path.split('#');
- return "top-level#" + comps[0] + '.' + comps[1]; - return "top-level#" + comps[0] + '.' + comps[1];
- } - }
- else if (path.startsWith('cli#')) {
- return "top-level#" + path.split('#')[1];
- }
- return path; - return path;
- } - }

View File

@ -1,244 +0,0 @@
//- 💫 MIXINS > BASE
//- Section
id - [string] anchor assigned to section (used for breadcrumb navigation)
mixin section(id)
section.o-section(id="section-" + id data-section=id)
block
//- Aside wrapper
label - [string] aside label
mixin aside-wrapper(label)
aside.c-aside
.c-aside__content(role="complementary")&attributes(attributes)
if label
h4.u-text-label.u-text-label--dark=label
block
//- SVG from map (uses embedded SVG sprite)
name - [string] SVG symbol id
width - [integer] width in px
height - [integer] height in px (default: same as width)
mixin svg(name, width, height)
svg(aria-hidden="true" viewBox="0 0 #{width} #{height || width}" width=width height=(height || width))&attributes(attributes)
use(xlink:href="#svg_#{name}")
//- Icon
name - [string] icon name (will be used as symbol id: #svg_{name})
width - [integer] icon width (default: 20)
height - [integer] icon height (defaults to width)
mixin icon(name, width, height)
- var width = width || 20
- var height = height || width
+svg(name, width, height).o-icon(style="min-width: #{width}px")&attributes(attributes)
//- Pro/Con/Neutral icon
icon - [string] "pro", "con" or "neutral" (default: "neutral")
size - [integer] icon size (optional)
mixin procon(icon, label, show_label, size)
- var colors = { yes: "green", no: "red", neutral: "subtle" }
span.u-nowrap
+icon(icon, size || 20)(class="u-color-#{colors[icon] || 'subtle'}").o-icon--inline&attributes(attributes)
span.u-text-small(class=show_label ? null : "u-hidden")=(label || icon)
//- Headlines Helper Mixin
level - [integer] 1, 2, 3, 4, or 5
mixin headline(level)
if level == 1
h1.u-heading-1&attributes(attributes)
block
else if level == 2
h2.u-heading-2&attributes(attributes)
block
else if level == 3
h3.u-heading-3&attributes(attributes)
block
else if level == 4
h4.u-heading-4&attributes(attributes)
block
else if level == 5
h5.u-heading-5&attributes(attributes)
block
//- Permalink rendering
id - [string] permalink ID used for link anchor
mixin permalink(id)
if id
a.u-permalink(href="##{id}")
block
else
block
//- Quickstart widget
quickstart.js with manual markup, inspired by PyTorch's "Getting started"
groups - [object] option groups, uses global variable QUICKSTART
headline - [string] optional text to be rendered as widget headline
mixin quickstart(groups, headline, description, hide_results)
.c-quickstart.o-block-small#qs
.c-quickstart__content
if headline
+h(2)=headline
if description
p=description
for group in groups
.c-quickstart__group.u-text-small(data-qs-group=group.id)
if group.title
.c-quickstart__legend=group.title
if group.help
| #[+help(group.help)]
.c-quickstart__fields
for option in group.options
input.c-quickstart__input(class="c-quickstart__input--" + (group.input_style ? group.input_style : group.multiple ? "check" : "radio") type=group.multiple ? "checkbox" : "radio" name=group.id id="qs-#{option.id}" value=option.id checked=option.checked)
label.c-quickstart__label.u-text-tiny(for="qs-#{option.id}")!=option.title
if option.meta
| #[span.c-quickstart__label__meta (#{option.meta})]
if option.help
| #[+help(option.help)]
if hide_results
block
else
pre.c-code-block
code.c-code-block__content.c-quickstart__code(data-qs-results="")
block
//- Quickstart code item
data - [object] Rendering conditions (keyed by option group ID, value: option)
style - [string] modifier ID for line style
mixin qs(data, style)
- args = {}
for value, setting in data
- args['data-qs-' + setting] = value
span.c-quickstart__line(class="c-quickstart__line--#{style || 'bash'}")&attributes(args)
block
//- Terminal-style code window
label - [string] title displayed in top bar of terminal window
mixin terminal(label)
.x-terminal
.x-terminal__icons: span
.u-padding-small.u-text-label.u-text-center=label
+code.x-terminal__code
block
//- Chart.js
id - [string] chart ID, will be assigned as #chart_{id}
mixin chart(id, height)
figure.o-block&attributes(attributes)
canvas(id="chart_#{id}" width="800" height=(height || "400") style="max-width: 100%")
//- Gitter chat button and widget
button - [string] text shown on button
label - [string] title of chat window (default: same as button)
mixin gitter(button, label)
aside.js-gitter.c-chat.is-collapsed(data-title=(label || button))
button.js-gitter-button.c-chat__button.u-text-tag
+icon("chat", 16).o-icon--inline
!=button
//- Badge
image - [string] path to badge image
url - [string] badge link
mixin badge(image, url)
+a(url).u-padding-small.u-hide-link&attributes(attributes)
img.o-badge(src=image alt=url height="20")
//- spaCy logo
mixin logo()
+svg("spacy", 675, 215).o-logo&attributes(attributes)
//- Landing
mixin landing-header()
header.c-landing
.c-landing__wrapper
.c-landing__content
block
mixin landing-banner(headline, label)
.c-landing__banner.u-padding.o-block.u-color-light
+grid.c-landing__banner__content.o-no-block
+grid-col("third")
h3.u-heading.u-heading-1
if label
div
span.u-text-label.u-text-label--light=label
!=headline
+grid-col("two-thirds").c-landing__banner__text
block
mixin landing-logos(title, logos)
.o-content.u-text-center&attributes(attributes)
h3.u-heading.u-text-label.u-color-dark=title
each row, i in logos
- var is_last = i == logos.length - 1
+grid("center").o-inline-list.o-no-block(class=is_last ? "o-no-block" : null)
each details, name in row
+a(details[0]).u-padding-medium
+icon(name, details[1], details[2])
if is_last
block
//- Under construction (temporary)
Marks sections that still need to be completed for the v2.0 release.
mixin under-construction()
+infobox("Under construction", "🚧")
| This section is still being written and will be updated for the v2.0
| release. Is there anything that you think should definitely mentioned or
| explained here? Any examples you'd like to see? #[strong Let us know]
| on the #[+a(gh("spacy") + "/issues/1105") v2.0 alpha thread] on GitHub!
//- Alpha infobox (temporary)
Added in the templates to notify user that they're visiting the alpha site.
mixin alpha-info()
+infobox("You are viewing the spaCy v2.0.0 alpha docs", "⚠️")
strong This page is part of the alpha documentation for spaCy v2.0.
| It does not reflect the state of the latest stable release.
| Because v2.0 is still under development, the implementation
| may differ from the intended state described here. See the
| #[+a(gh("spaCy") + "/releases/tag/v2.0.0-alpha") release notes]
| for details on how to install and test the new version. To
| read the official docs for spaCy v1.x,
| #[+a("https://spacy.io/docs") go here].

View File

@ -1,7 +1,39 @@
//- 💫 INCLUDES > MIXINS //- 💫 INCLUDES > MIXINS
include _functions include _functions
include _mixins-base
//- Section
id - [string] anchor assigned to section (used for breadcrumb navigation)
mixin section(id)
section.o-section(id="section-" + id data-section=id)
block
//- Headlines Helper Mixin
level - [integer] 1, 2, 3, 4, or 5
mixin headline(level)
if level == 1
h1.u-heading-1&attributes(attributes)
block
else if level == 2
h2.u-heading-2&attributes(attributes)
block
else if level == 3
h3.u-heading-3&attributes(attributes)
block
else if level == 4
h4.u-heading-4&attributes(attributes)
block
else if level == 5
h5.u-heading-5&attributes(attributes)
block
//- Headlines //- Headlines
@ -18,6 +50,18 @@ mixin h(level, id, source)
span Source #[+icon("code", 14).o-icon--inline] span Source #[+icon("code", 14).o-icon--inline]
//- Permalink rendering
id - [string] permalink ID used for link anchor
mixin permalink(id)
if id
a.u-permalink(href="##{id}")
block
else
block
//- External links //- External links
url - [string] link href url - [string] link href
trusted - [boolean] if not set / false, rel="noopener nofollow" is added trusted - [boolean] if not set / false, rel="noopener nofollow" is added
@ -63,6 +107,18 @@ mixin help(tooltip, icon_size)
+icon("help_o", icon_size || 16).o-icon--inline +icon("help_o", icon_size || 16).o-icon--inline
//- Aside wrapper
label - [string] aside label
mixin aside-wrapper(label)
aside.c-aside
.c-aside__content(role="complementary")&attributes(attributes)
if label
h4.u-text-label.u-text-label--dark=label
block
//- Aside for text //- Aside for text
label - [string] aside title (optional) label - [string] aside title (optional)
@ -112,6 +168,37 @@ mixin infobox-logos(...logos)
| #[+icon(logo[0], logo[1], logo[2]).u-color-dark] | #[+icon(logo[0], logo[1], logo[2]).u-color-dark]
//- SVG from map (uses embedded SVG sprite)
name - [string] SVG symbol id
width - [integer] width in px
height - [integer] height in px (default: same as width)
mixin svg(name, width, height)
svg(aria-hidden="true" viewBox="0 0 #{width} #{height || width}" width=width height=(height || width))&attributes(attributes)
use(xlink:href="#svg_#{name}")
//- Icon
name - [string] icon name (will be used as symbol id: #svg_{name})
width - [integer] icon width (default: 20)
height - [integer] icon height (defaults to width)
mixin icon(name, width, height)
- var width = width || 20
- var height = height || width
+svg(name, width, height).o-icon(style="min-width: #{width}px")&attributes(attributes)
//- Pro/Con/Neutral icon
icon - [string] "pro", "con" or "neutral" (default: "neutral")
size - [integer] icon size (optional)
mixin procon(icon, label, show_label, size)
- var colors = { yes: "green", no: "red", neutral: "subtle" }
span.u-nowrap
+icon(icon, size || 20)(class="u-color-#{colors[icon] || 'subtle'}").o-icon--inline&attributes(attributes)
span.u-text-small(class=show_label ? null : "u-hidden")=(label || icon)
//- Link button //- Link button
url - [string] link href url - [string] link href
@ -238,6 +325,14 @@ mixin graphic(original)
+button(original, false, "secondary", "small") View large graphic +button(original, false, "secondary", "small") View large graphic
//- Chart.js
id - [string] chart ID, will be assigned as #chart_{id}
mixin chart(id, height)
figure.o-block&attributes(attributes)
canvas(id="chart_#{id}" width="800" height=(height || "400") style="max-width: 100%")
//- Labels //- Labels
mixin label() mixin label()
@ -353,8 +448,8 @@ mixin grid(...style)
width - [string] "quarter", "third", "half", "two-thirds", "three-quarters" width - [string] "quarter", "third", "half", "two-thirds", "three-quarters"
see $grid in assets/css/_variables.sass see $grid in assets/css/_variables.sass
mixin grid-col(width) mixin grid-col(...style)
.o-grid__col(class="o-grid__col--#{width}")&attributes(attributes) .o-grid__col(class=prefixArgs(style, "o-grid__col"))&attributes(attributes)
block block
@ -445,3 +540,137 @@ mixin annotation-row(annots, style)
else else
+cell=cell +cell=cell
block block
//- spaCy logo
mixin logo()
+svg("spacy", 675, 215).o-logo&attributes(attributes)
//- Gitter chat button and widget
button - [string] text shown on button
label - [string] title of chat window (default: same as button)
mixin gitter(button, label)
aside.js-gitter.c-chat.is-collapsed(data-title=(label || button))
button.js-gitter-button.c-chat__button.u-text-tag
+icon("chat", 16).o-icon--inline
!=button
//- Badge
image - [string] path to badge image
url - [string] badge link
mixin badge(image, url)
+a(url).u-padding-small.u-hide-link&attributes(attributes)
img.o-badge(src=image alt=url height="20")
//- Quickstart widget
quickstart.js with manual markup, inspired by PyTorch's "Getting started"
groups - [object] option groups, uses global variable QUICKSTART
headline - [string] optional text to be rendered as widget headline
mixin quickstart(groups, headline, description, hide_results)
.c-quickstart.o-block-small#qs
.c-quickstart__content
if headline
+h(2)=headline
if description
p=description
for group in groups
.c-quickstart__group.u-text-small(data-qs-group=group.id)
if group.title
.c-quickstart__legend=group.title
if group.help
| #[+help(group.help)]
.c-quickstart__fields
for option in group.options
input.c-quickstart__input(class="c-quickstart__input--" + (group.input_style ? group.input_style : group.multiple ? "check" : "radio") type=group.multiple ? "checkbox" : "radio" name=group.id id="qs-#{option.id}" value=option.id checked=option.checked)
label.c-quickstart__label.u-text-tiny(for="qs-#{option.id}")!=option.title
if option.meta
| #[span.c-quickstart__label__meta (#{option.meta})]
if option.help
| #[+help(option.help)]
if hide_results
block
else
pre.c-code-block
code.c-code-block__content.c-quickstart__code(data-qs-results="")
block
//- Quickstart code item
data - [object] Rendering conditions (keyed by option group ID, value: option)
style - [string] modifier ID for line style
mixin qs(data, style)
- args = {}
for value, setting in data
- args['data-qs-' + setting] = value
span.c-quickstart__line(class="c-quickstart__line--#{style || 'bash'}")&attributes(args)
block
//- Terminal-style code window
label - [string] title displayed in top bar of terminal window
mixin terminal(label)
.x-terminal
.x-terminal__icons: span
.u-padding-small.u-text-label.u-text-center=label
+code.x-terminal__code
block
//- Landing
mixin landing-header()
header.c-landing
.c-landing__wrapper
.c-landing__content
block
mixin landing-banner(headline, label)
.c-landing__banner.u-padding.o-block.u-color-light
+grid.c-landing__banner__content.o-no-block
+grid-col("third")
h3.u-heading.u-heading-1
if label
div
span.u-text-label.u-text-label--light=label
!=headline
+grid-col("two-thirds").c-landing__banner__text
block
mixin landing-logos(title, logos)
.o-content.u-text-center&attributes(attributes)
h3.u-heading.u-text-label.u-color-dark=title
each row, i in logos
- var is_last = i == logos.length - 1
+grid("center").o-inline-list.o-no-block(class=is_last ? "o-no-block" : null)
each details, name in row
+a(details[0]).u-padding-medium
+icon(name, details[1], details[2])
if is_last
block
//- Under construction (temporary)
Marks sections that still need to be completed for the v2.0 release.
mixin under-construction()
+infobox("Under construction", "🚧")
| This section is still being written and will be updated for the v2.0
| release. Is there anything that you think should definitely mentioned or
| explained here? Any examples you'd like to see? #[strong Let us know]
| on the #[+a(gh("spacy") + "/issues/1105") v2.0 alpha thread] on GitHub!

View File

@ -25,9 +25,6 @@ main.o-main.o-main--sidebar.o-main--aside
+button(gh("spacy", source), false, "secondary", "small").u-nowrap +button(gh("spacy", source), false, "secondary", "small").u-nowrap
| Source #[+icon("code", 14)] | Source #[+icon("code", 14)]
//-if ALPHA
//- +alpha-info
if IS_MODELS if IS_MODELS
include _page_models include _page_models
else else

View File

@ -62,6 +62,9 @@ svg(style="position: absolute; visibility: hidden; width: 0; height: 0;" width="
symbol#svg_explosion(viewBox="0 0 500 500") symbol#svg_explosion(viewBox="0 0 500 500")
path(fill="currentColor" d="M111.7 74.9L91.2 93.1l9.1 10.2 17.8-15.8 7.4 8.4-17.8 15.8 10.1 11.4 20.6-18.2 7.7 8.7-30.4 26.9-41.9-47.3 30.3-26.9 7.6 8.6zM190.8 59.6L219 84.3l-14.4 4.5-20.4-18.2-6.4 26.6-14.4 4.5 8.9-36.4-26.9-24.1 14.3-4.5L179 54.2l5.7-25.2 14.3-4.5-8.2 35.1zM250.1 21.2l27.1 3.4c6.1.8 10.8 3.1 14 7.2 3.2 4.1 4.5 9.2 3.7 15.5-.8 6.3-3.2 11-7.4 14.1-4.1 3.1-9.2 4.3-15.3 3.5L258 63.2l-2.8 22.3-13-1.6 7.9-62.7zm11.5 13l-2.2 17.5 12.6 1.6c5.1.6 9.1-2 9.8-7.6.7-5.6-2.5-9.2-7.6-9.9l-12.6-1.6zM329.1 95.4l23.8 13.8-5.8 10L312 98.8l31.8-54.6 11.3 6.6-26 44.6zM440.5 145c-1.3 8.4-5.9 15.4-13.9 21.1s-16.2 7.7-24.6 6.1c-8.4-1.6-15.3-6.3-20.8-14.1-5.5-7.9-7.6-16-6.4-24.4 1.3-8.5 6-15.5 14-21.1 8-5.6 16.2-7.7 24.5-6 8.4 1.6 15.4 6.3 20.9 14.2 5.5 7.6 7.6 15.7 6.3 24.2zM412 119c-5.1-.8-10.3.6-15.6 4.4-5.2 3.7-8.4 8.1-9.4 13.2-1 5.2.2 10.1 3.5 14.8 3.4 4.8 7.5 7.5 12.7 8.2 5.2.8 10.4-.7 15.6-4.4 5.3-3.7 8.4-8.1 9.4-13.2 1.1-5.1-.1-9.9-3.4-14.7-3.4-4.8-7.6-7.6-12.8-8.3zM471.5 237.9c-2.8 4.8-7.1 7.6-13 8.7l-2.6-13.1c5.3-.9 8.1-5 7.2-11-.9-5.8-4.3-8.8-8.9-8.2-2.3.3-3.7 1.4-4.5 3.3-.7 1.9-1.4 5.2-1.7 10.1-.8 7.5-2.2 13.1-4.3 16.9-2.1 3.9-5.7 6.2-10.9 7-6.3.9-11.3-.5-15.2-4.4-3.9-3.8-6.3-9-7.3-15.7-1.1-7.4-.2-13.7 2.6-18.8 2.8-5.1 7.4-8.2 13.7-9.2l2.6 13c-5.6 1.1-8.7 6.6-7.7 13.4 1 6.6 3.9 9.5 8.6 8.8 4.4-.7 5.7-4.5 6.7-14.1.3-3.5.7-6.2 1.1-8.4.4-2.2 1.2-4.4 2.2-6.8 2.1-4.7 6-7.2 11.8-8.1 5.4-.8 10.3.4 14.5 3.7 4.2 3.3 6.9 8.5 8 15.6.9 6.9-.1 12.6-2.9 17.3zM408.6 293.5l2.4-12.9 62 11.7-2.4 12.9-62-11.7zM419.6 396.9c-8.3 2-16.5.3-24.8-5-8.2-5.3-13.2-12.1-14.9-20.5-1.6-8.4.1-16.6 5.3-24.6 5.2-8.1 11.9-13.1 20.2-15.1 8.4-1.9 16.6-.3 24.9 5 8.2 5.3 13.2 12.1 14.8 20.5 1.7 8.4 0 16.6-5.2 24.7-5.2 8-12 13-20.3 15zm13.4-36.3c-1.2-5.1-4.5-9.3-9.9-12.8s-10.6-4.7-15.8-3.7-9.3 4-12.4 8.9-4.1 9.8-2.8 14.8c1.2 5.1 4.5 9.3 9.9 12.8 5.5 3.5 10.7 4.8 15.8 3.7 5.1-.9 9.2-3.8 12.3-8.7s4.1-9.9 2.9-15zM303.6 416.5l9.6-5.4 43.3 20.4-19.2-34 11.4-6.4 31 55-9.6 5.4-43.4-20.5 19.2 34.1-11.3 
6.4-31-55zM238.2 468.8c-49 0-96.9-17.4-134.8-49-38.3-32-64-76.7-72.5-125.9-2-11.9-3.1-24-3.1-35.9 0-36.5 9.6-72.6 27.9-104.4 2.1-3.6 6.7-4.9 10.3-2.8 3.6 2.1 4.9 6.7 2.8 10.3-16.9 29.5-25.9 63.1-25.9 96.9 0 11.1 1 22.3 2.9 33.4 7.9 45.7 31.8 87.2 67.3 116.9 35.2 29.3 79.6 45.5 125.1 45.5 11.1 0 22.3-1 33.4-2.9 4.1-.7 8 2 8.7 6.1.7 4.1-2 8-6.1 8.7-11.9 2-24 3.1-36 3.1z") path(fill="currentColor" d="M111.7 74.9L91.2 93.1l9.1 10.2 17.8-15.8 7.4 8.4-17.8 15.8 10.1 11.4 20.6-18.2 7.7 8.7-30.4 26.9-41.9-47.3 30.3-26.9 7.6 8.6zM190.8 59.6L219 84.3l-14.4 4.5-20.4-18.2-6.4 26.6-14.4 4.5 8.9-36.4-26.9-24.1 14.3-4.5L179 54.2l5.7-25.2 14.3-4.5-8.2 35.1zM250.1 21.2l27.1 3.4c6.1.8 10.8 3.1 14 7.2 3.2 4.1 4.5 9.2 3.7 15.5-.8 6.3-3.2 11-7.4 14.1-4.1 3.1-9.2 4.3-15.3 3.5L258 63.2l-2.8 22.3-13-1.6 7.9-62.7zm11.5 13l-2.2 17.5 12.6 1.6c5.1.6 9.1-2 9.8-7.6.7-5.6-2.5-9.2-7.6-9.9l-12.6-1.6zM329.1 95.4l23.8 13.8-5.8 10L312 98.8l31.8-54.6 11.3 6.6-26 44.6zM440.5 145c-1.3 8.4-5.9 15.4-13.9 21.1s-16.2 7.7-24.6 6.1c-8.4-1.6-15.3-6.3-20.8-14.1-5.5-7.9-7.6-16-6.4-24.4 1.3-8.5 6-15.5 14-21.1 8-5.6 16.2-7.7 24.5-6 8.4 1.6 15.4 6.3 20.9 14.2 5.5 7.6 7.6 15.7 6.3 24.2zM412 119c-5.1-.8-10.3.6-15.6 4.4-5.2 3.7-8.4 8.1-9.4 13.2-1 5.2.2 10.1 3.5 14.8 3.4 4.8 7.5 7.5 12.7 8.2 5.2.8 10.4-.7 15.6-4.4 5.3-3.7 8.4-8.1 9.4-13.2 1.1-5.1-.1-9.9-3.4-14.7-3.4-4.8-7.6-7.6-12.8-8.3zM471.5 237.9c-2.8 4.8-7.1 7.6-13 8.7l-2.6-13.1c5.3-.9 8.1-5 7.2-11-.9-5.8-4.3-8.8-8.9-8.2-2.3.3-3.7 1.4-4.5 3.3-.7 1.9-1.4 5.2-1.7 10.1-.8 7.5-2.2 13.1-4.3 16.9-2.1 3.9-5.7 6.2-10.9 7-6.3.9-11.3-.5-15.2-4.4-3.9-3.8-6.3-9-7.3-15.7-1.1-7.4-.2-13.7 2.6-18.8 2.8-5.1 7.4-8.2 13.7-9.2l2.6 13c-5.6 1.1-8.7 6.6-7.7 13.4 1 6.6 3.9 9.5 8.6 8.8 4.4-.7 5.7-4.5 6.7-14.1.3-3.5.7-6.2 1.1-8.4.4-2.2 1.2-4.4 2.2-6.8 2.1-4.7 6-7.2 11.8-8.1 5.4-.8 10.3.4 14.5 3.7 4.2 3.3 6.9 8.5 8 15.6.9 6.9-.1 12.6-2.9 17.3zM408.6 293.5l2.4-12.9 62 11.7-2.4 12.9-62-11.7zM419.6 396.9c-8.3 2-16.5.3-24.8-5-8.2-5.3-13.2-12.1-14.9-20.5-1.6-8.4.1-16.6 5.3-24.6 5.2-8.1 11.9-13.1 
20.2-15.1 8.4-1.9 16.6-.3 24.9 5 8.2 5.3 13.2 12.1 14.8 20.5 1.7 8.4 0 16.6-5.2 24.7-5.2 8-12 13-20.3 15zm13.4-36.3c-1.2-5.1-4.5-9.3-9.9-12.8s-10.6-4.7-15.8-3.7-9.3 4-12.4 8.9-4.1 9.8-2.8 14.8c1.2 5.1 4.5 9.3 9.9 12.8 5.5 3.5 10.7 4.8 15.8 3.7 5.1-.9 9.2-3.8 12.3-8.7s4.1-9.9 2.9-15zM303.6 416.5l9.6-5.4 43.3 20.4-19.2-34 11.4-6.4 31 55-9.6 5.4-43.4-20.5 19.2 34.1-11.3 6.4-31-55zM238.2 468.8c-49 0-96.9-17.4-134.8-49-38.3-32-64-76.7-72.5-125.9-2-11.9-3.1-24-3.1-35.9 0-36.5 9.6-72.6 27.9-104.4 2.1-3.6 6.7-4.9 10.3-2.8 3.6 2.1 4.9 6.7 2.8 10.3-16.9 29.5-25.9 63.1-25.9 96.9 0 11.1 1 22.3 2.9 33.4 7.9 45.7 31.8 87.2 67.3 116.9 35.2 29.3 79.6 45.5 125.1 45.5 11.1 0 22.3-1 33.4-2.9 4.1-.7 8 2 8.7 6.1.7 4.1-2 8-6.1 8.7-11.9 2-24 3.1-36 3.1z")
symbol#svg_prodigy(viewBox="0 0 538.5 157.6")
path(fill="currentColor" d="M70.6 48.6c7 7.3 10.5 17.1 10.5 29.2S77.7 99.7 70.6 107c-6.9 7.3-15.9 11.1-27 11.1-9.4 0-16.8-2.7-21.7-8.2v44.8H0V39h20.7v8.1c4.8-6.4 12.4-9.6 22.9-9.6 11.1 0 20.1 3.7 27 11.1zM21.9 76v3.6c0 12.1 7.3 19.8 18.3 19.8 11.2 0 18.7-7.9 18.7-21.6s-7.5-21.6-18.7-21.6c-11 0-18.3 7.7-18.3 19.8zM133.8 59.4c-12.6 0-20.5 7-20.5 17.8v39.3h-22V39h21.1v8.8c4-6.4 11.2-9.6 21.3-9.6v21.2zM209.5 107.1c-7.6 7.3-17.5 11.1-29.5 11.1s-21.9-3.8-29.7-11.1c-7.6-7.5-11.5-17.2-11.5-29.2 0-12.1 3.9-21.9 11.5-29.2 7.8-7.3 17.7-11.1 29.7-11.1s21.9 3.8 29.5 11.1c7.8 7.3 11.7 17.1 11.7 29.2 0 11.9-3.9 21.7-11.7 29.2zM180 56.2c-5.7 0-10.3 1.9-13.8 5.8-3.5 3.8-5.2 9-5.2 15.7 0 6.7 1.8 12 5.2 15.7 3.4 3.8 8.1 5.7 13.8 5.7s10.3-1.9 13.8-5.7 5.2-9 5.2-15.7c0-6.8-1.8-12-5.2-15.7-3.5-3.8-8.1-5.8-13.8-5.8zM313 116.5h-20.5v-7.9c-4.4 5.5-12.7 9.6-23.1 9.6-10.9 0-19.9-3.8-27-11.1C235.5 99.7 232 90 232 77.8s3.5-21.9 10.3-29.2c7-7.3 16-11.1 27-11.1 9.7 0 17.1 2.7 21.9 8.2V0H313v116.5zm-58.8-38.7c0 13.6 7.5 21.4 18.7 21.4 10.9 0 18.3-7.3 18.3-19.8V76c0-12.2-7.3-19.8-18.3-19.8-11.2 0-18.7 8-18.7 21.6zM354.1 13.6c0 3.6-1.3 6.8-3.9 9.3-5 4.9-13.6 4.9-18.6 0-8.4-7.5-1.6-23.1 9.3-22.5 7.4 0 13.2 5.9 13.2 13.2zm-2.2 102.9H330V39h21.9v77.5zM425.1 47.1V39h20.5v80.4c0 11.2-3.6 20.1-10.6 26.8-7 6.7-16.6 10-28.5 10-23.4 0-36.9-11.4-39.9-29.8l21.7-.8c1 7.6 7.6 12 17.4 12 11.2 0 18.1-5.8 18.1-16.6v-11.1c-5.1 5.5-12.5 8.2-21.9 8.2-10.9 0-19.9-3.8-27-11.1-6.9-7.3-10.3-17.1-10.3-29.2s3.5-21.9 10.3-29.2c7-7.3 16-11.1 27-11.1 10.7 0 18.4 3.1 23.2 9.6zm-38.3 30.7c0 13.6 7.5 21.6 18.7 21.6 11 0 18.3-7.6 18.3-19.8V76c0-12.2-7.3-19.8-18.3-19.8-11.2 0-18.7 8-18.7 21.6zM488.8 154.8H465l19.8-45.1L454.5 39h24.1l17.8 46.2L514.2 39h24.3l-49.7 115.8z")
//- Machine learning & NLP libraries //- Machine learning & NLP libraries

View File

@ -1,6 +1,11 @@
//- 💫 DOCS > API > ANNOTATION > NAMED ENTITIES //- 💫 DOCS > API > ANNOTATION > NAMED ENTITIES
+table([ "Type", "Description" ]) p
| Models trained on the
| #[+a("https://catalog.ldc.upenn.edu/ldc2013t19") OntoNotes 5] corpus
| support the following entity types:
+table(["Type", "Description"])
+row +row
+cell #[code PERSON] +cell #[code PERSON]
+cell People, including fictional. +cell People, including fictional.
@ -45,9 +50,6 @@
+cell #[code LANGUAGE] +cell #[code LANGUAGE]
+cell Any named language. +cell Any named language.
p The following values are also annotated in a style similar to names:
+table([ "Type", "Description" ])
+row +row
+cell #[code DATE] +cell #[code DATE]
+cell Absolute or relative dates or periods. +cell Absolute or relative dates or periods.
@ -75,3 +77,33 @@ p The following values are also annotated in a style similar to names:
+row +row
+cell #[code CARDINAL] +cell #[code CARDINAL]
+cell Numerals that do not fall under another type. +cell Numerals that do not fall under another type.
+h(4, "ner-wikipedia-scheme") Wikipedia scheme
p
| Models trained on Wikipedia corpus
| (#[+a("http://www.sciencedirect.com/science/article/pii/S0004370212000276") Nothman et al., 2013])
| use a less fine-grained NER annotation scheme and recognise the
| following entities:
+table(["Type", "Description"])
+row
+cell #[code PER]
+cell Named person or family.
+row
+cell #[code LOC]
+cell
| Name of politically or geographically defined location (cities,
| provinces, countries, international regions, bodies of water,
| mountains).
+row
+cell #[code ORG]
+cell Named corporate, governmental, or other organizational entity.
+row
+cell #[code MISC]
+cell
| Miscellaneous entities, e.g. events, nationalities, products or
| works of art.

View File

@ -1,5 +1,7 @@
//- 💫 DOCS > API > ANNOTATION > TRAINING //- 💫 DOCS > API > ANNOTATION > TRAINING
+h(3, "json-input") JSON input format for training
p p
| spaCy takes training data in JSON format. The built-in | spaCy takes training data in JSON format. The built-in
| #[+api("cli#convert") #[code convert]] command helps you convert the | #[+api("cli#convert") #[code convert]] command helps you convert the
@ -46,3 +48,57 @@ p
| Treebank: | Treebank:
+github("spacy", "examples/training/training-data.json", false, false, "json") +github("spacy", "examples/training/training-data.json", false, false, "json")
+h(3, "vocab-jsonl") Lexical data for vocabulary
+tag-new(2)
p
| The populate a model's vocabulary, you can use the
| #[+api("cli#vocab") #[code spacy vocab]] command and load in a
| #[+a("https://jsonlines.readthedocs.io/en/latest/") newline-delimited JSON]
| (JSONL) file containing one lexical entry per line. The first line
| defines the language and vocabulary settings. All other lines are
| expected to be JSON objects describing an individual lexeme. The lexical
| attributes will be then set as attributes on spaCy's
| #[+api("lexeme#attributes") #[code Lexeme]] object. The #[code vocab]
| command outputs a ready-to-use spaCy model with a #[code Vocab]
| containing the lexical data.
+code("First line").
{"lang": "en", "settings": {"oov_prob": -20.502029418945312}}
+code("Entry structure").
{
"orth": string,
"id": int,
"lower": string,
"norm": string,
"shape": string
"prefix": string,
"suffix": string,
"length": int,
"cluster": string,
"prob": float,
"is_alpha": bool,
"is_ascii": bool,
"is_digit": bool,
"is_lower": bool,
"is_punct": bool,
"is_space": bool,
"is_title": bool,
"is_upper": bool,
"like_url": bool,
"like_num": bool,
"like_email": bool,
"is_stop": bool,
"is_oov": bool,
"is_quote": bool,
"is_left_punct": bool,
"is_right_punct": bool
}
p
| Here's an example of the 20 most frequent lexemes in the English
| training data:
+github("spacy", "examples/training/vocab-data.jsonl", false, false, "json")

View File

@ -3,8 +3,10 @@
"Overview": { "Overview": {
"Architecture": "./", "Architecture": "./",
"Annotation Specs": "annotation", "Annotation Specs": "annotation",
"Command Line": "cli",
"Functions": "top-level" "Functions": "top-level"
}, },
"Containers": { "Containers": {
"Doc": "doc", "Doc": "doc",
"Token": "token", "Token": "token",
@ -45,14 +47,19 @@
} }
}, },
"cli": {
"title": "Command Line Interface",
"teaser": "Download, train and package models, and debug spaCy.",
"source": "spacy/cli"
},
"top-level": { "top-level": {
"title": "Top-level Functions", "title": "Top-level Functions",
"menu": { "menu": {
"spacy": "spacy", "spacy": "spacy",
"displacy": "displacy", "displacy": "displacy",
"Utility Functions": "util", "Utility Functions": "util",
"Compatibility": "compat", "Compatibility": "compat"
"Command Line": "cli"
} }
}, },
@ -213,7 +220,7 @@
"Lemmatization": "lemmatization", "Lemmatization": "lemmatization",
"Dependencies": "dependency-parsing", "Dependencies": "dependency-parsing",
"Named Entities": "named-entities", "Named Entities": "named-entities",
"Training Data": "training" "Models & Training": "training"
} }
} }
} }

View File

@ -85,7 +85,9 @@ p
+row +row
+cell #[code name] +cell #[code name]
+cell unicode +cell unicode
+cell ISO code of the language class to load. +cell
| #[+a("https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes") ISO code]
| of the language class to load.
+row +row
+cell #[code disable] +cell #[code disable]

View File

@ -99,6 +99,6 @@ p This document describes the target annotations spaCy is trained to predict.
include _annotation/_biluo include _annotation/_biluo
+section("training") +section("training")
+h(2, "json-input") JSON input format for training +h(2, "training") Models and training data
include _annotation/_training include _annotation/_training

View File

@ -1,4 +1,6 @@
//- 💫 DOCS > API > TOP-LEVEL > COMMAND LINE INTERFACE //- 💫 DOCS > API > COMMAND LINE INTERFACE
include ../_includes/_mixins
p p
| As of v1.7.0, spaCy comes with new command line helpers to download and | As of v1.7.0, spaCy comes with new command line helpers to download and
@ -34,6 +36,13 @@ p
+cell flag +cell flag
+cell Show help message and available arguments. +cell Show help message and available arguments.
+row("foot")
+cell creates
+cell directory, symlink
+cell
| The installed model package in your #[code site-packages]
| directory and a shortcut link as a symlink in #[code spacy/data].
+aside("Downloading best practices") +aside("Downloading best practices")
| The #[code download] command is mostly intended as a convenient, | The #[code download] command is mostly intended as a convenient,
| interactive wrapper it performs compatibility checks and prints | interactive wrapper it performs compatibility checks and prints
@ -86,6 +95,13 @@ p
+cell flag +cell flag
+cell Show help message and available arguments. +cell Show help message and available arguments.
+row("foot")
+cell creates
+cell symlink
+cell
| A shortcut link of the given name as a symlink in
| #[code spacy/data].
+h(3, "info") Info +h(3, "info") Info
p p
@ -113,6 +129,11 @@ p
+cell flag +cell flag
+cell Show help message and available arguments. +cell Show help message and available arguments.
+row("foot")
+cell prints
+cell #[code stdout]
+cell Information about your spaCy installation.
+h(3, "validate") Validate +h(3, "validate") Validate
+tag-new(2) +tag-new(2)
@ -129,6 +150,12 @@ p
+code(false, "bash", "$"). +code(false, "bash", "$").
spacy validate spacy validate
+table(["Argument", "Type", "Description"])
+row("foot")
+cell prints
+cell #[code stdout]
+cell Details about the compatibility of your installed models.
+h(3, "convert") Convert +h(3, "convert") Convert
p p
@ -172,6 +199,11 @@ p
+cell flag +cell flag
+cell Show help message and available arguments. +cell Show help message and available arguments.
+row("foot")
+cell creates
+cell JSON
+cell Data in spaCy's #[+a("/api/annotation#json-input") JSON format].
p The following converters are available: p The following converters are available:
+table(["ID", "Description"]) +table(["ID", "Description"])
@ -286,6 +318,11 @@ p
+cell flag +cell flag
+cell Show help message and available arguments. +cell Show help message and available arguments.
+row("foot")
+cell creates
+cell model, pickle
+cell A spaCy model on each epoch, and a final #[code .pickle] file.
+h(4, "train-hyperparams") Environment variables for hyperparameters +h(4, "train-hyperparams") Environment variables for hyperparameters
+tag-new(2) +tag-new(2)
@ -395,6 +432,50 @@ p
+cell Gradient L2 norm constraint. +cell Gradient L2 norm constraint.
+cell #[code 1.0] +cell #[code 1.0]
+h(3, "vocab") Vocab
+tag-new(2)
p
| Compile a vocabulary from a
| #[+a("/api/annotation#vocab-jsonl") lexicon JSONL] file and optional
| word vectors. Will save out a valid spaCy model that you can load via
| #[+api("spacy#load") #[code spacy.load]] or package using the
| #[+api("cli#package") #[code package]] command.
+code(false, "bash", "$").
spacy vocab [lang] [output_dir] [lexemes_loc] [vectors_loc]
+table(["Argument", "Type", "Description"])
+row
+cell #[code lang]
+cell positional
+cell
| Model language
| #[+a("https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes") ISO code],
| e.g. #[code en].
+row
+cell #[code output_dir]
+cell positional
+cell Model output directory. Will be created if it doesn't exist.
+row
+cell #[code lexemes_loc]
+cell positional
+cell
| Location of lexical data in spaCy's
| #[+a("/api/annotation#vocab-jsonl") JSONL format].
+row
+cell #[code vectors_loc]
+cell positional
+cell Optional location of vectors data as numpy #[code .npz] file.
+row("foot")
+cell creates
+cell model
+cell A spaCy model containing the vocab and vectors.
+h(3, "evaluate") Evaluate +h(3, "evaluate") Evaluate
+tag-new(2) +tag-new(2)
@ -447,22 +528,36 @@ p
+cell flag +cell flag
+cell Use gold preprocessing. +cell Use gold preprocessing.
+row("foot")
+cell prints / creates
+cell #[code stdout], HTML
+cell Training results and optional displaCy visualizations.
+h(3, "package") Package +h(3, "package") Package
p p
| Generate a #[+a("/usage/training#models-generating") model Python package] | Generate a #[+a("/usage/training#models-generating") model Python package]
| from an existing model data directory. All data files are copied over. | from an existing model data directory. All data files are copied over.
| If the path to a meta.json is supplied, or a meta.json is found in the | If the path to a #[code meta.json] is supplied, or a #[code meta.json] is
| input directory, this file is used. Otherwise, the data can be entered | found in the input directory, this file is used. Otherwise, the data can
| directly from the command line. The required file templates are downloaded | be entered directly from the command line. The required file templates
| from #[+src(gh("spacy-dev-resources", "templates/model")) GitHub] to make | are downloaded from
| #[+src(gh("spacy-dev-resources", "templates/model")) GitHub] to make
| sure you're always using the latest versions. This means you need to be | sure you're always using the latest versions. This means you need to be
| connected to the internet to use this command. | connected to the internet to use this command. After packaging, you
| can run #[code python setup.py sdist] from the newly created directory
| to turn your model into an installable archive file.
+code(false, "bash", "$", false, false, true). +code(false, "bash", "$", false, false, true).
spacy package [input_dir] [output_dir] [--meta-path] [--create-meta] [--force] spacy package [input_dir] [output_dir] [--meta-path] [--create-meta] [--force]
+aside-code("Example", "bash").
spacy package /input /output
cd /output/en_model-0.0.0
python setup.py sdist
pip install dist/en_model-0.0.0.tar.gz
+table(["Argument", "Type", "Description"]) +table(["Argument", "Type", "Description"])
+row +row
+cell #[code input_dir] +cell #[code input_dir]
@ -477,15 +572,16 @@ p
+row +row
+cell #[code --meta-path], #[code -m] +cell #[code --meta-path], #[code -m]
+cell option +cell option
+cell #[+tag-new(2)] Path to meta.json file (optional). +cell #[+tag-new(2)] Path to #[code meta.json] file (optional).
+row +row
+cell #[code --create-meta], #[code -c] +cell #[code --create-meta], #[code -c]
+cell flag +cell flag
+cell +cell
| #[+tag-new(2)] Create a meta.json file on the command line, even | #[+tag-new(2)] Create a #[code meta.json] file on the command
| if one already exists in the directory. | line, even if one already exists in the directory. If an
| existing file is found, its entries will be shown as the defaults
| in the command line prompt.
+row +row
+cell #[code --force], #[code -f] +cell #[code --force], #[code -f]
+cell flag +cell flag
@ -495,3 +591,8 @@ p
+cell #[code --help], #[code -h] +cell #[code --help], #[code -h]
+cell flag +cell flag
+cell Show help message and available arguments. +cell Show help message and available arguments.
+row("foot")
+cell creates
+cell directory
+cell A Python package containing the spaCy model.

View File

@ -18,7 +18,3 @@ include ../_includes/_mixins
+section("compat") +section("compat")
+h(2, "compat", "spacy/compaty.py") Compatibility functions +h(2, "compat", "spacy/compaty.py") Compatibility functions
include _top-level/_compat include _top-level/_compat
+section("cli", "spacy/cli")
+h(2, "cli") Command line
include _top-level/_cli

View File

@ -162,7 +162,7 @@ p
+cell int +cell int
+cell The integer ID by which the flag value can be checked. +cell The integer ID by which the flag value can be checked.
+h(2, "add_flag") Vocab.clear_vectors +h(2, "clear_vectors") Vocab.clear_vectors
+tag method +tag method
+tag-new(2) +tag-new(2)
@ -181,7 +181,50 @@ p
| Number of dimensions of the new vectors. If #[code None], size | Number of dimensions of the new vectors. If #[code None], size
| is not changed. | is not changed.
+h(2, "add_flag") Vocab.get_vector +h(2, "prune_vectors") Vocab.prune_vectors
+tag method
+tag-new(2)
p
| Reduce the current vector table to #[code nr_row] unique entries. Words
| mapped to the discarded vectors will be remapped to the closest vector
| among those remaining. For example, suppose the original table had
| vectors for the words:
| #[code.u-break ['sat', 'cat', 'feline', 'reclined']]. If we prune the
| vector table to two rows, we would discard the vectors for "feline"
| and "reclined". These words would then be remapped to the closest
| remaining vector so "feline" would have the same vector as "cat",
| and "reclined" would have the same vector as "sat". The similarities are
| judged by cosine. The original vectors may be large, so the cosines are
| calculated in minibatches, to reduce memory usage.
+aside-code("Example").
nlp.vocab.prune_vectors(10000)
assert len(nlp.vocab.vectors) &lt;= 10000
+table(["Name", "Type", "Description"])
+row
+cell #[code nr_row]
+cell int
+cell The number of rows to keep in the vector table.
+row
+cell #[code batch_size]
+cell int
+cell
| Batch size to use when calculating the similarities. Larger batch
| sizes might be faster, while temporarily requiring more memory.
+row("foot")
+cell returns
+cell dict
+cell
| A dictionary keyed by removed words mapped to
| #[code (string, score)] tuples, where #[code string] is the entry
| the removed word was mapped to, and #[code score] the similarity
| score between the two words.
+h(2, "get_vector") Vocab.get_vector
+tag method +tag method
+tag-new(2) +tag-new(2)
@ -206,7 +249,7 @@ p
| A word vector. Size and shape are determined by the | A word vector. Size and shape are determined by the
| #[code Vocab.vectors] instance. | #[code Vocab.vectors] instance.
+h(2, "add_flag") Vocab.set_vector +h(2, "set_vector") Vocab.set_vector
+tag method +tag method
+tag-new(2) +tag-new(2)
@ -228,7 +271,7 @@ p
+cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']] +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
+cell The vector to set. +cell The vector to set.
+h(2, "add_flag") Vocab.has_vector +h(2, "has_vector") Vocab.has_vector
+tag method +tag method
+tag-new(2) +tag-new(2)

View File

@ -48,6 +48,9 @@
flex: 0 0 100% flex: 0 0 100%
flex-flow: column wrap flex-flow: column wrap
&.o-grid__col--no-gutter
margin-top: 0
// Fix overflow issue in old browsers // Fix overflow issue in old browsers
& > * & > *

View File

@ -8,7 +8,7 @@
align-items: center align-items: center
display: flex display: flex
justify-content: space-between justify-content: space-between
flex-flow: row wrap flex-flow: row nowrap
padding: 0 2rem 0 1rem padding: 0 2rem 0 1rem
z-index: 30 z-index: 30
width: 100% width: 100%

View File

@ -51,6 +51,7 @@
@include scroll-shadow-base($color-front) @include scroll-shadow-base($color-front)
display: inline-block display: inline-block
overflow-x: auto overflow-x: auto
overflow-y: hidden
width: auto width: auto
-webkit-overflow-scrolling: touch -webkit-overflow-scrolling: touch

View File

@ -3,7 +3,7 @@
+h(2, "changelog") Changelog +h(2, "changelog") Changelog
+button(gh("spacy") + "/releases", false, "secondary", "small").u-float-right.u-nowrap View releases +button(gh("spacy") + "/releases", false, "secondary", "small").u-float-right.u-nowrap View releases
div(data-tpl="changelog" data-tpl-key="error") div(data-tpl="changelog" data-tpl-key="error" style="display: none")
+infobox +infobox
| Unable to load changelog from GitHub. Please see the | Unable to load changelog from GitHub. Please see the
| #[+a(gh("spacy") + "/releases") releases page] instead. | #[+a(gh("spacy") + "/releases") releases page] instead.

View File

@ -76,6 +76,16 @@ p
("Google rebrands its business apps", [(0, 6, "ORG")]), ("Google rebrands its business apps", [(0, 6, "ORG")]),
("look what i found on google! 😂", [(21, 27, "PRODUCT")])] ("look what i found on google! 😂", [(21, 27, "PRODUCT")])]
+infobox("Tip: Try the Prodigy annotation tool")
+infobox-logos(["prodigy", 100, 29, "https://prodi.gy"])
| If you need to label a lot of data, check out
| #[+a("https://prodi.gy", true) Prodigy], a new, active learning-powered
| annotation tool we've developed. Prodigy is fast and extensible, and
| comes with a modern #[strong web application] that helps you collect
| training data faster. It integrates seamlessly with spaCy, pre-selects
| the #[strong most relevant examples] for annotation, and lets you
| train and evaluate ready-to-use spaCy models.
+h(3, "annotations") Training with annotations +h(3, "annotations") Training with annotations
p p
@ -180,9 +190,10 @@ p
+cell #[code optimizer] +cell #[code optimizer]
+cell Callable to update the model's weights. +cell Callable to update the model's weights.
+infobox p
| For the #[strong full example and more details], see the usage guide on | Instead of writing your own training loop, you can also use the
| #[+a("/usage/training#ner") training the named entity recognizer], | built-in #[+api("cli#train") #[code train]] command, which expects data
| or the runnable | in spaCy's #[+a("/api/annotation#json-input") JSON format]. On each epoch,
| #[+src(gh("spaCy", "examples/training/train_ner.py")) training script] | a model will be saved out to the directory. After training, you can
| on GitHub. | use the #[+api("cli#package") #[code package]] command to generate an
| installable Python package from your model.

View File

@ -190,7 +190,3 @@ p
+item +item
| #[strong Test] the model to make sure the parser works as expected. | #[strong Test] the model to make sure the parser works as expected.
+h(3, "training-json") JSON format for training
include ../../api/_annotation/_training