mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-24 17:06:29 +03:00
WIP on vectors fixes
This commit is contained in:
commit
9c11ee4a1c
21
examples/training/vocab-data.jsonl
Normal file
21
examples/training/vocab-data.jsonl
Normal file
|
@ -0,0 +1,21 @@
|
|||
{"lang": "en", "settings": {"oov_prob": -20.502029418945312}}
|
||||
{"orth": ".", "id": 1, "lower": ".", "norm": ".", "shape": ".", "prefix": ".", "suffix": ".", "length": 1, "cluster": "8", "prob": -3.0678977966308594, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": true, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
|
||||
{"orth": ",", "id": 2, "lower": ",", "norm": ",", "shape": ",", "prefix": ",", "suffix": ",", "length": 1, "cluster": "4", "prob": -3.4549596309661865, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": true, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
|
||||
{"orth": "the", "id": 3, "lower": "the", "norm": "the", "shape": "xxx", "prefix": "t", "suffix": "the", "length": 3, "cluster": "11", "prob": -3.528766632080078, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
|
||||
{"orth": "I", "id": 4, "lower": "i", "norm": "I", "shape": "X", "prefix": "I", "suffix": "I", "length": 1, "cluster": "346", "prob": -3.791565179824829, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": true, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
|
||||
{"orth": "to", "id": 5, "lower": "to", "norm": "to", "shape": "xx", "prefix": "t", "suffix": "to", "length": 2, "cluster": "12", "prob": -3.8560216426849365, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
|
||||
{"orth": "a", "id": 6, "lower": "a", "norm": "a", "shape": "x", "prefix": "a", "suffix": "a", "length": 1, "cluster": "19", "prob": -3.92978835105896, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
|
||||
{"orth": "and", "id": 7, "lower": "and", "norm": "and", "shape": "xxx", "prefix": "a", "suffix": "and", "length": 3, "cluster": "20", "prob": -4.113108158111572, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
|
||||
{"orth": "of", "id": 8, "lower": "of", "norm": "of", "shape": "xx", "prefix": "o", "suffix": "of", "length": 2, "cluster": "28", "prob": -4.27587366104126, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
|
||||
{"orth": "you", "id": 9, "lower": "you", "norm": "you", "shape": "xxx", "prefix": "y", "suffix": "you", "length": 3, "cluster": "602", "prob": -4.373791217803955, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
|
||||
{"orth": "it", "id": 10, "lower": "it", "norm": "it", "shape": "xx", "prefix": "i", "suffix": "it", "length": 2, "cluster": "474", "prob": -4.388050079345703, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
|
||||
{"orth": "is", "id": 11, "lower": "is", "norm": "is", "shape": "xx", "prefix": "i", "suffix": "is", "length": 2, "cluster": "762", "prob": -4.457748889923096, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
|
||||
{"orth": "that", "id": 12, "lower": "that", "norm": "that", "shape": "xxxx", "prefix": "t", "suffix": "hat", "length": 4, "cluster": "84", "prob": -4.464504718780518, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
|
||||
{"orth": "\n\n", "id": 0, "lower": "\n\n", "norm": "\n\n", "shape": "\n\n", "prefix": "\n", "suffix": "\n\n", "length": 2, "cluster": "0", "prob": -4.606560707092285, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": true, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
|
||||
{"orth": "in", "id": 13, "lower": "in", "norm": "in", "shape": "xx", "prefix": "i", "suffix": "in", "length": 2, "cluster": "60", "prob": -4.619071960449219, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
|
||||
{"orth": "'s", "id": 14, "lower": "'s", "norm": "'s", "shape": "'x", "prefix": "'", "suffix": "'s", "length": 2, "cluster": "52", "prob": -4.830559253692627, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
|
||||
{"orth": "n't", "id": 15, "lower": "n't", "norm": "n't", "shape": "x'x", "prefix": "n", "suffix": "n't", "length": 3, "cluster": "74", "prob": -4.859938621520996, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
|
||||
{"orth": "for", "id": 16, "lower": "for", "norm": "for", "shape": "xxx", "prefix": "f", "suffix": "for", "length": 3, "cluster": "508", "prob": -4.8801093101501465, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
|
||||
{"orth": "\"", "id": 17, "lower": "\"", "norm": "\"", "shape": "\"", "prefix": "\"", "suffix": "\"", "length": 1, "cluster": "0", "prob": -5.02677583694458, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": true, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": true, "is_left_punct": true, "is_right_punct": true}
|
||||
{"orth": "?", "id": 18, "lower": "?", "norm": "?", "shape": "?", "prefix": "?", "suffix": "?", "length": 1, "cluster": "0", "prob": -5.05924654006958, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": true, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
|
||||
{"orth": " ", "id": 0, "lower": " ", "norm": " ", "shape": " ", "prefix": " ", "suffix": " ", "length": 1, "cluster": "0", "prob": -5.129165172576904, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": true, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
|
|
@ -7,7 +7,7 @@ if __name__ == '__main__':
|
|||
import plac
|
||||
import sys
|
||||
from spacy.cli import download, link, info, package, train, convert, model
|
||||
from spacy.cli import profile, evaluate, validate
|
||||
from spacy.cli import vocab, profile, evaluate, validate
|
||||
from spacy.util import prints
|
||||
|
||||
commands = {
|
||||
|
@ -19,6 +19,7 @@ if __name__ == '__main__':
|
|||
'convert': convert,
|
||||
'package': package,
|
||||
'model': model,
|
||||
'vocab': vocab,
|
||||
'profile': profile,
|
||||
'validate': validate
|
||||
}
|
||||
|
|
|
@ -7,4 +7,5 @@ from .train import train
|
|||
from .evaluate import evaluate
|
||||
from .convert import convert
|
||||
from .model import model
|
||||
from .vocab import make_vocab as vocab
|
||||
from .validate import validate
|
||||
|
|
|
@ -17,14 +17,14 @@ numpy.random.seed(0)
|
|||
|
||||
|
||||
@plac.annotations(
|
||||
model=("Model name or path", "positional", None, str),
|
||||
data_path=("Location of JSON-formatted evaluation data", "positional",
|
||||
model=("model name or path", "positional", None, str),
|
||||
data_path=("location of JSON-formatted evaluation data", "positional",
|
||||
None, str),
|
||||
gold_preproc=("Use gold preprocessing", "flag", "G", bool),
|
||||
gpu_id=("Use GPU", "option", "g", int),
|
||||
displacy_path=("Directory to output rendered parses as HTML", "option",
|
||||
gold_preproc=("use gold preprocessing", "flag", "G", bool),
|
||||
gpu_id=("use GPU", "option", "g", int),
|
||||
displacy_path=("directory to output rendered parses as HTML", "option",
|
||||
"dp", str),
|
||||
displacy_limit=("Limit of parses to render as HTML", "option", "dl", int))
|
||||
displacy_limit=("limit of parses to render as HTML", "option", "dl", int))
|
||||
def evaluate(cmd, model, data_path, gpu_id=-1, gold_preproc=False,
|
||||
displacy_path=None, displacy_limit=25):
|
||||
"""
|
||||
|
|
|
@ -16,10 +16,11 @@ from .. import about
|
|||
input_dir=("directory with model data", "positional", None, str),
|
||||
output_dir=("output parent directory", "positional", None, str),
|
||||
meta_path=("path to meta.json", "option", "m", str),
|
||||
create_meta=("create meta.json, even if one exists in directory", "flag",
|
||||
"c", bool),
|
||||
force=("force overwriting of existing folder in output directory", "flag",
|
||||
"f", bool))
|
||||
create_meta=("create meta.json, even if one exists in directory – if "
|
||||
"existing meta is found, entries are shown as defaults in "
|
||||
"the command line prompt", "flag", "c", bool),
|
||||
force=("force overwriting of existing model directory in output directory",
|
||||
"flag", "f", bool))
|
||||
def package(cmd, input_dir, output_dir, meta_path=None, create_meta=False,
|
||||
force=False):
|
||||
"""
|
||||
|
@ -41,13 +42,13 @@ def package(cmd, input_dir, output_dir, meta_path=None, create_meta=False,
|
|||
template_manifest = get_template('MANIFEST.in')
|
||||
template_init = get_template('xx_model_name/__init__.py')
|
||||
meta_path = meta_path or input_path / 'meta.json'
|
||||
if not create_meta and meta_path.is_file():
|
||||
prints(meta_path, title="Reading meta.json from file")
|
||||
if meta_path.is_file():
|
||||
meta = util.read_json(meta_path)
|
||||
else:
|
||||
meta = generate_meta(input_dir)
|
||||
if not create_meta: # only print this if user doesn't want to overwrite
|
||||
prints(meta_path, title="Loaded meta.json from file")
|
||||
else:
|
||||
meta = generate_meta(input_dir, meta)
|
||||
meta = validate_meta(meta, ['lang', 'name', 'version'])
|
||||
|
||||
model_name = meta['lang'] + '_' + meta['name']
|
||||
model_name_v = model_name + '-' + meta['version']
|
||||
main_path = output_path / model_name_v
|
||||
|
@ -82,18 +83,19 @@ def create_file(file_path, contents):
|
|||
file_path.open('w', encoding='utf-8').write(contents)
|
||||
|
||||
|
||||
def generate_meta(model_path):
|
||||
meta = {}
|
||||
settings = [('lang', 'Model language', 'en'),
|
||||
('name', 'Model name', 'model'),
|
||||
('version', 'Model version', '0.0.0'),
|
||||
def generate_meta(model_path, existing_meta):
|
||||
meta = existing_meta or {}
|
||||
settings = [('lang', 'Model language', meta.get('lang', 'en')),
|
||||
('name', 'Model name', meta.get('name', 'model')),
|
||||
('version', 'Model version', meta.get('version', '0.0.0')),
|
||||
('spacy_version', 'Required spaCy version',
|
||||
'>=%s,<3.0.0' % about.__version__),
|
||||
('description', 'Model description', False),
|
||||
('author', 'Author', False),
|
||||
('email', 'Author email', False),
|
||||
('url', 'Author website', False),
|
||||
('license', 'License', 'CC BY-NC 3.0')]
|
||||
('description', 'Model description',
|
||||
meta.get('description', False)),
|
||||
('author', 'Author', meta.get('author', False)),
|
||||
('email', 'Author email', meta.get('email', False)),
|
||||
('url', 'Author website', meta.get('url', False)),
|
||||
('license', 'License', meta.get('license', 'CC BY-SA 3.0'))]
|
||||
nlp = util.load_model_from_path(Path(model_path))
|
||||
meta['pipeline'] = nlp.pipe_names
|
||||
meta['vectors'] = {'width': nlp.vocab.vectors_length,
|
||||
|
|
|
@ -32,6 +32,7 @@ numpy.random.seed(0)
|
|||
n_sents=("number of sentences", "option", "ns", int),
|
||||
use_gpu=("Use GPU", "option", "g", int),
|
||||
vectors=("Model to load vectors from", "option", "v"),
|
||||
vectors_limit=("Truncate to N vectors (requires -v)", "option", None, int),
|
||||
no_tagger=("Don't train tagger", "flag", "T", bool),
|
||||
no_parser=("Don't train parser", "flag", "P", bool),
|
||||
no_entities=("Don't train NER", "flag", "N", bool),
|
||||
|
@ -40,9 +41,9 @@ numpy.random.seed(0)
|
|||
meta_path=("Optional path to meta.json. All relevant properties will be "
|
||||
"overwritten.", "option", "m", Path))
|
||||
def train(cmd, lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
|
||||
use_gpu=-1, vectors=None, no_tagger=False, no_parser=False,
|
||||
no_entities=False, gold_preproc=False, version="0.0.0",
|
||||
meta_path=None):
|
||||
use_gpu=-1, vectors=None, vectors_limit=None, no_tagger=False,
|
||||
no_parser=False, no_entities=False, gold_preproc=False,
|
||||
version="0.0.0", meta_path=None):
|
||||
"""
|
||||
Train a model. Expects data in spaCy's JSON format.
|
||||
"""
|
||||
|
@ -95,10 +96,7 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
|
|||
if vectors:
|
||||
util.load_model(vectors, vocab=nlp.vocab)
|
||||
if vectors_limit is not None:
|
||||
remap = nlp.vocab.prune_vectors(vectors_limit)
|
||||
print('remap', len(remap))
|
||||
for key, (value, sim) in remap.items():
|
||||
print(repr(key), repr(value), sim)
|
||||
nlp.vocab.prune_vectors(vectors_limit)
|
||||
for name in pipeline:
|
||||
nlp.add_pipe(nlp.create_pipe(name), name=name)
|
||||
optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu)
|
||||
|
|
54
spacy/cli/vocab.py
Normal file
54
spacy/cli/vocab.py
Normal file
|
@ -0,0 +1,54 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import plac
|
||||
import json
|
||||
import spacy
|
||||
import numpy
|
||||
from pathlib import Path
|
||||
|
||||
from ..util import prints, ensure_path
|
||||
|
||||
|
||||
@plac.annotations(
    lang=("model language", "positional", None, str),
    output_dir=("model output directory", "positional", None, Path),
    lexemes_loc=("location of JSONL-formatted lexical data", "positional",
                 None, Path),
    vectors_loc=("optional: location of vectors data, as numpy .npz",
                 "positional", None, str))
def make_vocab(cmd, lang, output_dir, lexemes_loc, vectors_loc=None):
    """Compile a vocabulary from a lexicon jsonl file and word vectors.

    cmd: Command name passed in by the CLI dispatcher (unused here).
    lang (unicode): Language code handed to `spacy.blank`.
    output_dir (Path): Directory the resulting model is written to. It is
        created, including missing parents, if it does not exist.
    lexemes_loc (Path): JSONL file with one JSON object per line. A line
        containing a 'settings' key updates `nlp.vocab.cfg`; every other
        non-blank line is treated as the attributes of one lexeme.
    vectors_loc (unicode / Path / None): Optional array file readable by
        `numpy.load`. Row `i` is assigned to the lexeme whose rank is `i`.
    RETURNS: The blank `nlp` object after it has been populated and saved.
    """
    if not lexemes_loc.exists():
        prints(lexemes_loc, title="Can't find lexical data", exits=1)
    # presumably ensure_path returns a pathlib.Path for string input and
    # passes None through unchanged -- confirm against spacy.util.
    vectors_loc = ensure_path(vectors_loc)
    if vectors_loc is not None and not vectors_loc.exists():
        prints(vectors_loc, title="Can't find vectors data", exits=1)
    nlp = spacy.blank(lang)
    # Reset the rank of every lexeme the blank pipeline already contains, so
    # only entries from the lexicon file carry a non-zero rank afterwards.
    for word in nlp.vocab:
        word.rank = 0
    lex_added = 0
    vec_added = 0
    # Read as UTF-8 explicitly instead of relying on the locale default.
    with lexemes_loc.open(encoding='utf8') as file_:
        for line in file_:
            if not line.strip():
                continue
            attrs = json.loads(line)
            if 'settings' in attrs:
                nlp.vocab.cfg.update(attrs['settings'])
            else:
                lex = nlp.vocab[attrs['orth']]
                lex.set_attrs(**attrs)
                assert lex.rank == attrs['id'], (
                    "rank %r does not match id %r for entry %r"
                    % (lex.rank, attrs['id'], attrs['orth']))
                lex_added += 1
    if vectors_loc is not None:
        # Path.open works on every supported Python version (builtin open()
        # only accepts Path objects from 3.6 on) and the context manager
        # makes sure the handle is closed again.
        # NOTE(review): the code below needs an ndarray (`.shape`), which is
        # what numpy.load returns for .npy files; the CLI help text mentions
        # .npz -- confirm which format is intended.
        with vectors_loc.open('rb') as file_:
            vector_data = numpy.load(file_)
        nlp.vocab.clear_vectors(width=vector_data.shape[1])
        for word in nlp.vocab:
            # Rank 0 is skipped: it is the value every lexeme was reset to.
            if word.rank:
                nlp.vocab.vectors.add(word.orth_, row=word.rank,
                                      vector=vector_data[word.rank])
                vec_added += 1
    if not output_dir.exists():
        output_dir.mkdir(parents=True)
    nlp.to_disk(output_dir)
    prints("{} entries, {} vectors".format(lex_added, vec_added), output_dir,
           title="Successfully compiled vocab and vectors, and saved model")
    return nlp
|
|
@ -300,5 +300,15 @@ GLOSSARY = {
|
|||
'MONEY': 'Monetary values, including unit',
|
||||
'QUANTITY': 'Measurements, as of weight or distance',
|
||||
'ORDINAL': '"first", "second", etc.',
|
||||
'CARDINAL': 'Numerals that do not fall under another type'
|
||||
'CARDINAL': 'Numerals that do not fall under another type',
|
||||
|
||||
|
||||
# Named Entity Recognition
|
||||
# Wikipedia
|
||||
# http://www.sciencedirect.com/science/article/pii/S0004370212000276
|
||||
# https://pdfs.semanticscholar.org/5744/578cc243d92287f47448870bb426c66cc941.pdf
|
||||
|
||||
'PER': 'Named person or family.',
|
||||
'MISC': ('Miscellaneous entities, e.g. events, nationalities, '
|
||||
'products or works of art'),
|
||||
}
|
||||
|
|
|
@ -154,6 +154,8 @@ class Language(object):
|
|||
self._meta.setdefault('email', '')
|
||||
self._meta.setdefault('url', '')
|
||||
self._meta.setdefault('license', '')
|
||||
self._meta['vectors'] = {'width': self.vocab.vectors_length,
|
||||
'entries': len(self.vocab.vectors)}
|
||||
self._meta['pipeline'] = self.pipe_names
|
||||
return self._meta
|
||||
|
||||
|
|
|
@ -13,6 +13,8 @@ from .typedefs cimport attr_t, flags_t
|
|||
from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
|
||||
from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
|
||||
from .attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT, IS_OOV
|
||||
from .attrs cimport PROB
|
||||
from .attrs import intify_attrs
|
||||
from . import about
|
||||
|
||||
|
||||
|
@ -68,6 +70,19 @@ cdef class Lexeme:
|
|||
def __hash__(self):
|
||||
return self.c.orth
|
||||
|
||||
def set_attrs(self, **attrs):
    """Set several lexical attributes at once from keyword arguments.

    The keyword arguments are first passed through `intify_attrs`. Then,
    for each resulting (attribute, value) pair:

    * PROB is written to `self.c.prob` as-is.
    * CLUSTER is converted with `int()` and written to `self.c.cluster`.
    * Integer values are stored with `Lexeme.set_struct_attr`.
    * Any other value is added to `self.vocab.strings` and the result of
      that call is stored with `Lexeme.set_struct_attr`.

    **attrs: The attributes to set.
    """
    cdef attr_id_t attr
    attrs = intify_attrs(attrs)
    for attr, value in attrs.items():
        if attr == PROB:
            self.c.prob = value
            continue
        if attr == CLUSTER:
            self.c.cluster = int(value)
            continue
        # Non-integer values go through the string store first; what the
        # store returns is what ends up in the struct.
        if not isinstance(value, (int, long)):
            value = self.vocab.strings.add(value)
        Lexeme.set_struct_attr(self.c, attr, value)
|
||||
|
||||
def set_flag(self, attr_id_t flag_id, bint value):
|
||||
"""Change the value of a boolean flag.
|
||||
|
||||
|
|
|
@ -209,7 +209,7 @@ def test_doc_api_right_edge(en_tokenizer):
|
|||
def test_doc_api_has_vector():
|
||||
vocab = Vocab()
|
||||
vocab.clear_vectors(2)
|
||||
vocab.vectors.add('kitten', numpy.asarray([0., 2.], dtype='f'))
|
||||
vocab.vectors.add('kitten', vector=numpy.asarray([0., 2.], dtype='f'))
|
||||
doc = Doc(vocab, words=['kitten'])
|
||||
assert doc.has_vector
|
||||
|
||||
|
|
|
@ -73,8 +73,8 @@ def test_doc_token_api_is_properties(en_vocab):
|
|||
def test_doc_token_api_vectors():
|
||||
vocab = Vocab()
|
||||
vocab.clear_vectors(2)
|
||||
vocab.vectors.add('apples', numpy.asarray([0., 2.], dtype='f'))
|
||||
vocab.vectors.add('oranges', numpy.asarray([0., 1.], dtype='f'))
|
||||
vocab.vectors.add('apples', vector=numpy.asarray([0., 2.], dtype='f'))
|
||||
vocab.vectors.add('oranges', vector=numpy.asarray([0., 1.], dtype='f'))
|
||||
doc = Doc(vocab, words=['apples', 'oranges', 'oov'])
|
||||
assert doc.has_vector
|
||||
|
||||
|
|
|
@ -21,8 +21,10 @@ cdef class Vectors:
|
|||
Vectors data is kept in the vectors.data attribute, which should be an
|
||||
instance of numpy.ndarray (for CPU vectors) or cupy.ndarray
|
||||
(for GPU vectors). `vectors.key2row` is a dictionary mapping word hashes to
|
||||
rows in the vectors.data table. The array `vectors.keys` keeps the keys in
|
||||
order, such that `keys[vectors.key2row[key]] == key`.
|
||||
rows in the vectors.data table.
|
||||
|
||||
Multiple keys can be mapped to the same vector, so len(keys) may be greater
|
||||
(but not smaller) than data.shape[0].
|
||||
"""
|
||||
cdef public object data
|
||||
cdef readonly StringStore strings
|
||||
|
@ -101,7 +103,7 @@ cdef class Vectors:
|
|||
|
||||
RETURNS (int): The number of vectors in the data.
|
||||
"""
|
||||
return self.i
|
||||
return self._i_vec
|
||||
|
||||
def __contains__(self, key):
|
||||
"""Check whether a key has a vector entry in the table.
|
||||
|
@ -113,11 +115,13 @@ cdef class Vectors:
|
|||
key = self.strings[key]
|
||||
return key in self.key2row
|
||||
|
||||
def add(self, key, vector=None):
|
||||
"""Add a key to the table, optionally setting a vector value as well.
|
||||
def add(self, key, *, vector=None, row=None):
|
||||
"""Add a key to the table. Keys can be mapped to an existing vector
|
||||
by setting `row`, or a new vector can be added.
|
||||
|
||||
key (unicode / int): The key to add.
|
||||
vector (numpy.ndarray): An optional vector to add.
|
||||
vector (numpy.ndarray / None): A vector to add for the key.
|
||||
row (int / None): The row-number of a vector to map the key to.
|
||||
"""
|
||||
if isinstance(key, basestring_):
|
||||
key = self.strings.add(key)
|
||||
|
@ -131,8 +135,8 @@ cdef class Vectors:
|
|||
|
||||
self.key2row[key] = row
|
||||
if vector is not None:
|
||||
self.data[i] = vector
|
||||
return i
|
||||
self.data[row] = vector
|
||||
return row
|
||||
|
||||
def items(self):
|
||||
"""Iterate over `(string key, vector)` pairs, in order.
|
||||
|
|
|
@ -32,6 +32,7 @@ cdef class Vocab:
|
|||
cdef readonly int length
|
||||
cdef public object data_dir
|
||||
cdef public object lex_attr_getters
|
||||
cdef public object cfg
|
||||
|
||||
cdef const LexemeC* get(self, Pool mem, unicode string) except NULL
|
||||
cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL
|
||||
|
|
|
@ -5,6 +5,7 @@ import numpy
|
|||
import dill
|
||||
|
||||
from collections import OrderedDict
|
||||
from thinc.neural.util import get_array_module
|
||||
from .lexeme cimport EMPTY_LEXEME
|
||||
from .lexeme cimport Lexeme
|
||||
from .strings cimport hash_string
|
||||
|
@ -27,7 +28,7 @@ cdef class Vocab:
|
|||
C-data that is shared between `Doc` objects.
|
||||
"""
|
||||
def __init__(self, lex_attr_getters=None, tag_map=None, lemmatizer=None,
|
||||
strings=tuple(), **deprecated_kwargs):
|
||||
strings=tuple(), oov_prob=-20., **deprecated_kwargs):
|
||||
"""Create the vocabulary.
|
||||
|
||||
lex_attr_getters (dict): A dictionary mapping attribute IDs to
|
||||
|
@ -43,6 +44,7 @@ cdef class Vocab:
|
|||
tag_map = tag_map if tag_map is not None else {}
|
||||
if lemmatizer in (None, True, False):
|
||||
lemmatizer = Lemmatizer({}, {}, {})
|
||||
self.cfg = {'oov_prob': oov_prob}
|
||||
self.mem = Pool()
|
||||
self._by_hash = PreshMap()
|
||||
self._by_orth = PreshMap()
|
||||
|
@ -239,7 +241,7 @@ cdef class Vocab:
|
|||
def vectors_length(self):
|
||||
return self.vectors.data.shape[1]
|
||||
|
||||
def clear_vectors(self, new_dim=None):
|
||||
def clear_vectors(self, width=None):
|
||||
"""Drop the current vector table. Because all vectors must be the same
|
||||
width, you have to call this to change the size of the vectors.
|
||||
"""
|
||||
|
@ -283,16 +285,14 @@ cdef class Vocab:
|
|||
keep = xp.ascontiguousarray(keep.T)
|
||||
neighbours = xp.zeros((toss.shape[0],), dtype='i')
|
||||
scores = xp.zeros((toss.shape[0],), dtype='f')
|
||||
for i in range(0, toss.shape[0]//2, batch_size):
|
||||
for i in range(0, toss.shape[0], batch_size):
|
||||
batch = toss[i : i+batch_size]
|
||||
batch /= xp.linalg.norm(batch, axis=1, keepdims=True)+1e-8
|
||||
sims = xp.dot(batch, keep)
|
||||
matches = sims.argmax(axis=1)
|
||||
neighbours[i:i+batch_size] = matches
|
||||
scores[i:i+batch_size] = sims.max(axis=1)
|
||||
i2k = {i: key for key, i in self.vectors.key2row.items()}
|
||||
remap = {}
|
||||
for lex in list(self):
|
||||
for lex in self:
|
||||
# If we're losing the vector for this word, map it to the nearest
|
||||
# vector we're keeping.
|
||||
if lex.rank >= nr_row:
|
||||
|
|
|
@ -41,9 +41,6 @@
|
|||
- var comps = path.split('#');
|
||||
- return "top-level#" + comps[0] + '.' + comps[1];
|
||||
- }
|
||||
- else if (path.startsWith('cli#')) {
|
||||
- return "top-level#" + path.split('#')[1];
|
||||
- }
|
||||
- return path;
|
||||
- }
|
||||
|
||||
|
|
|
@ -1,244 +0,0 @@
|
|||
//- 💫 MIXINS > BASE
|
||||
|
||||
//- Section
|
||||
id - [string] anchor assigned to section (used for breadcrumb navigation)
|
||||
|
||||
mixin section(id)
|
||||
section.o-section(id="section-" + id data-section=id)
|
||||
block
|
||||
|
||||
|
||||
//- Aside wrapper
|
||||
label - [string] aside label
|
||||
|
||||
mixin aside-wrapper(label)
|
||||
aside.c-aside
|
||||
.c-aside__content(role="complementary")&attributes(attributes)
|
||||
if label
|
||||
h4.u-text-label.u-text-label--dark=label
|
||||
|
||||
block
|
||||
|
||||
|
||||
//- SVG from map (uses embedded SVG sprite)
|
||||
name - [string] SVG symbol id
|
||||
width - [integer] width in px
|
||||
height - [integer] height in px (default: same as width)
|
||||
|
||||
mixin svg(name, width, height)
|
||||
svg(aria-hidden="true" viewBox="0 0 #{width} #{height || width}" width=width height=(height || width))&attributes(attributes)
|
||||
use(xlink:href="#svg_#{name}")
|
||||
|
||||
|
||||
//- Icon
|
||||
name - [string] icon name (will be used as symbol id: #svg_{name})
|
||||
width - [integer] icon width (default: 20)
|
||||
height - [integer] icon height (defaults to width)
|
||||
|
||||
mixin icon(name, width, height)
|
||||
- var width = width || 20
|
||||
- var height = height || width
|
||||
+svg(name, width, height).o-icon(style="min-width: #{width}px")&attributes(attributes)
|
||||
|
||||
|
||||
//- Pro/Con/Neutral icon
|
||||
icon - [string] "yes", "no" or "neutral" (any other value is coloured like "neutral")
|
||||
size - [integer] icon size (optional)
|
||||
|
||||
mixin procon(icon, label, show_label, size)
|
||||
- var colors = { yes: "green", no: "red", neutral: "subtle" }
|
||||
span.u-nowrap
|
||||
+icon(icon, size || 20)(class="u-color-#{colors[icon] || 'subtle'}").o-icon--inline&attributes(attributes)
|
||||
span.u-text-small(class=show_label ? null : "u-hidden")=(label || icon)
|
||||
|
||||
//- Headlines Helper Mixin
|
||||
level - [integer] 1, 2, 3, 4, or 5
|
||||
|
||||
mixin headline(level)
|
||||
if level == 1
|
||||
h1.u-heading-1&attributes(attributes)
|
||||
block
|
||||
|
||||
else if level == 2
|
||||
h2.u-heading-2&attributes(attributes)
|
||||
block
|
||||
|
||||
else if level == 3
|
||||
h3.u-heading-3&attributes(attributes)
|
||||
block
|
||||
|
||||
else if level == 4
|
||||
h4.u-heading-4&attributes(attributes)
|
||||
block
|
||||
|
||||
else if level == 5
|
||||
h5.u-heading-5&attributes(attributes)
|
||||
block
|
||||
|
||||
|
||||
//- Permalink rendering
|
||||
id - [string] permalink ID used for link anchor
|
||||
|
||||
mixin permalink(id)
|
||||
if id
|
||||
a.u-permalink(href="##{id}")
|
||||
block
|
||||
|
||||
else
|
||||
block
|
||||
|
||||
|
||||
//- Quickstart widget
|
||||
quickstart.js with manual markup, inspired by PyTorch's "Getting started"
|
||||
groups - [object] option groups, uses global variable QUICKSTART
|
||||
headline - [string] optional text to be rendered as widget headline
|
||||
|
||||
mixin quickstart(groups, headline, description, hide_results)
|
||||
.c-quickstart.o-block-small#qs
|
||||
.c-quickstart__content
|
||||
if headline
|
||||
+h(2)=headline
|
||||
if description
|
||||
p=description
|
||||
for group in groups
|
||||
.c-quickstart__group.u-text-small(data-qs-group=group.id)
|
||||
if group.title
|
||||
.c-quickstart__legend=group.title
|
||||
if group.help
|
||||
| #[+help(group.help)]
|
||||
.c-quickstart__fields
|
||||
for option in group.options
|
||||
input.c-quickstart__input(class="c-quickstart__input--" + (group.input_style ? group.input_style : group.multiple ? "check" : "radio") type=group.multiple ? "checkbox" : "radio" name=group.id id="qs-#{option.id}" value=option.id checked=option.checked)
|
||||
label.c-quickstart__label.u-text-tiny(for="qs-#{option.id}")!=option.title
|
||||
if option.meta
|
||||
| #[span.c-quickstart__label__meta (#{option.meta})]
|
||||
if option.help
|
||||
| #[+help(option.help)]
|
||||
|
||||
if hide_results
|
||||
block
|
||||
else
|
||||
pre.c-code-block
|
||||
code.c-code-block__content.c-quickstart__code(data-qs-results="")
|
||||
block
|
||||
|
||||
|
||||
//- Quickstart code item
|
||||
data - [object] Rendering conditions (keyed by option group ID, value: option)
|
||||
style - [string] modifier ID for line style
|
||||
|
||||
mixin qs(data, style)
    //- Declare with `var` so the attribute map is local to this mixin call
        rather than an undeclared variable shared by the whole template.
    - var args = {}
    //- Each entry of `data` becomes a data-qs-{key} attribute on the line.
    for value, setting in data
        - args['data-qs-' + setting] = value
    span.c-quickstart__line(class="c-quickstart__line--#{style || 'bash'}")&attributes(args)
        block
|
||||
|
||||
|
||||
//- Terminal-style code window
|
||||
label - [string] title displayed in top bar of terminal window
|
||||
|
||||
mixin terminal(label)
|
||||
.x-terminal
|
||||
.x-terminal__icons: span
|
||||
.u-padding-small.u-text-label.u-text-center=label
|
||||
|
||||
+code.x-terminal__code
|
||||
block
|
||||
|
||||
//- Chart.js
|
||||
id - [string] chart ID, will be assigned as #chart_{id}
|
||||
|
||||
mixin chart(id, height)
|
||||
figure.o-block&attributes(attributes)
|
||||
canvas(id="chart_#{id}" width="800" height=(height || "400") style="max-width: 100%")
|
||||
|
||||
|
||||
//- Gitter chat button and widget
|
||||
button - [string] text shown on button
|
||||
label - [string] title of chat window (default: same as button)
|
||||
|
||||
mixin gitter(button, label)
|
||||
aside.js-gitter.c-chat.is-collapsed(data-title=(label || button))
|
||||
|
||||
button.js-gitter-button.c-chat__button.u-text-tag
|
||||
+icon("chat", 16).o-icon--inline
|
||||
!=button
|
||||
|
||||
|
||||
//- Badge
|
||||
image - [string] path to badge image
|
||||
url - [string] badge link
|
||||
|
||||
mixin badge(image, url)
|
||||
+a(url).u-padding-small.u-hide-link&attributes(attributes)
|
||||
img.o-badge(src=image alt=url height="20")
|
||||
|
||||
|
||||
//- spaCy logo
|
||||
|
||||
mixin logo()
|
||||
+svg("spacy", 675, 215).o-logo&attributes(attributes)
|
||||
|
||||
|
||||
//- Landing
|
||||
|
||||
mixin landing-header()
|
||||
header.c-landing
|
||||
.c-landing__wrapper
|
||||
.c-landing__content
|
||||
block
|
||||
|
||||
mixin landing-banner(headline, label)
|
||||
.c-landing__banner.u-padding.o-block.u-color-light
|
||||
+grid.c-landing__banner__content.o-no-block
|
||||
+grid-col("third")
|
||||
h3.u-heading.u-heading-1
|
||||
if label
|
||||
div
|
||||
span.u-text-label.u-text-label--light=label
|
||||
!=headline
|
||||
|
||||
+grid-col("two-thirds").c-landing__banner__text
|
||||
block
|
||||
|
||||
|
||||
mixin landing-logos(title, logos)
|
||||
.o-content.u-text-center&attributes(attributes)
|
||||
h3.u-heading.u-text-label.u-color-dark=title
|
||||
|
||||
each row, i in logos
|
||||
- var is_last = i == logos.length - 1
|
||||
+grid("center").o-inline-list.o-no-block(class=is_last ? "o-no-block" : null)
|
||||
each details, name in row
|
||||
+a(details[0]).u-padding-medium
|
||||
+icon(name, details[1], details[2])
|
||||
|
||||
if is_last
|
||||
block
|
||||
|
||||
|
||||
//- Under construction (temporary)
|
||||
Marks sections that still need to be completed for the v2.0 release.
|
||||
|
||||
mixin under-construction()
|
||||
+infobox("Under construction", "🚧")
|
||||
| This section is still being written and will be updated for the v2.0
|
||||
| release. Is there anything that you think should definitely be mentioned or
|
||||
| explained here? Any examples you'd like to see? #[strong Let us know]
|
||||
| on the #[+a(gh("spacy") + "/issues/1105") v2.0 alpha thread] on GitHub!
|
||||
|
||||
|
||||
//- Alpha infobox (temporary)
|
||||
Added in the templates to notify user that they're visiting the alpha site.
|
||||
|
||||
mixin alpha-info()
|
||||
+infobox("You are viewing the spaCy v2.0.0 alpha docs", "⚠️")
|
||||
strong This page is part of the alpha documentation for spaCy v2.0.
|
||||
| It does not reflect the state of the latest stable release.
|
||||
| Because v2.0 is still under development, the implementation
|
||||
| may differ from the intended state described here. See the
|
||||
| #[+a(gh("spaCy") + "/releases/tag/v2.0.0-alpha") release notes]
|
||||
| for details on how to install and test the new version. To
|
||||
| read the official docs for spaCy v1.x,
|
||||
| #[+a("https://spacy.io/docs") go here].
|
|
@ -1,7 +1,39 @@
|
|||
//- 💫 INCLUDES > MIXINS
|
||||
|
||||
include _functions
|
||||
include _mixins-base
|
||||
|
||||
|
||||
//- Section
|
||||
id - [string] anchor assigned to section (used for breadcrumb navigation)
|
||||
|
||||
mixin section(id)
|
||||
section.o-section(id="section-" + id data-section=id)
|
||||
block
|
||||
|
||||
|
||||
//- Headlines Helper Mixin
|
||||
level - [integer] 1, 2, 3, 4, or 5
|
||||
|
||||
mixin headline(level)
|
||||
if level == 1
|
||||
h1.u-heading-1&attributes(attributes)
|
||||
block
|
||||
|
||||
else if level == 2
|
||||
h2.u-heading-2&attributes(attributes)
|
||||
block
|
||||
|
||||
else if level == 3
|
||||
h3.u-heading-3&attributes(attributes)
|
||||
block
|
||||
|
||||
else if level == 4
|
||||
h4.u-heading-4&attributes(attributes)
|
||||
block
|
||||
|
||||
else if level == 5
|
||||
h5.u-heading-5&attributes(attributes)
|
||||
block
|
||||
|
||||
|
||||
//- Headlines
|
||||
|
@ -18,6 +50,18 @@ mixin h(level, id, source)
|
|||
span Source #[+icon("code", 14).o-icon--inline]
|
||||
|
||||
|
||||
//- Permalink rendering
|
||||
id - [string] permalink ID used for link anchor
|
||||
|
||||
mixin permalink(id)
|
||||
if id
|
||||
a.u-permalink(href="##{id}")
|
||||
block
|
||||
|
||||
else
|
||||
block
|
||||
|
||||
|
||||
//- External links
|
||||
url - [string] link href
|
||||
trusted - [boolean] if not set / false, rel="noopener nofollow" is added
|
||||
|
@ -63,6 +107,18 @@ mixin help(tooltip, icon_size)
|
|||
+icon("help_o", icon_size || 16).o-icon--inline
|
||||
|
||||
|
||||
//- Aside wrapper
|
||||
label - [string] aside label
|
||||
|
||||
mixin aside-wrapper(label)
|
||||
aside.c-aside
|
||||
.c-aside__content(role="complementary")&attributes(attributes)
|
||||
if label
|
||||
h4.u-text-label.u-text-label--dark=label
|
||||
|
||||
block
|
||||
|
||||
|
||||
//- Aside for text
|
||||
label - [string] aside title (optional)
|
||||
|
||||
|
@ -112,6 +168,37 @@ mixin infobox-logos(...logos)
|
|||
| #[+icon(logo[0], logo[1], logo[2]).u-color-dark]
|
||||
|
||||
|
||||
//- SVG from map (uses embedded SVG sprite)
|
||||
name - [string] SVG symbol id
|
||||
width - [integer] width in px
|
||||
height - [integer] height in px (default: same as width)
|
||||
|
||||
mixin svg(name, width, height)
|
||||
svg(aria-hidden="true" viewBox="0 0 #{width} #{height || width}" width=width height=(height || width))&attributes(attributes)
|
||||
use(xlink:href="#svg_#{name}")
|
||||
|
||||
|
||||
//- Icon
|
||||
name - [string] icon name (will be used as symbol id: #svg_{name})
|
||||
width - [integer] icon width (default: 20)
|
||||
height - [integer] icon height (defaults to width)
|
||||
|
||||
mixin icon(name, width, height)
|
||||
- var width = width || 20
|
||||
- var height = height || width
|
||||
+svg(name, width, height).o-icon(style="min-width: #{width}px")&attributes(attributes)
|
||||
|
||||
|
||||
//- Pro/Con/Neutral icon
|
||||
icon - [string] "pro", "con" or "neutral" (default: "neutral")
|
||||
size - [integer] icon size (optional)
|
||||
|
||||
mixin procon(icon, label, show_label, size)
|
||||
- var colors = { yes: "green", no: "red", neutral: "subtle" }
|
||||
span.u-nowrap
|
||||
+icon(icon, size || 20)(class="u-color-#{colors[icon] || 'subtle'}").o-icon--inline&attributes(attributes)
|
||||
span.u-text-small(class=show_label ? null : "u-hidden")=(label || icon)
|
||||
|
||||
|
||||
//- Link button
|
||||
url - [string] link href
|
||||
|
@ -238,6 +325,14 @@ mixin graphic(original)
|
|||
+button(original, false, "secondary", "small") View large graphic
|
||||
|
||||
|
||||
//- Chart.js
|
||||
id - [string] chart ID, will be assigned as #chart_{id}
|
||||
|
||||
mixin chart(id, height)
|
||||
figure.o-block&attributes(attributes)
|
||||
canvas(id="chart_#{id}" width="800" height=(height || "400") style="max-width: 100%")
|
||||
|
||||
|
||||
//- Labels
|
||||
|
||||
mixin label()
|
||||
|
@ -353,8 +448,8 @@ mixin grid(...style)
|
|||
width - [string] "quarter", "third", "half", "two-thirds", "three-quarters"
|
||||
see $grid in assets/css/_variables.sass
|
||||
|
||||
mixin grid-col(width)
|
||||
.o-grid__col(class="o-grid__col--#{width}")&attributes(attributes)
|
||||
mixin grid-col(...style)
|
||||
.o-grid__col(class=prefixArgs(style, "o-grid__col"))&attributes(attributes)
|
||||
block
|
||||
|
||||
|
||||
|
@ -445,3 +540,137 @@ mixin annotation-row(annots, style)
|
|||
else
|
||||
+cell=cell
|
||||
block
|
||||
|
||||
|
||||
//- spaCy logo
|
||||
|
||||
mixin logo()
|
||||
+svg("spacy", 675, 215).o-logo&attributes(attributes)
|
||||
|
||||
|
||||
//- Gitter chat button and widget
|
||||
button - [string] text shown on button
|
||||
label - [string] title of chat window (default: same as button)
|
||||
|
||||
mixin gitter(button, label)
|
||||
aside.js-gitter.c-chat.is-collapsed(data-title=(label || button))
|
||||
|
||||
button.js-gitter-button.c-chat__button.u-text-tag
|
||||
+icon("chat", 16).o-icon--inline
|
||||
!=button
|
||||
|
||||
|
||||
//- Badge
|
||||
image - [string] path to badge image
|
||||
url - [string] badge link
|
||||
|
||||
mixin badge(image, url)
|
||||
+a(url).u-padding-small.u-hide-link&attributes(attributes)
|
||||
img.o-badge(src=image alt=url height="20")
|
||||
|
||||
|
||||
//- Quickstart widget
|
||||
quickstart.js with manual markup, inspired by PyTorch's "Getting started"
|
||||
groups - [object] option groups, uses global variable QUICKSTART
|
||||
headline - [string] optional text to be rendered as widget headline
|
||||
|
||||
mixin quickstart(groups, headline, description, hide_results)
|
||||
.c-quickstart.o-block-small#qs
|
||||
.c-quickstart__content
|
||||
if headline
|
||||
+h(2)=headline
|
||||
if description
|
||||
p=description
|
||||
for group in groups
|
||||
.c-quickstart__group.u-text-small(data-qs-group=group.id)
|
||||
if group.title
|
||||
.c-quickstart__legend=group.title
|
||||
if group.help
|
||||
| #[+help(group.help)]
|
||||
.c-quickstart__fields
|
||||
for option in group.options
|
||||
input.c-quickstart__input(class="c-quickstart__input--" + (group.input_style ? group.input_style : group.multiple ? "check" : "radio") type=group.multiple ? "checkbox" : "radio" name=group.id id="qs-#{option.id}" value=option.id checked=option.checked)
|
||||
label.c-quickstart__label.u-text-tiny(for="qs-#{option.id}")!=option.title
|
||||
if option.meta
|
||||
| #[span.c-quickstart__label__meta (#{option.meta})]
|
||||
if option.help
|
||||
| #[+help(option.help)]
|
||||
|
||||
if hide_results
|
||||
block
|
||||
else
|
||||
pre.c-code-block
|
||||
code.c-code-block__content.c-quickstart__code(data-qs-results="")
|
||||
block
|
||||
|
||||
|
||||
//- Quickstart code item
|
||||
data - [object] Rendering conditions (keyed by option group ID, value: option)
|
||||
style - [string] modifier ID for line style
|
||||
|
||||
mixin qs(data, style)
|
||||
- args = {}
|
||||
for value, setting in data
|
||||
- args['data-qs-' + setting] = value
|
||||
span.c-quickstart__line(class="c-quickstart__line--#{style || 'bash'}")&attributes(args)
|
||||
block
|
||||
|
||||
|
||||
//- Terminal-style code window
|
||||
label - [string] title displayed in top bar of terminal window
|
||||
|
||||
mixin terminal(label)
|
||||
.x-terminal
|
||||
.x-terminal__icons: span
|
||||
.u-padding-small.u-text-label.u-text-center=label
|
||||
|
||||
+code.x-terminal__code
|
||||
block
|
||||
|
||||
|
||||
//- Landing
|
||||
|
||||
mixin landing-header()
|
||||
header.c-landing
|
||||
.c-landing__wrapper
|
||||
.c-landing__content
|
||||
block
|
||||
|
||||
mixin landing-banner(headline, label)
|
||||
.c-landing__banner.u-padding.o-block.u-color-light
|
||||
+grid.c-landing__banner__content.o-no-block
|
||||
+grid-col("third")
|
||||
h3.u-heading.u-heading-1
|
||||
if label
|
||||
div
|
||||
span.u-text-label.u-text-label--light=label
|
||||
!=headline
|
||||
|
||||
+grid-col("two-thirds").c-landing__banner__text
|
||||
block
|
||||
|
||||
|
||||
mixin landing-logos(title, logos)
|
||||
.o-content.u-text-center&attributes(attributes)
|
||||
h3.u-heading.u-text-label.u-color-dark=title
|
||||
|
||||
each row, i in logos
|
||||
- var is_last = i == logos.length - 1
|
||||
+grid("center").o-inline-list.o-no-block(class=is_last ? "o-no-block" : null)
|
||||
each details, name in row
|
||||
+a(details[0]).u-padding-medium
|
||||
+icon(name, details[1], details[2])
|
||||
|
||||
if is_last
|
||||
block
|
||||
|
||||
|
||||
//- Under construction (temporary)
|
||||
Marks sections that still need to be completed for the v2.0 release.
|
||||
|
||||
mixin under-construction()
|
||||
+infobox("Under construction", "🚧")
|
||||
| This section is still being written and will be updated for the v2.0
|
||||
| release. Is there anything that you think should definitely be mentioned or
|
||||
| explained here? Any examples you'd like to see? #[strong Let us know]
|
||||
| on the #[+a(gh("spacy") + "/issues/1105") v2.0 alpha thread] on GitHub!
|
||||
|
|
|
@ -25,9 +25,6 @@ main.o-main.o-main--sidebar.o-main--aside
|
|||
+button(gh("spacy", source), false, "secondary", "small").u-nowrap
|
||||
| Source #[+icon("code", 14)]
|
||||
|
||||
//-if ALPHA
|
||||
//- +alpha-info
|
||||
|
||||
if IS_MODELS
|
||||
include _page_models
|
||||
else
|
||||
|
|
|
@ -62,6 +62,9 @@ svg(style="position: absolute; visibility: hidden; width: 0; height: 0;" width="
|
|||
symbol#svg_explosion(viewBox="0 0 500 500")
|
||||
path(fill="currentColor" d="M111.7 74.9L91.2 93.1l9.1 10.2 17.8-15.8 7.4 8.4-17.8 15.8 10.1 11.4 20.6-18.2 7.7 8.7-30.4 26.9-41.9-47.3 30.3-26.9 7.6 8.6zM190.8 59.6L219 84.3l-14.4 4.5-20.4-18.2-6.4 26.6-14.4 4.5 8.9-36.4-26.9-24.1 14.3-4.5L179 54.2l5.7-25.2 14.3-4.5-8.2 35.1zM250.1 21.2l27.1 3.4c6.1.8 10.8 3.1 14 7.2 3.2 4.1 4.5 9.2 3.7 15.5-.8 6.3-3.2 11-7.4 14.1-4.1 3.1-9.2 4.3-15.3 3.5L258 63.2l-2.8 22.3-13-1.6 7.9-62.7zm11.5 13l-2.2 17.5 12.6 1.6c5.1.6 9.1-2 9.8-7.6.7-5.6-2.5-9.2-7.6-9.9l-12.6-1.6zM329.1 95.4l23.8 13.8-5.8 10L312 98.8l31.8-54.6 11.3 6.6-26 44.6zM440.5 145c-1.3 8.4-5.9 15.4-13.9 21.1s-16.2 7.7-24.6 6.1c-8.4-1.6-15.3-6.3-20.8-14.1-5.5-7.9-7.6-16-6.4-24.4 1.3-8.5 6-15.5 14-21.1 8-5.6 16.2-7.7 24.5-6 8.4 1.6 15.4 6.3 20.9 14.2 5.5 7.6 7.6 15.7 6.3 24.2zM412 119c-5.1-.8-10.3.6-15.6 4.4-5.2 3.7-8.4 8.1-9.4 13.2-1 5.2.2 10.1 3.5 14.8 3.4 4.8 7.5 7.5 12.7 8.2 5.2.8 10.4-.7 15.6-4.4 5.3-3.7 8.4-8.1 9.4-13.2 1.1-5.1-.1-9.9-3.4-14.7-3.4-4.8-7.6-7.6-12.8-8.3zM471.5 237.9c-2.8 4.8-7.1 7.6-13 8.7l-2.6-13.1c5.3-.9 8.1-5 7.2-11-.9-5.8-4.3-8.8-8.9-8.2-2.3.3-3.7 1.4-4.5 3.3-.7 1.9-1.4 5.2-1.7 10.1-.8 7.5-2.2 13.1-4.3 16.9-2.1 3.9-5.7 6.2-10.9 7-6.3.9-11.3-.5-15.2-4.4-3.9-3.8-6.3-9-7.3-15.7-1.1-7.4-.2-13.7 2.6-18.8 2.8-5.1 7.4-8.2 13.7-9.2l2.6 13c-5.6 1.1-8.7 6.6-7.7 13.4 1 6.6 3.9 9.5 8.6 8.8 4.4-.7 5.7-4.5 6.7-14.1.3-3.5.7-6.2 1.1-8.4.4-2.2 1.2-4.4 2.2-6.8 2.1-4.7 6-7.2 11.8-8.1 5.4-.8 10.3.4 14.5 3.7 4.2 3.3 6.9 8.5 8 15.6.9 6.9-.1 12.6-2.9 17.3zM408.6 293.5l2.4-12.9 62 11.7-2.4 12.9-62-11.7zM419.6 396.9c-8.3 2-16.5.3-24.8-5-8.2-5.3-13.2-12.1-14.9-20.5-1.6-8.4.1-16.6 5.3-24.6 5.2-8.1 11.9-13.1 20.2-15.1 8.4-1.9 16.6-.3 24.9 5 8.2 5.3 13.2 12.1 14.8 20.5 1.7 8.4 0 16.6-5.2 24.7-5.2 8-12 13-20.3 15zm13.4-36.3c-1.2-5.1-4.5-9.3-9.9-12.8s-10.6-4.7-15.8-3.7-9.3 4-12.4 8.9-4.1 9.8-2.8 14.8c1.2 5.1 4.5 9.3 9.9 12.8 5.5 3.5 10.7 4.8 15.8 3.7 5.1-.9 9.2-3.8 12.3-8.7s4.1-9.9 2.9-15zM303.6 416.5l9.6-5.4 43.3 20.4-19.2-34 11.4-6.4 31 55-9.6 5.4-43.4-20.5 19.2 34.1-11.3 
6.4-31-55zM238.2 468.8c-49 0-96.9-17.4-134.8-49-38.3-32-64-76.7-72.5-125.9-2-11.9-3.1-24-3.1-35.9 0-36.5 9.6-72.6 27.9-104.4 2.1-3.6 6.7-4.9 10.3-2.8 3.6 2.1 4.9 6.7 2.8 10.3-16.9 29.5-25.9 63.1-25.9 96.9 0 11.1 1 22.3 2.9 33.4 7.9 45.7 31.8 87.2 67.3 116.9 35.2 29.3 79.6 45.5 125.1 45.5 11.1 0 22.3-1 33.4-2.9 4.1-.7 8 2 8.7 6.1.7 4.1-2 8-6.1 8.7-11.9 2-24 3.1-36 3.1z")
|
||||
|
||||
symbol#svg_prodigy(viewBox="0 0 538.5 157.6")
|
||||
path(fill="currentColor" d="M70.6 48.6c7 7.3 10.5 17.1 10.5 29.2S77.7 99.7 70.6 107c-6.9 7.3-15.9 11.1-27 11.1-9.4 0-16.8-2.7-21.7-8.2v44.8H0V39h20.7v8.1c4.8-6.4 12.4-9.6 22.9-9.6 11.1 0 20.1 3.7 27 11.1zM21.9 76v3.6c0 12.1 7.3 19.8 18.3 19.8 11.2 0 18.7-7.9 18.7-21.6s-7.5-21.6-18.7-21.6c-11 0-18.3 7.7-18.3 19.8zM133.8 59.4c-12.6 0-20.5 7-20.5 17.8v39.3h-22V39h21.1v8.8c4-6.4 11.2-9.6 21.3-9.6v21.2zM209.5 107.1c-7.6 7.3-17.5 11.1-29.5 11.1s-21.9-3.8-29.7-11.1c-7.6-7.5-11.5-17.2-11.5-29.2 0-12.1 3.9-21.9 11.5-29.2 7.8-7.3 17.7-11.1 29.7-11.1s21.9 3.8 29.5 11.1c7.8 7.3 11.7 17.1 11.7 29.2 0 11.9-3.9 21.7-11.7 29.2zM180 56.2c-5.7 0-10.3 1.9-13.8 5.8-3.5 3.8-5.2 9-5.2 15.7 0 6.7 1.8 12 5.2 15.7 3.4 3.8 8.1 5.7 13.8 5.7s10.3-1.9 13.8-5.7 5.2-9 5.2-15.7c0-6.8-1.8-12-5.2-15.7-3.5-3.8-8.1-5.8-13.8-5.8zM313 116.5h-20.5v-7.9c-4.4 5.5-12.7 9.6-23.1 9.6-10.9 0-19.9-3.8-27-11.1C235.5 99.7 232 90 232 77.8s3.5-21.9 10.3-29.2c7-7.3 16-11.1 27-11.1 9.7 0 17.1 2.7 21.9 8.2V0H313v116.5zm-58.8-38.7c0 13.6 7.5 21.4 18.7 21.4 10.9 0 18.3-7.3 18.3-19.8V76c0-12.2-7.3-19.8-18.3-19.8-11.2 0-18.7 8-18.7 21.6zM354.1 13.6c0 3.6-1.3 6.8-3.9 9.3-5 4.9-13.6 4.9-18.6 0-8.4-7.5-1.6-23.1 9.3-22.5 7.4 0 13.2 5.9 13.2 13.2zm-2.2 102.9H330V39h21.9v77.5zM425.1 47.1V39h20.5v80.4c0 11.2-3.6 20.1-10.6 26.8-7 6.7-16.6 10-28.5 10-23.4 0-36.9-11.4-39.9-29.8l21.7-.8c1 7.6 7.6 12 17.4 12 11.2 0 18.1-5.8 18.1-16.6v-11.1c-5.1 5.5-12.5 8.2-21.9 8.2-10.9 0-19.9-3.8-27-11.1-6.9-7.3-10.3-17.1-10.3-29.2s3.5-21.9 10.3-29.2c7-7.3 16-11.1 27-11.1 10.7 0 18.4 3.1 23.2 9.6zm-38.3 30.7c0 13.6 7.5 21.6 18.7 21.6 11 0 18.3-7.6 18.3-19.8V76c0-12.2-7.3-19.8-18.3-19.8-11.2 0-18.7 8-18.7 21.6zM488.8 154.8H465l19.8-45.1L454.5 39h24.1l17.8 46.2L514.2 39h24.3l-49.7 115.8z")
|
||||
|
||||
|
||||
//- Machine learning & NLP libraries
|
||||
|
||||
|
|
|
@ -1,6 +1,11 @@
|
|||
//- 💫 DOCS > API > ANNOTATION > NAMED ENTITIES
|
||||
|
||||
+table([ "Type", "Description" ])
|
||||
p
|
||||
| Models trained on the
|
||||
| #[+a("https://catalog.ldc.upenn.edu/ldc2013t19") OntoNotes 5] corpus
|
||||
| support the following entity types:
|
||||
|
||||
+table(["Type", "Description"])
|
||||
+row
|
||||
+cell #[code PERSON]
|
||||
+cell People, including fictional.
|
||||
|
@ -45,9 +50,6 @@
|
|||
+cell #[code LANGUAGE]
|
||||
+cell Any named language.
|
||||
|
||||
p The following values are also annotated in a style similar to names:
|
||||
|
||||
+table([ "Type", "Description" ])
|
||||
+row
|
||||
+cell #[code DATE]
|
||||
+cell Absolute or relative dates or periods.
|
||||
|
@ -75,3 +77,33 @@ p The following values are also annotated in a style similar to names:
|
|||
+row
|
||||
+cell #[code CARDINAL]
|
||||
+cell Numerals that do not fall under another type.
|
||||
|
||||
+h(4, "ner-wikipedia-scheme") Wikipedia scheme
|
||||
|
||||
p
|
||||
| Models trained on the Wikipedia corpus
|
||||
| (#[+a("http://www.sciencedirect.com/science/article/pii/S0004370212000276") Nothman et al., 2013])
|
||||
| use a less fine-grained NER annotation scheme and recognise the
|
||||
| following entities:
|
||||
|
||||
+table(["Type", "Description"])
|
||||
+row
|
||||
+cell #[code PER]
|
||||
+cell Named person or family.
|
||||
|
||||
+row
|
||||
+cell #[code LOC]
|
||||
+cell
|
||||
| Name of politically or geographically defined location (cities,
|
||||
| provinces, countries, international regions, bodies of water,
|
||||
| mountains).
|
||||
|
||||
+row
|
||||
+cell #[code ORG]
|
||||
+cell Named corporate, governmental, or other organizational entity.
|
||||
|
||||
+row
|
||||
+cell #[code MISC]
|
||||
+cell
|
||||
| Miscellaneous entities, e.g. events, nationalities, products or
|
||||
| works of art.
|
||||
|
|
|
@ -1,5 +1,7 @@
|
|||
//- 💫 DOCS > API > ANNOTATION > TRAINING
|
||||
|
||||
+h(3, "json-input") JSON input format for training
|
||||
|
||||
p
|
||||
| spaCy takes training data in JSON format. The built-in
|
||||
| #[+api("cli#convert") #[code convert]] command helps you convert the
|
||||
|
@ -46,3 +48,57 @@ p
|
|||
| Treebank:
|
||||
|
||||
+github("spacy", "examples/training/training-data.json", false, false, "json")
|
||||
|
||||
+h(3, "vocab-jsonl") Lexical data for vocabulary
|
||||
+tag-new(2)
|
||||
|
||||
p
|
||||
| To populate a model's vocabulary, you can use the
|
||||
| #[+api("cli#vocab") #[code spacy vocab]] command and load in a
|
||||
| #[+a("https://jsonlines.readthedocs.io/en/latest/") newline-delimited JSON]
|
||||
| (JSONL) file containing one lexical entry per line. The first line
|
||||
| defines the language and vocabulary settings. All other lines are
|
||||
| expected to be JSON objects describing an individual lexeme. The lexical
|
||||
| attributes will then be set as attributes on spaCy's
|
||||
| #[+api("lexeme#attributes") #[code Lexeme]] object. The #[code vocab]
|
||||
| command outputs a ready-to-use spaCy model with a #[code Vocab]
|
||||
| containing the lexical data.
|
||||
|
||||
+code("First line").
|
||||
{"lang": "en", "settings": {"oov_prob": -20.502029418945312}}
|
||||
|
||||
+code("Entry structure").
|
||||
{
|
||||
"orth": string,
|
||||
"id": int,
|
||||
"lower": string,
|
||||
"norm": string,
|
||||
"shape": string,
|
||||
"prefix": string,
|
||||
"suffix": string,
|
||||
"length": int,
|
||||
"cluster": string,
|
||||
"prob": float,
|
||||
"is_alpha": bool,
|
||||
"is_ascii": bool,
|
||||
"is_digit": bool,
|
||||
"is_lower": bool,
|
||||
"is_punct": bool,
|
||||
"is_space": bool,
|
||||
"is_title": bool,
|
||||
"is_upper": bool,
|
||||
"like_url": bool,
|
||||
"like_num": bool,
|
||||
"like_email": bool,
|
||||
"is_stop": bool,
|
||||
"is_oov": bool,
|
||||
"is_quote": bool,
|
||||
"is_left_punct": bool,
|
||||
"is_right_punct": bool
|
||||
}
|
||||
|
||||
p
|
||||
| Here's an example of the 20 most frequent lexemes in the English
|
||||
| training data:
|
||||
|
||||
+github("spacy", "examples/training/vocab-data.jsonl", false, false, "json")
|
||||
|
|
|
@ -3,8 +3,10 @@
|
|||
"Overview": {
|
||||
"Architecture": "./",
|
||||
"Annotation Specs": "annotation",
|
||||
"Command Line": "cli",
|
||||
"Functions": "top-level"
|
||||
},
|
||||
|
||||
"Containers": {
|
||||
"Doc": "doc",
|
||||
"Token": "token",
|
||||
|
@ -45,14 +47,19 @@
|
|||
}
|
||||
},
|
||||
|
||||
"cli": {
|
||||
"title": "Command Line Interface",
|
||||
"teaser": "Download, train and package models, and debug spaCy.",
|
||||
"source": "spacy/cli"
|
||||
},
|
||||
|
||||
"top-level": {
|
||||
"title": "Top-level Functions",
|
||||
"menu": {
|
||||
"spacy": "spacy",
|
||||
"displacy": "displacy",
|
||||
"Utility Functions": "util",
|
||||
"Compatibility": "compat",
|
||||
"Command Line": "cli"
|
||||
"Compatibility": "compat"
|
||||
}
|
||||
},
|
||||
|
||||
|
@ -213,7 +220,7 @@
|
|||
"Lemmatization": "lemmatization",
|
||||
"Dependencies": "dependency-parsing",
|
||||
"Named Entities": "named-entities",
|
||||
"Training Data": "training"
|
||||
"Models & Training": "training"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -85,7 +85,9 @@ p
|
|||
+row
|
||||
+cell #[code name]
|
||||
+cell unicode
|
||||
+cell ISO code of the language class to load.
|
||||
+cell
|
||||
| #[+a("https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes") ISO code]
|
||||
| of the language class to load.
|
||||
|
||||
+row
|
||||
+cell #[code disable]
|
||||
|
|
|
@ -99,6 +99,6 @@ p This document describes the target annotations spaCy is trained to predict.
|
|||
include _annotation/_biluo
|
||||
|
||||
+section("training")
|
||||
+h(2, "json-input") JSON input format for training
|
||||
+h(2, "training") Models and training data
|
||||
|
||||
include _annotation/_training
|
||||
|
|
|
@ -1,4 +1,6 @@
|
|||
//- 💫 DOCS > API > TOP-LEVEL > COMMAND LINE INTERFACE
|
||||
//- 💫 DOCS > API > COMMAND LINE INTERFACE
|
||||
|
||||
include ../_includes/_mixins
|
||||
|
||||
p
|
||||
| As of v1.7.0, spaCy comes with new command line helpers to download and
|
||||
|
@ -34,6 +36,13 @@ p
|
|||
+cell flag
|
||||
+cell Show help message and available arguments.
|
||||
|
||||
+row("foot")
|
||||
+cell creates
|
||||
+cell directory, symlink
|
||||
+cell
|
||||
| The installed model package in your #[code site-packages]
|
||||
| directory and a shortcut link as a symlink in #[code spacy/data].
|
||||
|
||||
+aside("Downloading best practices")
|
||||
| The #[code download] command is mostly intended as a convenient,
|
||||
| interactive wrapper – it performs compatibility checks and prints
|
||||
|
@ -86,6 +95,13 @@ p
|
|||
+cell flag
|
||||
+cell Show help message and available arguments.
|
||||
|
||||
+row("foot")
|
||||
+cell creates
|
||||
+cell symlink
|
||||
+cell
|
||||
| A shortcut link of the given name as a symlink in
|
||||
| #[code spacy/data].
|
||||
|
||||
+h(3, "info") Info
|
||||
|
||||
p
|
||||
|
@ -113,6 +129,11 @@ p
|
|||
+cell flag
|
||||
+cell Show help message and available arguments.
|
||||
|
||||
+row("foot")
|
||||
+cell prints
|
||||
+cell #[code stdout]
|
||||
+cell Information about your spaCy installation.
|
||||
|
||||
+h(3, "validate") Validate
|
||||
+tag-new(2)
|
||||
|
||||
|
@ -129,6 +150,12 @@ p
|
|||
+code(false, "bash", "$").
|
||||
spacy validate
|
||||
|
||||
+table(["Argument", "Type", "Description"])
|
||||
+row("foot")
|
||||
+cell prints
|
||||
+cell #[code stdout]
|
||||
+cell Details about the compatibility of your installed models.
|
||||
|
||||
+h(3, "convert") Convert
|
||||
|
||||
p
|
||||
|
@ -172,6 +199,11 @@ p
|
|||
+cell flag
|
||||
+cell Show help message and available arguments.
|
||||
|
||||
+row("foot")
|
||||
+cell creates
|
||||
+cell JSON
|
||||
+cell Data in spaCy's #[+a("/api/annotation#json-input") JSON format].
|
||||
|
||||
p The following converters are available:
|
||||
|
||||
+table(["ID", "Description"])
|
||||
|
@ -286,6 +318,11 @@ p
|
|||
+cell flag
|
||||
+cell Show help message and available arguments.
|
||||
|
||||
+row("foot")
|
||||
+cell creates
|
||||
+cell model, pickle
|
||||
+cell A spaCy model on each epoch, and a final #[code .pickle] file.
|
||||
|
||||
+h(4, "train-hyperparams") Environment variables for hyperparameters
|
||||
+tag-new(2)
|
||||
|
||||
|
@ -395,6 +432,50 @@ p
|
|||
+cell Gradient L2 norm constraint.
|
||||
+cell #[code 1.0]
|
||||
|
||||
+h(3, "vocab") Vocab
|
||||
+tag-new(2)
|
||||
|
||||
p
|
||||
| Compile a vocabulary from a
|
||||
| #[+a("/api/annotation#vocab-jsonl") lexicon JSONL] file and optional
|
||||
| word vectors. Will save out a valid spaCy model that you can load via
|
||||
| #[+api("spacy#load") #[code spacy.load]] or package using the
|
||||
| #[+api("cli#package") #[code package]] command.
|
||||
|
||||
+code(false, "bash", "$").
|
||||
spacy vocab [lang] [output_dir] [lexemes_loc] [vectors_loc]
|
||||
|
||||
+table(["Argument", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code lang]
|
||||
+cell positional
|
||||
+cell
|
||||
| Model language
|
||||
| #[+a("https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes") ISO code],
|
||||
| e.g. #[code en].
|
||||
|
||||
+row
|
||||
+cell #[code output_dir]
|
||||
+cell positional
|
||||
+cell Model output directory. Will be created if it doesn't exist.
|
||||
|
||||
+row
|
||||
+cell #[code lexemes_loc]
|
||||
+cell positional
|
||||
+cell
|
||||
| Location of lexical data in spaCy's
|
||||
| #[+a("/api/annotation#vocab-jsonl") JSONL format].
|
||||
|
||||
+row
|
||||
+cell #[code vectors_loc]
|
||||
+cell positional
|
||||
+cell Optional location of vectors data as numpy #[code .npz] file.
|
||||
|
||||
+row("foot")
|
||||
+cell creates
|
||||
+cell model
|
||||
+cell A spaCy model containing the vocab and vectors.
|
||||
|
||||
+h(3, "evaluate") Evaluate
|
||||
+tag-new(2)
|
||||
|
||||
|
@ -447,22 +528,36 @@ p
|
|||
+cell flag
|
||||
+cell Use gold preprocessing.
|
||||
|
||||
+row("foot")
|
||||
+cell prints / creates
|
||||
+cell #[code stdout], HTML
|
||||
+cell Training results and optional displaCy visualizations.
|
||||
|
||||
|
||||
+h(3, "package") Package
|
||||
|
||||
p
|
||||
| Generate a #[+a("/usage/training#models-generating") model Python package]
|
||||
| from an existing model data directory. All data files are copied over.
|
||||
| If the path to a meta.json is supplied, or a meta.json is found in the
|
||||
| input directory, this file is used. Otherwise, the data can be entered
|
||||
| directly from the command line. The required file templates are downloaded
|
||||
| from #[+src(gh("spacy-dev-resources", "templates/model")) GitHub] to make
|
||||
| If the path to a #[code meta.json] is supplied, or a #[code meta.json] is
|
||||
| found in the input directory, this file is used. Otherwise, the data can
|
||||
| be entered directly from the command line. The required file templates
|
||||
| are downloaded from
|
||||
| #[+src(gh("spacy-dev-resources", "templates/model")) GitHub] to make
|
||||
| sure you're always using the latest versions. This means you need to be
|
||||
| connected to the internet to use this command.
|
||||
| connected to the internet to use this command. After packaging, you
|
||||
| can run #[code python setup.py sdist] from the newly created directory
|
||||
| to turn your model into an installable archive file.
|
||||
|
||||
+code(false, "bash", "$", false, false, true).
|
||||
spacy package [input_dir] [output_dir] [--meta-path] [--create-meta] [--force]
|
||||
|
||||
+aside-code("Example", "bash").
|
||||
spacy package /input /output
|
||||
cd /output/en_model-0.0.0
|
||||
python setup.py sdist
|
||||
pip install dist/en_model-0.0.0.tar.gz
|
||||
|
||||
+table(["Argument", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code input_dir]
|
||||
|
@ -477,15 +572,16 @@ p
|
|||
+row
|
||||
+cell #[code --meta-path], #[code -m]
|
||||
+cell option
|
||||
+cell #[+tag-new(2)] Path to meta.json file (optional).
|
||||
+cell #[+tag-new(2)] Path to #[code meta.json] file (optional).
|
||||
|
||||
+row
|
||||
+cell #[code --create-meta], #[code -c]
|
||||
+cell flag
|
||||
+cell
|
||||
| #[+tag-new(2)] Create a meta.json file on the command line, even
|
||||
| if one already exists in the directory.
|
||||
|
||||
| #[+tag-new(2)] Create a #[code meta.json] file on the command
|
||||
| line, even if one already exists in the directory. If an
|
||||
| existing file is found, its entries will be shown as the defaults
|
||||
| in the command line prompt.
|
||||
+row
|
||||
+cell #[code --force], #[code -f]
|
||||
+cell flag
|
||||
|
@ -495,3 +591,8 @@ p
|
|||
+cell #[code --help], #[code -h]
|
||||
+cell flag
|
||||
+cell Show help message and available arguments.
|
||||
|
||||
+row("foot")
|
||||
+cell creates
|
||||
+cell directory
|
||||
+cell A Python package containing the spaCy model.
|
|
@ -18,7 +18,3 @@ include ../_includes/_mixins
|
|||
+section("compat")
|
||||
+h(2, "compat", "spacy/compat.py") Compatibility functions
|
||||
include _top-level/_compat
|
||||
|
||||
+section("cli", "spacy/cli")
|
||||
+h(2, "cli") Command line
|
||||
include _top-level/_cli
|
||||
|
|
|
@ -162,7 +162,7 @@ p
|
|||
+cell int
|
||||
+cell The integer ID by which the flag value can be checked.
|
||||
|
||||
+h(2, "add_flag") Vocab.clear_vectors
|
||||
+h(2, "clear_vectors") Vocab.clear_vectors
|
||||
+tag method
|
||||
+tag-new(2)
|
||||
|
||||
|
@ -181,7 +181,50 @@ p
|
|||
| Number of dimensions of the new vectors. If #[code None], size
|
||||
| is not changed.
|
||||
|
||||
+h(2, "add_flag") Vocab.get_vector
|
||||
+h(2, "prune_vectors") Vocab.prune_vectors
|
||||
+tag method
|
||||
+tag-new(2)
|
||||
|
||||
p
|
||||
| Reduce the current vector table to #[code nr_row] unique entries. Words
|
||||
| mapped to the discarded vectors will be remapped to the closest vector
|
||||
| among those remaining. For example, suppose the original table had
|
||||
| vectors for the words:
|
||||
| #[code.u-break ['sat', 'cat', 'feline', 'reclined']]. If we prune the
|
||||
| vector table to two rows, we would discard the vectors for "feline"
|
||||
| and "reclined". These words would then be remapped to the closest
|
||||
| remaining vector – so "feline" would have the same vector as "cat",
|
||||
| and "reclined" would have the same vector as "sat". The similarities are
|
||||
| judged by cosine. The original vectors may be large, so the cosines are
|
||||
| calculated in minibatches, to reduce memory usage.
|
||||
|
||||
+aside-code("Example").
|
||||
nlp.vocab.prune_vectors(10000)
|
||||
assert len(nlp.vocab.vectors) <= 10000
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code nr_row]
|
||||
+cell int
|
||||
+cell The number of rows to keep in the vector table.
|
||||
|
||||
+row
|
||||
+cell #[code batch_size]
|
||||
+cell int
|
||||
+cell
|
||||
| Batch of vectors for calculating the similarities. Larger batch
|
||||
| sizes might be faster, while temporarily requiring more memory.
|
||||
|
||||
+row("foot")
|
||||
+cell returns
|
||||
+cell dict
|
||||
+cell
|
||||
| A dictionary keyed by removed words mapped to
|
||||
| #[code (string, score)] tuples, where #[code string] is the entry
|
||||
| the removed word was mapped to, and #[code score] the similarity
|
||||
| score between the two words.
|
||||
|
||||
+h(2, "get_vector") Vocab.get_vector
|
||||
+tag method
|
||||
+tag-new(2)
|
||||
|
||||
|
@ -206,7 +249,7 @@ p
|
|||
| A word vector. Size and shape are determined by the
|
||||
| #[code Vocab.vectors] instance.
|
||||
|
||||
+h(2, "add_flag") Vocab.set_vector
|
||||
+h(2, "set_vector") Vocab.set_vector
|
||||
+tag method
|
||||
+tag-new(2)
|
||||
|
||||
|
@ -228,7 +271,7 @@ p
|
|||
+cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
|
||||
+cell The vector to set.
|
||||
|
||||
+h(2, "add_flag") Vocab.has_vector
|
||||
+h(2, "has_vector") Vocab.has_vector
|
||||
+tag method
|
||||
+tag-new(2)
|
||||
|
||||
|
|
|
@ -48,6 +48,9 @@
|
|||
flex: 0 0 100%
|
||||
flex-flow: column wrap
|
||||
|
||||
&.o-grid__col--no-gutter
|
||||
margin-top: 0
|
||||
|
||||
// Fix overflow issue in old browsers
|
||||
|
||||
& > *
|
||||
|
|
|
@ -8,7 +8,7 @@
|
|||
align-items: center
|
||||
display: flex
|
||||
justify-content: space-between
|
||||
flex-flow: row wrap
|
||||
flex-flow: row nowrap
|
||||
padding: 0 2rem 0 1rem
|
||||
z-index: 30
|
||||
width: 100%
|
||||
|
|
|
@ -51,6 +51,7 @@
|
|||
@include scroll-shadow-base($color-front)
|
||||
display: inline-block
|
||||
overflow-x: auto
|
||||
overflow-y: hidden
|
||||
width: auto
|
||||
-webkit-overflow-scrolling: touch
|
||||
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
+h(2, "changelog") Changelog
|
||||
+button(gh("spacy") + "/releases", false, "secondary", "small").u-float-right.u-nowrap View releases
|
||||
|
||||
div(data-tpl="changelog" data-tpl-key="error")
|
||||
div(data-tpl="changelog" data-tpl-key="error" style="display: none")
|
||||
+infobox
|
||||
| Unable to load changelog from GitHub. Please see the
|
||||
| #[+a(gh("spacy") + "/releases") releases page] instead.
|
||||
|
|
|
@ -76,6 +76,16 @@ p
|
|||
("Google rebrands its business apps", [(0, 6, "ORG")]),
|
||||
("look what i found on google! 😂", [(21, 27, "PRODUCT")])]
|
||||
|
||||
+infobox("Tip: Try the Prodigy annotation tool")
|
||||
+infobox-logos(["prodigy", 100, 29, "https://prodi.gy"])
|
||||
| If you need to label a lot of data, check out
|
||||
| #[+a("https://prodi.gy", true) Prodigy], a new, active learning-powered
|
||||
| annotation tool we've developed. Prodigy is fast and extensible, and
|
||||
| comes with a modern #[strong web application] that helps you collect
|
||||
| training data faster. It integrates seamlessly with spaCy, pre-selects
|
||||
| the #[strong most relevant examples] for annotation, and lets you
|
||||
| train and evaluate ready-to-use spaCy models.
|
||||
|
||||
+h(3, "annotations") Training with annotations
|
||||
|
||||
p
|
||||
|
@ -180,9 +190,10 @@ p
|
|||
+cell #[code optimizer]
|
||||
+cell Callable to update the model's weights.
|
||||
|
||||
+infobox
|
||||
| For the #[strong full example and more details], see the usage guide on
|
||||
| #[+a("/usage/training#ner") training the named entity recognizer],
|
||||
| or the runnable
|
||||
| #[+src(gh("spaCy", "examples/training/train_ner.py")) training script]
|
||||
| on GitHub.
|
||||
p
|
||||
| Instead of writing your own training loop, you can also use the
|
||||
| built-in #[+api("cli#train") #[code train]] command, which expects data
|
||||
| in spaCy's #[+a("/api/annotation#json-input") JSON format]. On each epoch,
|
||||
| a model will be saved out to the directory. After training, you can
|
||||
| use the #[+api("cli#package") #[code package]] command to generate an
|
||||
| installable Python package from your model.
|
||||
|
|
|
@ -190,7 +190,3 @@ p
|
|||
|
||||
+item
|
||||
| #[strong Test] the model to make sure the parser works as expected.
|
||||
|
||||
+h(3, "training-json") JSON format for training
|
||||
|
||||
include ../../api/_annotation/_training
|
||||
|
|
Loading…
Reference in New Issue
Block a user