diff --git a/spacy/__init__.py b/spacy/__init__.py index 8dc0937f5..f9e29037f 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -1,10 +1,7 @@ # coding: utf8 from __future__ import unicode_literals -import importlib - -from .compat import basestring_ -from .cli.info import info +from .cli.info import info as cli_info from .glossary import explain from .deprecated import resolve_load_name from . import util @@ -12,11 +9,8 @@ from . import util def load(name, **overrides): name = resolve_load_name(name, **overrides) - model_path = util.resolve_model_path(name) - meta = util.parse_package_meta(model_path) - if 'lang' not in meta: - raise IOError('No language setting found in model meta.') - cls = util.get_lang_class(meta['lang']) - overrides['meta'] = meta - overrides['path'] = model_path - return cls(**overrides) + return util.load_model(name) + + +def info(model=None, markdown=False): + return cli_info(None, model, markdown) diff --git a/spacy/_ml.py b/spacy/_ml.py index 4667798b2..ac7849bbb 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -19,6 +19,8 @@ import numpy def _init_for_precomputed(W, ops): + if (W**2).sum() != 0.: + return reshaped = W.reshape((W.shape[1], W.shape[0] * W.shape[2])) ops.xavier_uniform_init(reshaped) W[:] = reshaped.reshape(W.shape) @@ -86,10 +88,10 @@ class PrecomputableAffine(Model): d_b=Gradient("b") ) class PrecomputableMaxouts(Model): - def __init__(self, nO=None, nI=None, nF=None, pieces=3, **kwargs): + def __init__(self, nO=None, nI=None, nF=None, nP=3, **kwargs): Model.__init__(self, **kwargs) self.nO = nO - self.nP = pieces + self.nP = nP self.nI = nI self.nF = nF @@ -247,6 +249,7 @@ def doc2feats(cols=None): model.cols = cols return model + def print_shape(prefix): def forward(X, drop=0.): return X, lambda dX, **kwargs: dX diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py index c7730ab9e..a0a76e5ec 100644 --- a/spacy/cli/convert.py +++ b/spacy/cli/convert.py @@ -7,7 +7,6 @@ from pathlib import Path from .converters 
import conllu2json, iob2json from ..util import prints - # Converters are matched by file extension. To add a converter, add a new entry # to this dict with the file extension mapped to the converter function imported # from /converters. @@ -25,8 +24,9 @@ CONVERTERS = { n_sents=("Number of sentences per doc", "option", "n", float), morphology=("Enable appending morphology to tags", "flag", "m", bool) ) -def convert(input_file, output_dir, n_sents, morphology): - """Convert files into JSON format for use with train command and other +def convert(cmd, input_file, output_dir, n_sents, morphology): + """ + Convert files into JSON format for use with train command and other experiment management functions. """ input_path = Path(input_file) @@ -39,4 +39,5 @@ def convert(input_file, output_dir, n_sents, morphology): if not file_ext in CONVERTERS: prints("Can't find converter for %s" % input_path.parts[-1], title="Unknown format", exits=1) - CONVERTERS[file_ext](input_path, output_path, *args) + CONVERTERS[file_ext](input_path, output_path, + n_sents=n_sents, use_morphology=morphology) diff --git a/spacy/cli/converters/iob2json.py b/spacy/cli/converters/iob2json.py index 45393dd80..c2e944c0a 100644 --- a/spacy/cli/converters/iob2json.py +++ b/spacy/cli/converters/iob2json.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals from ...compat import json_dumps, path2str from ...util import prints +from ...gold import iob_to_biluo def iob2json(input_path, output_path, n_sents=10, *a, **k): @@ -29,9 +30,10 @@ def read_iob(file_): continue tokens = [t.rsplit('|', 2) for t in line.split()] words, pos, iob = zip(*tokens) + biluo = iob_to_biluo(iob) sentences.append([ {'orth': w, 'tag': p, 'ner': ent} - for (w, p, ent) in zip(words, pos, iob) + for (w, p, ent) in zip(words, pos, biluo) ]) sentences = [{'tokens': sent} for sent in sentences] paragraphs = [{'sentences': [sent]} for sent in sentences] diff --git a/spacy/cli/download.py b/spacy/cli/download.py index 
fdcacb891..b6e5549da 100644 --- a/spacy/cli/download.py +++ b/spacy/cli/download.py @@ -17,8 +17,9 @@ from .. import about direct=("force direct download. Needs model name with version and won't " "perform compatibility check", "flag", "d", bool) ) -def download(model, direct=False): - """Download compatible model from default download path using pip. Model +def download(cmd, model, direct=False): + """ + Download compatible model from default download path using pip. Model can be shortcut, model name or, if --direct flag is set, full model name with version. """ @@ -31,7 +32,7 @@ def download(model, direct=False): version = get_version(model_name, compatibility) download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model_name, v=version)) try: - link(model_name, model, force=True) + link(None, model_name, model, force=True) except: # Dirty, but since spacy.download and the auto-linking is mostly # a convenience wrapper, it's best to show a success message and diff --git a/spacy/cli/info.py b/spacy/cli/info.py index 6f7467521..70f054d84 100644 --- a/spacy/cli/info.py +++ b/spacy/cli/info.py @@ -14,14 +14,20 @@ from .. import util model=("optional: shortcut link of model", "positional", None, str), markdown=("generate Markdown for GitHub issues", "flag", "md", str) ) -def info(model=None, markdown=False): +def info(cmd, model=None, markdown=False): """Print info about spaCy installation. If a model shortcut link is speficied as an argument, print model information. Flag --markdown prints details in Markdown for easy copy-pasting to GitHub issues. 
""" if model: - model_path = util.resolve_model_path(model) - meta = util.parse_package_meta(model_path) + if util.is_package(model): + model_path = util.get_package_path(model) + else: + model_path = util.get_data_path() / model + meta_path = model_path / 'meta.json' + if not meta_path.is_file(): + prints(meta_path, title="Can't find model meta.json", exits=1) + meta = read_json(meta_path) if model_path.resolve() != model_path: meta['link'] = path2str(model_path) meta['source'] = path2str(model_path.resolve()) diff --git a/spacy/cli/link.py b/spacy/cli/link.py index 1feef8bce..66824c042 100644 --- a/spacy/cli/link.py +++ b/spacy/cli/link.py @@ -14,13 +14,14 @@ from .. import util link_name=("name of shortuct link to create", "positional", None, str), force=("force overwriting of existing link", "flag", "f", bool) ) -def link(origin, link_name, force=False): - """Create a symlink for models within the spacy/data directory. Accepts +def link(cmd, origin, link_name, force=False): + """ + Create a symlink for models within the spacy/data directory. Accepts either the name of a pip package, or the local path to the model data directory. Linking models allows loading them via spacy.load(link_name). """ if util.is_package(origin): - model_path = util.get_model_package_path(origin) + model_path = util.get_package_path(model) else: model_path = Path(origin) if not model_path.exists(): diff --git a/spacy/cli/package.py b/spacy/cli/package.py index 9acd0a2fa..e78a4eeb4 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -18,8 +18,9 @@ from .. 
import about meta=("path to meta.json", "option", "m", str), force=("force overwriting of existing folder in output directory", "flag", "f", bool) ) -def package(input_dir, output_dir, meta, force): - """Generate Python package for model data, including meta and required +def package(cmd, input_dir, output_dir, meta=None, force=False): + """ + Generate Python package for model data, including meta and required installation files. A new directory will be created in the specified output directory, and model data will be copied over. """ @@ -42,7 +43,7 @@ def package(input_dir, output_dir, meta, force): meta = util.read_json(meta_path) else: meta = generate_meta() - validate_meta(meta, ['lang', 'name', 'version']) + meta = validate_meta(meta, ['lang', 'name', 'version']) model_name = meta['lang'] + '_' + meta['name'] model_name_v = model_name + '-' + meta['version'] @@ -85,20 +86,32 @@ def generate_meta(): ('email', 'Author email', False), ('url', 'Author website', False), ('license', 'License', 'CC BY-NC 3.0')] - prints("Enter the package settings for your model.", title="Generating meta.json") meta = {} for setting, desc, default in settings: response = util.get_raw_input(desc, default) meta[setting] = default if response == '' and default else response + meta['pipeline'] = generate_pipeline() return meta +def generate_pipeline(): + prints("If set to 'True', the default pipeline is used. If set to 'False', " + "the pipeline will be disabled. Components should be specified as a " + "comma-separated list of component names, e.g. vectorizer, tagger, " + "parser, ner. 
For more information, see the docs on processing pipelines.", + title="Enter your model's pipeline components") + pipeline = util.get_raw_input("Pipeline components", True) + replace = {'True': True, 'False': False} + return replace[pipeline] if pipeline in replace else pipeline.split(', ') + + def validate_meta(meta, keys): for key in keys: if key not in meta or meta[key] == '': prints("This setting is required to build your package.", title='No "%s" setting found in meta.json' % key, exits=1) + return meta def get_template(filepath): diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 07e97fe1e..b1e9446ed 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -14,7 +14,7 @@ from timeit import default_timer as timer from ..tokens.doc import Doc from ..scorer import Scorer from ..gold import GoldParse, merge_sents -from ..gold import GoldCorpus +from ..gold import GoldCorpus, minibatch from ..util import prints from .. import util from .. import displacy @@ -32,9 +32,11 @@ from .. import displacy no_parser=("Don't train parser", "flag", "P", bool), no_entities=("Don't train NER", "flag", "N", bool) ) -def train(_, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0, +def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0, use_gpu=False, no_tagger=False, no_parser=False, no_entities=False): - """Train a model. Expects data in spaCy's JSON format.""" + """ + Train a model. Expects data in spaCy's JSON format. + """ n_sents = n_sents or None output_path = util.ensure_path(output_dir) train_path = util.ensure_path(train_data) @@ -53,45 +55,48 @@ def train(_, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0, if no_parser and 'dependencies' in pipeline: pipeline.remove('dependencies') if no_entities and 'entities' in pipeline: pipeline.remove('entities') + # Take dropout and batch size as generators of values -- dropout + # starts high and decays sharply, to force the optimizer to explore. 
+ # Batch size starts at 1 and grows, so that we make updates quickly + # at the beginning of training. + dropout_rates = util.decaying(util.env_opt('dropout_from', 0.2), + util.env_opt('dropout_to', 0.2), + util.env_opt('dropout_decay', 0.0)) + batch_sizes = util.compounding(util.env_opt('batch_from', 1), + util.env_opt('batch_to', 64), + util.env_opt('batch_compound', 1.001)) + nlp = lang_class(pipeline=pipeline) corpus = GoldCorpus(train_path, dev_path, limit=n_sents) - - dropout = util.env_opt('dropout', 0.0) - dropout_decay = util.env_opt('dropout_decay', 0.0) - orig_dropout = dropout + n_train_docs = corpus.count_train() optimizer = nlp.begin_training(lambda: corpus.train_tuples, use_gpu=use_gpu) - n_train_docs = corpus.count_train() - batch_size = float(util.env_opt('min_batch_size', 4)) - max_batch_size = util.env_opt('max_batch_size', 64) - batch_accel = util.env_opt('batch_accel', 1.001) - print("Itn.\tDep. Loss\tUAS\tNER P.\tNER R.\tNER F.\tTag %\tToken %") - for i in range(n_iter): - with tqdm.tqdm(total=n_train_docs) as pbar: - train_docs = corpus.train_docs(nlp, shuffle=i, projectivize=True) - idx = 0 - while idx < n_train_docs: - batch = list(cytoolz.take(int(batch_size), train_docs)) - if not batch: - break - docs, golds = zip(*batch) - nlp.update(docs, golds, drop=dropout, sgd=optimizer) - pbar.update(len(docs)) - idx += len(docs) - batch_size *= batch_accel - batch_size = min(batch_size, max_batch_size) - dropout = linear_decay(orig_dropout, dropout_decay, i*n_train_docs+idx) - with nlp.use_params(optimizer.averages): - start = timer() - scorer = nlp.evaluate(corpus.dev_docs(nlp)) - end = timer() - n_words = scorer.tokens.tp + scorer.tokens.fn - assert n_words != 0 - wps = n_words / (end-start) - print_progress(i, {}, scorer.scores, wps=wps) - with (output_path / 'model.bin').open('wb') as file_: - with nlp.use_params(optimizer.averages): - dill.dump(nlp, file_, -1) + + print("Itn.\tLoss\tUAS\tNER P.\tNER R.\tNER F.\tTag %\tToken %") + try: + for 
i in range(n_iter): + with tqdm.tqdm(total=corpus.count_train(), leave=False) as pbar: + train_docs = corpus.train_docs(nlp, projectivize=True, + gold_preproc=False, max_length=0) + losses = {} + for batch in minibatch(train_docs, size=batch_sizes): + docs, golds = zip(*batch) + nlp.update(docs, golds, sgd=optimizer, + drop=next(dropout_rates), losses=losses) + pbar.update(len(docs)) + + with nlp.use_params(optimizer.averages): + with (output_path / ('model%d.pickle' % i)).open('wb') as file_: + dill.dump(nlp, file_, -1) + with (output_path / ('model%d.pickle' % i)).open('rb') as file_: + nlp_loaded = dill.load(file_) + scorer = nlp_loaded.evaluate(corpus.dev_docs(nlp_loaded, gold_preproc=False)) + print_progress(i, losses, scorer.scores) + finally: + print("Saving model...") + with (output_path / 'model-final.pickle').open('wb') as file_: + with nlp.use_params(optimizer.averages): + dill.dump(nlp, file_, -1) def _render_parses(i, to_render): @@ -109,13 +114,13 @@ def print_progress(itn, losses, dev_scores, wps=0.0): for col in ['dep_loss', 'tag_loss', 'uas', 'tags_acc', 'token_acc', 'ents_p', 'ents_r', 'ents_f', 'wps']: scores[col] = 0.0 - scores.update(losses) + scores['dep_loss'] = losses.get('parser', 0.0) + scores['tag_loss'] = losses.get('tagger', 0.0) scores.update(dev_scores) - scores[wps] = wps + scores['wps'] = wps tpl = '\t'.join(( '{:d}', '{dep_loss:.3f}', - '{tag_loss:.3f}', '{uas:.3f}', '{ents_p:.3f}', '{ents_r:.3f}', diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 53bd25890..faf135b00 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -6,6 +6,7 @@ import io import re import ujson import random +import cytoolz from .syntax import nonproj from .util import ensure_path @@ -141,6 +142,19 @@ def _min_edit_path(cand_words, gold_words): return prev_costs[n_gold], previous_row[-1] +def minibatch(items, size=8): + '''Iterate over batches of items. `size` may be an iterator, + so that batch-size can vary on each step. 
+ ''' + items = iter(items) + while True: + batch_size = next(size) #if hasattr(size, '__next__') else size + batch = list(cytoolz.take(int(batch_size), items)) + if len(batch) == 0: + break + yield list(batch) + + class GoldCorpus(object): """An annotated corpus, using the JSON file format. Manages annotations for tagging, dependency parsing and NER.""" @@ -184,15 +198,15 @@ class GoldCorpus(object): n += 1 return n - def train_docs(self, nlp, shuffle=0, gold_preproc=False, - projectivize=False): + def train_docs(self, nlp, gold_preproc=False, + projectivize=False, max_length=None): train_tuples = self.train_tuples if projectivize: train_tuples = nonproj.preprocess_training_data( self.train_tuples) - if shuffle: - random.shuffle(train_tuples) - gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc) + random.shuffle(train_tuples) + gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc, + max_length=max_length) yield from gold_docs def dev_docs(self, nlp, gold_preproc=False): @@ -201,7 +215,7 @@ class GoldCorpus(object): yield from gold_docs @classmethod - def iter_gold_docs(cls, nlp, tuples, gold_preproc): + def iter_gold_docs(cls, nlp, tuples, gold_preproc, max_length=None): for raw_text, paragraph_tuples in tuples: if gold_preproc: raw_text = None @@ -212,7 +226,8 @@ class GoldCorpus(object): gold_preproc) golds = cls._make_golds(docs, paragraph_tuples) for doc, gold in zip(docs, golds): - yield doc, gold + if not max_length or len(doc) < max_length: + yield doc, gold @classmethod def _make_docs(cls, nlp, raw_text, paragraph_tuples, gold_preproc): @@ -291,7 +306,7 @@ def read_json_file(loc, docs_filter=None, limit=None): yield [paragraph.get('raw', None), sents] -def _iob_to_biluo(tags): +def iob_to_biluo(tags): out = [] curr_label = None tags = list(tags) @@ -396,7 +411,10 @@ cdef class GoldParse: else: self.words[i] = words[gold_i] self.tags[i] = tags[gold_i] - self.heads[i] = self.gold_to_cand[heads[gold_i]] + if heads[gold_i] is None: + 
self.heads[i] = None + else: + self.heads[i] = self.gold_to_cand[heads[gold_i]] self.labels[i] = deps[gold_i] self.ner[i] = entities[gold_i] diff --git a/spacy/lang/bn/punctuation.py b/spacy/lang/bn/punctuation.py index 66b7d967c..96485dd55 100644 --- a/spacy/lang/bn/punctuation.py +++ b/spacy/lang/bn/punctuation.py @@ -1,8 +1,8 @@ # coding: utf8 from __future__ import unicode_literals -from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, UNITS -from ..char_classes import ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS, QUOTES +from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS +from ..char_classes import ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS, QUOTES, UNITS _currency = r"\$|¢|£|€|¥|฿|৳" @@ -10,16 +10,16 @@ _quotes = QUOTES.replace("'", '') _list_punct = LIST_PUNCT + '। ॥'.strip().split() -_prefixes = ([r'\+'] + _list_punct + LIST_ELLIPSES + LIST_QUOTES) +_prefixes = ([r'\+'] + _list_punct + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS) -_suffixes = (_list_punct + LIST_ELLIPSES + LIST_QUOTES + +_suffixes = (_list_punct + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS + [r'(?<=[0-9])\+', r'(?<=°[FfCcKk])\.', r'(?<=[0-9])(?:{})'.format(_currency), r'(?<=[0-9])(?:{})'.format(UNITS), r'(?<=[{}(?:{})])\.'.format('|'.join([ALPHA_LOWER, r'%²\-\)\]\+', QUOTES]), _currency)]) -_infixes = (LIST_ELLIPSES + +_infixes = (LIST_ELLIPSES + LIST_ICONS + [r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER), r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA), r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA), diff --git a/spacy/lang/char_classes.py b/spacy/lang/char_classes.py index 5b81eddde..bec685646 100644 --- a/spacy/lang/char_classes.py +++ b/spacy/lang/char_classes.py @@ -20,7 +20,6 @@ _upper = [_latin_upper] _lower = [_latin_lower] _uncased = [_bengali, _hebrew] - ALPHA = merge_char_classes(_upper + _lower + _uncased) ALPHA_LOWER = merge_char_classes(_lower + _uncased) ALPHA_UPPER = merge_char_classes(_upper + _uncased) @@ -33,13 +32,14 @@ _currency = r'\$ £ € 
¥ ฿ US\$ C\$ A\$' _punct = r'… , : ; \! \? ¿ ¡ \( \) \[ \] \{ \} < > _ # \* &' _quotes = r'\' \'\' " ” “ `` ` ‘ ´ ‚ , „ » «' _hyphens = '- – — -- ---' - +_other_symbols = r'[\p{So}]' UNITS = merge_chars(_units) CURRENCY = merge_chars(_currency) QUOTES = merge_chars(_quotes) PUNCT = merge_chars(_punct) HYPHENS = merge_chars(_hyphens) +ICONS = _other_symbols LIST_UNITS = split_chars(_units) LIST_CURRENCY = split_chars(_currency) @@ -47,3 +47,4 @@ LIST_QUOTES = split_chars(_quotes) LIST_PUNCT = split_chars(_punct) LIST_HYPHENS = split_chars(_hyphens) LIST_ELLIPSES = [r'\.\.+', '…'] +LIST_ICONS = [_other_symbols] diff --git a/spacy/lang/en/__init__.py b/spacy/lang/en/__init__.py index 7b7d4e1bb..7e1da789b 100644 --- a/spacy/lang/en/__init__.py +++ b/spacy/lang/en/__init__.py @@ -35,4 +35,4 @@ class English(Language): Defaults = EnglishDefaults -__all__ = ['English', 'EnglishDefaults'] +__all__ = ['English'] diff --git a/spacy/lang/punctuation.py b/spacy/lang/punctuation.py index 74bb28f5f..680f5cff0 100644 --- a/spacy/lang/punctuation.py +++ b/spacy/lang/punctuation.py @@ -2,15 +2,16 @@ from __future__ import unicode_literals from .char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY -from .char_classes import ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS, QUOTES -from .char_classes import CURRENCY, UNITS +from .char_classes import LIST_ICONS, ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS +from .char_classes import QUOTES, CURRENCY, UNITS _prefixes = (['§', '%', '=', r'\+'] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + - LIST_CURRENCY) + LIST_CURRENCY + LIST_ICONS) -_suffixes = (["'s", "'S", "’s", "’S"] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + +_suffixes = (LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS + + ["'s", "'S", "’s", "’S"] + [r'(?<=[0-9])\+', r'(?<=°[FfCcKk])\.', r'(?<=[0-9])(?:{})'.format(CURRENCY), @@ -19,7 +20,7 @@ _suffixes = (["'s", "'S", "’s", "’S"] + LIST_PUNCT + LIST_ELLIPSES + LIST_QU r'(?<=[{a}][{a}])\.'.format(a=ALPHA_UPPER)]) 
-_infixes = (LIST_ELLIPSES + +_infixes = (LIST_ELLIPSES + LIST_ICONS + [r'(?<=[0-9])[+\-\*^](?=[0-9-])', r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER), r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA), diff --git a/spacy/lang/xx/__init__.py b/spacy/lang/xx/__init__.py new file mode 100644 index 000000000..fef8c9d59 --- /dev/null +++ b/spacy/lang/xx/__init__.py @@ -0,0 +1,26 @@ +# coding: utf8 +from __future__ import unicode_literals + + +from ..tokenizer_exceptions import BASE_EXCEPTIONS +from ...language import Language +from ...attrs import LANG +from ...util import update_exc + + +class MultiLanguageDefaults(Language.Defaults): + lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters[LANG] = lambda text: 'xx' + + tokenizer_exceptions = update_exc(BASE_EXCEPTIONS) + + +class MultiLanguage(Language): + """Language class to be used for models that support multiple languages. + This module allows models to specify their language ID as 'xx'. + """ + lang = 'xx' + Defaults = MultiLanguageDefaults + + +__all__ = ['MultiLanguage'] diff --git a/spacy/language.py b/spacy/language.py index 23bbe1719..e874dbb78 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -6,7 +6,8 @@ import dill import numpy from thinc.neural import Model from thinc.neural.ops import NumpyOps, CupyOps -from thinc.neural.optimizers import Adam +from thinc.neural.optimizers import Adam, SGD +import random from .tokenizer import Tokenizer from .vocab import Vocab @@ -172,13 +173,13 @@ class Language(object): flat_list.append(pipe) self.pipeline = flat_list - def __call__(self, text, **disabled): + def __call__(self, text, disable=[]): """'Apply the pipeline to some text. The text can span multiple sentences, and can contain arbtrary whitespace. Alignment into the original string is preserved. text (unicode): The text to be processed. - **disabled: Elements of the pipeline that should not be run. + disable (list): Names of the pipeline components to disable. 
RETURNS (Doc): A container for accessing the annotations. EXAMPLE: @@ -189,12 +190,12 @@ class Language(object): doc = self.make_doc(text) for proc in self.pipeline: name = getattr(proc, 'name', None) - if name in disabled and not disabled[name]: + if name in disable: continue proc(doc) return doc - def update(self, docs, golds, drop=0., sgd=None): + def update(self, docs, golds, drop=0., sgd=None, losses=None): """Update the models in the pipeline. docs (iterable): A batch of `Doc` objects. @@ -211,12 +212,21 @@ class Language(object): """ tok2vec = self.pipeline[0] feats = tok2vec.doc2feats(docs) - for proc in self.pipeline[1:]: + grads = {} + def get_grads(W, dW, key=None): + grads[key] = (W, dW) + pipes = list(self.pipeline[1:]) + random.shuffle(pipes) + for proc in pipes: if not hasattr(proc, 'update'): continue tokvecses, bp_tokvecses = tok2vec.model.begin_update(feats, drop=drop) - d_tokvecses = proc.update((docs, tokvecses), golds, sgd=sgd, drop=drop) - bp_tokvecses(d_tokvecses, sgd=sgd) + d_tokvecses = proc.update((docs, tokvecses), golds, + drop=drop, sgd=get_grads, losses=losses) + if d_tokvecses is not None: + bp_tokvecses(d_tokvecses, sgd=sgd) + for key, (W, dW) in grads.items(): + sgd(W, dW, key=key) # Clear the tensor variable, to free GPU memory. # If we don't do this, the memory leak gets pretty # bad, because we may be holding part of a batch. @@ -260,13 +270,20 @@ class Language(object): if cfg.get('use_gpu'): Model.ops = CupyOps() Model.Ops = CupyOps - print("Use GPU") for proc in self.pipeline: if hasattr(proc, 'begin_training'): context = proc.begin_training(get_gold_tuples(), pipeline=self.pipeline) contexts.append(context) - optimizer = Adam(Model.ops, 0.001) + learn_rate = util.env_opt('learn_rate', 0.001) + beta1 = util.env_opt('optimizer_B1', 0.9) + beta2 = util.env_opt('optimizer_B2', 0.999) + eps = util.env_opt('optimizer_eps', 1e-08) + L2 = util.env_opt('L2_penalty', 1e-6) + max_grad_norm = util.env_opt('grad_norm_clip', 1.) 
+ optimizer = Adam(Model.ops, learn_rate, L2=L2, beta1=beta1, + beta2=beta2, eps=eps) + optimizer.max_grad_norm = max_grad_norm return optimizer def evaluate(self, docs_golds): @@ -306,7 +323,7 @@ class Language(object): except StopIteration: pass - def pipe(self, texts, n_threads=2, batch_size=1000, **disabled): + def pipe(self, texts, n_threads=2, batch_size=1000, disable=[]): """Process texts as a stream, and yield `Doc` objects in order. Supports GIL-free multi-threading. @@ -314,7 +331,7 @@ class Language(object): n_threads (int): The number of worker threads to use. If -1, OpenMP will decide how many to use at run time. Default is 2. batch_size (int): The number of texts to buffer. - **disabled: Pipeline components to exclude. + disable (list): Names of the pipeline components to disable. YIELDS (Doc): Documents in the order of the original text. EXAMPLE: @@ -326,7 +343,7 @@ class Language(object): docs = texts for proc in self.pipeline: name = getattr(proc, 'name', None) - if name in disabled and not disabled[name]: + if name in disable: continue if hasattr(proc, 'pipe'): docs = proc.pipe(docs, n_threads=n_threads, batch_size=batch_size) @@ -336,12 +353,14 @@ class Language(object): for doc in docs: yield doc - def to_disk(self, path, **exclude): - """Save the current state to a directory. + def to_disk(self, path, disable=[]): + """Save the current state to a directory. If a model is loaded, this + will include the model. path (unicode or Path): A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. - **exclude: Named attributes to prevent from being saved. + disable (list): Nameds of pipeline components to disable and prevent + from being saved. 
EXAMPLE: >>> nlp.to_disk('/path/to/models') @@ -353,7 +372,7 @@ class Language(object): raise IOError("Output path must be a directory") props = {} for name, value in self.__dict__.items(): - if name in exclude: + if name in disable: continue if hasattr(value, 'to_disk'): value.to_disk(path / name) @@ -362,13 +381,14 @@ class Language(object): with (path / 'props.pickle').open('wb') as file_: dill.dump(props, file_) - def from_disk(self, path, **exclude): + def from_disk(self, path, disable=[]): """Loads state from a directory. Modifies the object in place and - returns it. + returns it. If the saved `Language` object contains a model, the + model will be loaded. path (unicode or Path): A path to a directory. Paths may be either strings or `Path`-like objects. - **exclude: Named attributes to prevent from being loaded. + disable (list): Names of the pipeline components to disable. RETURNS (Language): The modified `Language` object. EXAMPLE: @@ -377,35 +397,36 @@ class Language(object): """ path = util.ensure_path(path) for name in path.iterdir(): - if name not in exclude and hasattr(self, str(name)): + if name not in disable and hasattr(self, str(name)): getattr(self, name).from_disk(path / name) with (path / 'props.pickle').open('rb') as file_: bytes_data = file_.read() - self.from_bytes(bytes_data, **exclude) + self.from_bytes(bytes_data, disable) return self - def to_bytes(self, **exclude): + def to_bytes(self, disable=[]): """Serialize the current state to a binary string. - **exclude: Named attributes to prevent from being serialized. + disable (list): Nameds of pipeline components to disable and prevent + from being serialized. RETURNS (bytes): The serialized form of the `Language` object. """ props = dict(self.__dict__) - for key in exclude: + for key in disable: if key in props: props.pop(key) return dill.dumps(props, -1) - def from_bytes(self, bytes_data, **exclude): + def from_bytes(self, bytes_data, disable=[]): """Load state from a binary string. 
bytes_data (bytes): The data to load from. - **exclude: Named attributes to prevent from being loaded. + disable (list): Names of the pipeline components to disable. RETURNS (Language): The `Language` object. """ props = dill.loads(bytes_data) for key, value in props.items(): - if key not in exclude: + if key not in disable: setattr(self, key, value) return self diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx index af71b1ad6..37e5be382 100644 --- a/spacy/pipeline.pyx +++ b/spacy/pipeline.pyx @@ -43,7 +43,7 @@ class TokenVectorEncoder(object): name = 'tok2vec' @classmethod - def Model(cls, width=128, embed_size=5000, **cfg): + def Model(cls, width=128, embed_size=7500, **cfg): """Create a new statistical model for the class. width (int): Output size of the model. @@ -119,7 +119,7 @@ class TokenVectorEncoder(object): assert tokvecs.shape[0] == len(doc) doc.tensor = tokvecs - def update(self, docs, golds, state=None, drop=0., sgd=None): + def update(self, docs, golds, state=None, drop=0., sgd=None, losses=None): """Update the model. docs (iterable): A batch of `Doc` objects. 
@@ -199,7 +199,7 @@ class NeuralTagger(object): vocab.morphology.assign_tag_id(&doc.c[j], tag_id) idx += 1 - def update(self, docs_tokvecs, golds, drop=0., sgd=None): + def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None): docs, tokvecs = docs_tokvecs if self.model.nI is None: @@ -228,6 +228,7 @@ class NeuralTagger(object): idx += 1 correct = self.model.ops.xp.array(correct, dtype='i') d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1]) + d_scores /= d_scores.shape[0] loss = (d_scores**2).sum() d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs]) return float(loss), d_scores @@ -248,7 +249,8 @@ class NeuralTagger(object): vocab.morphology.lemmatizer) token_vector_width = pipeline[0].model.nO self.model = with_flatten( - Softmax(self.vocab.morphology.n_tags, token_vector_width)) + chain(Maxout(token_vector_width, token_vector_width), + Softmax(self.vocab.morphology.n_tags, token_vector_width))) def use_params(self, params): with self.model.use_params(params): @@ -274,7 +276,8 @@ class NeuralLabeller(NeuralTagger): self.labels[dep] = len(self.labels) token_vector_width = pipeline[0].model.nO self.model = with_flatten( - Softmax(len(self.labels), token_vector_width)) + chain(Maxout(token_vector_width, token_vector_width), + Softmax(len(self.labels), token_vector_width))) def get_loss(self, docs, golds, scores): scores = self.model.ops.flatten(scores) @@ -290,6 +293,7 @@ class NeuralLabeller(NeuralTagger): idx += 1 correct = self.model.ops.xp.array(correct, dtype='i') d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1]) + d_scores /= d_scores.shape[0] loss = (d_scores**2).sum() d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs]) return float(loss), d_scores @@ -333,6 +337,9 @@ cdef class NeuralDependencyParser(NeuralParser): name = 'parser' TransitionSystem = ArcEager + def __reduce__(self): + return (NeuralDependencyParser, (self.vocab, self.moves, self.model), None, None) + 
cdef class NeuralEntityRecognizer(NeuralParser): name = 'entity' @@ -340,6 +347,10 @@ cdef class NeuralEntityRecognizer(NeuralParser): nr_feature = 6 + def __reduce__(self): + return (NeuralEntityRecognizer, (self.vocab, self.moves, self.model), None, None) + + cdef class BeamDependencyParser(BeamParser): TransitionSystem = ArcEager diff --git a/spacy/syntax/_state.pxd b/spacy/syntax/_state.pxd index 829779dc1..0b29412bf 100644 --- a/spacy/syntax/_state.pxd +++ b/spacy/syntax/_state.pxd @@ -335,16 +335,18 @@ cdef cppclass StateC: this._break = this._b_i void clone(const StateC* src) nogil: + this.length = src.length memcpy(this._sent, src._sent, this.length * sizeof(TokenC)) memcpy(this._stack, src._stack, this.length * sizeof(int)) memcpy(this._buffer, src._buffer, this.length * sizeof(int)) memcpy(this._ents, src._ents, this.length * sizeof(Entity)) memcpy(this.shifted, src.shifted, this.length * sizeof(this.shifted[0])) - this.length = src.length this._b_i = src._b_i this._s_i = src._s_i this._e_i = src._e_i this._break = src._break + this.offset = src.offset + this._empty_token = src._empty_token void fast_forward() nogil: # space token attachement policy: diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index 7531b2180..0b615ed49 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -9,6 +9,7 @@ import ctypes from libc.stdint cimport uint32_t from libc.string cimport memcpy from cymem.cymem cimport Pool +from collections import OrderedDict from .stateclass cimport StateClass from ._state cimport StateC, is_space_token @@ -310,12 +311,13 @@ cdef class ArcEager(TransitionSystem): @classmethod def get_actions(cls, **kwargs): actions = kwargs.get('actions', - { - SHIFT: [''], - REDUCE: [''], - RIGHT: [], - LEFT: [], - BREAK: ['ROOT']}) + OrderedDict(( + (SHIFT, ['']), + (REDUCE, ['']), + (RIGHT, []), + (LEFT, []), + (BREAK, ['ROOT']) + ))) seen_actions = set() for label in kwargs.get('left_labels', []): if label.upper() != 
'ROOT': @@ -348,8 +350,15 @@ cdef class ArcEager(TransitionSystem): def __get__(self): return (SHIFT, REDUCE, LEFT, RIGHT, BREAK) + def has_gold(self, GoldParse gold, start=0, end=None): + end = end or len(gold.heads) + if all([tag is None for tag in gold.heads[start:end]]): + return False + else: + return True + def preprocess_gold(self, GoldParse gold): - if all([h is None for h in gold.heads]): + if not self.has_gold(gold): return None for i in range(gold.length): if gold.heads[i] is None: # Missing values diff --git a/spacy/syntax/ner.pyx b/spacy/syntax/ner.pyx index 74ab9c26c..f8db0a433 100644 --- a/spacy/syntax/ner.pyx +++ b/spacy/syntax/ner.pyx @@ -2,6 +2,7 @@ from __future__ import unicode_literals from thinc.typedefs cimport weight_t +from collections import OrderedDict from .stateclass cimport StateClass from ._state cimport StateC @@ -51,17 +52,29 @@ cdef bint _entity_is_sunk(StateClass st, Transition* golds) nogil: cdef class BiluoPushDown(TransitionSystem): + def __init__(self, *args, **kwargs): + TransitionSystem.__init__(self, *args, **kwargs) + + def __reduce__(self): + labels_by_action = OrderedDict() + cdef Transition t + for trans in self.c[:self.n_moves]: + label_str = self.strings[trans.label] + labels_by_action.setdefault(trans.move, []).append(label_str) + return (BiluoPushDown, (self.strings, labels_by_action), + None, None) + @classmethod def get_actions(cls, **kwargs): actions = kwargs.get('actions', - { - MISSING: [''], - BEGIN: [], - IN: [], - LAST: [], - UNIT: [], - OUT: [''] - }) + OrderedDict(( + (MISSING, ['']), + (BEGIN, []), + (IN, []), + (LAST, []), + (UNIT, []), + (OUT, ['']) + ))) seen_entities = set() for entity_type in kwargs.get('entity_types', []): if entity_type in seen_entities: @@ -90,13 +103,20 @@ cdef class BiluoPushDown(TransitionSystem): def move_name(self, int move, int label): if move == OUT: return 'O' - elif move == 'MISSING': + elif move == MISSING: return 'M' else: return MOVE_NAMES[move] + '-' + 
self.strings[label] + def has_gold(self, GoldParse gold, start=0, end=None): + end = end or len(gold.ner) + if all([tag == '-' for tag in gold.ner[start:end]]): + return False + else: + return True + def preprocess_gold(self, GoldParse gold): - if all([tag == '-' for tag in gold.ner]): + if not self.has_gold(gold): return None for i in range(gold.length): gold.c.ner[i] = self.lookup_transition(gold.ner[i]) diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 5b7752abb..d6e050713 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -249,11 +249,13 @@ cdef class Parser: with Model.use_device('cpu'): if depth == 0: upper = chain() + upper.is_noop = True else: upper = chain( clone(Maxout(hidden_width), (depth-1)), - zero_init(Affine(nr_class)) + zero_init(Affine(nr_class, drop_factor=0.0)) ) + upper.is_noop = False # TODO: This is an unfortunate hack atm! # Used to set input dimensions in network. lower.begin_training(lower.ops.allocate((500, token_vector_width))) @@ -364,7 +366,7 @@ cdef class Parser: cdef np.ndarray scores c_token_ids = token_ids.data c_is_valid = is_valid.data - cdef int has_hidden = hasattr(vec2scores, 'W') + cdef int has_hidden = not getattr(vec2scores, 'is_noop', False) while not next_step.empty(): if not has_hidden: for i in cython.parallel.prange( @@ -414,7 +416,9 @@ cdef class Parser: free(scores) free(token_ids) - def update(self, docs_tokvecs, golds, drop=0., sgd=None): + def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None): + if losses is not None and self.name not in losses: + losses[self.name] = 0. 
docs, tokvec_lists = docs_tokvecs tokvecs = self.model[0].ops.flatten(tokvec_lists) if isinstance(docs, Doc) and isinstance(golds, GoldParse): @@ -422,27 +426,33 @@ cdef class Parser: golds = [golds] cuda_stream = get_cuda_stream() - golds = [self.moves.preprocess_gold(g) for g in golds] - states = self.moves.init_batch(docs) + states, golds, max_steps = self._init_gold_batch(docs, golds) state2vec, vec2scores = self.get_batch_model(len(states), tokvecs, cuda_stream, - drop) - + 0.0) todo = [(s, g) for (s, g) in zip(states, golds) if not s.is_final() and g is not None] + if not todo: + return None backprops = [] d_tokvecs = state2vec.ops.allocate(tokvecs.shape) cdef float loss = 0. - while len(todo) >= 3: + n_steps = 0 + while todo: states, golds = zip(*todo) token_ids = self.get_token_ids(states) - vector, bp_vector = state2vec.begin_update(token_ids, drop=drop) + vector, bp_vector = state2vec.begin_update(token_ids, drop=0.0) + if drop != 0: + mask = vec2scores.ops.get_dropout_mask(vector.shape, drop) + vector *= mask scores, bp_scores = vec2scores.begin_update(vector, drop=drop) d_scores = self.get_batch_loss(states, golds, scores) - d_vector = bp_scores(d_scores, sgd=sgd) + d_vector = bp_scores(d_scores / d_scores.shape[0], sgd=sgd) + if drop != 0: + d_vector *= mask if isinstance(self.model[0].ops, CupyOps) \ and not isinstance(token_ids, state2vec.ops.xp.ndarray): @@ -456,15 +466,51 @@ cdef class Parser: backprops.append((token_ids, d_vector, bp_vector)) self.transition_batch(states, scores) todo = [st for st in todo if not st[0].is_final()] - if len(backprops) >= 50: - self._make_updates(d_tokvecs, - backprops, sgd, cuda_stream) - backprops = [] - if backprops: - self._make_updates(d_tokvecs, - backprops, sgd, cuda_stream) + if losses is not None: + losses[self.name] += (d_scores**2).sum() + n_steps += 1 + if n_steps >= max_steps: + break + self._make_updates(d_tokvecs, + backprops, sgd, cuda_stream) return self.model[0].ops.unflatten(d_tokvecs, [len(d) for 
d in docs]) + def _init_gold_batch(self, whole_docs, whole_golds): + """Make a square batch, of length equal to the shortest doc. A long + doc will get multiple states. Let's say we have a doc of length 2*N, + where N is the shortest doc. We'll make two states, one representing + long_doc[:N], and another representing long_doc[N:].""" + cdef: + StateClass state + Transition action + whole_states = self.moves.init_batch(whole_docs) + max_length = max(5, min(50, min([len(doc) for doc in whole_docs]))) + max_moves = 0 + states = [] + golds = [] + for doc, state, gold in zip(whole_docs, whole_states, whole_golds): + gold = self.moves.preprocess_gold(gold) + if gold is None: + continue + oracle_actions = self.moves.get_oracle_sequence(doc, gold) + start = 0 + while start < len(doc): + state = state.copy() + n_moves = 0 + while state.B(0) < start and not state.is_final(): + action = self.moves.c[oracle_actions.pop(0)] + action.do(state.c, action.label) + n_moves += 1 + has_gold = self.moves.has_gold(gold, start=start, + end=start+max_length) + if not state.is_final() and has_gold: + states.append(state) + golds.append(gold) + max_moves = max(max_moves, n_moves) + start += min(max_length, len(doc)-start) + max_moves = max(max_moves, len(oracle_actions)) + return states, golds, max_moves + def _make_updates(self, d_tokvecs, backprops, sgd, cuda_stream=None): # Tells CUDA to block, so our async copies complete. 
if cuda_stream is not None: @@ -481,6 +527,14 @@ cdef class Parser: xp.add.at(d_tokvecs, ids, d_state_features * active_feats) + @property + def move_names(self): + names = [] + for i in range(self.moves.n_moves): + name = self.moves.move_name(self.moves.c[i].move, self.moves.c[i].label) + names.append(name) + return names + def get_batch_model(self, batch_size, tokvecs, stream, dropout): lower, upper = self.model state2vec = precompute_hiddens(batch_size, tokvecs, diff --git a/spacy/syntax/stateclass.pyx b/spacy/syntax/stateclass.pyx index fd38710e7..228a3ff91 100644 --- a/spacy/syntax/stateclass.pyx +++ b/spacy/syntax/stateclass.pyx @@ -41,6 +41,11 @@ cdef class StateClass: def is_final(self): return self.c.is_final() + def copy(self): + cdef StateClass new_state = StateClass.init(self.c._sent, self.c.length) + new_state.c.clone(self.c) + return new_state + def print_state(self, words): words = list(words) + ['_'] top = words[self.S(0)] + '_%d' % self.S_(0).head diff --git a/spacy/syntax/transition_system.pyx b/spacy/syntax/transition_system.pyx index d6750d09c..211b2c950 100644 --- a/spacy/syntax/transition_system.pyx +++ b/spacy/syntax/transition_system.pyx @@ -5,7 +5,7 @@ from __future__ import unicode_literals from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF from cymem.cymem cimport Pool from thinc.typedefs cimport weight_t -from collections import defaultdict +from collections import defaultdict, OrderedDict from ..structs cimport TokenC from .stateclass cimport StateClass @@ -26,7 +26,7 @@ cdef void* _init_state(Pool mem, int length, void* tokens) except NULL: cdef class TransitionSystem: - def __init__(self, StringStore string_table, dict labels_by_action): + def __init__(self, StringStore string_table, labels_by_action): self.mem = Pool() self.strings = string_table self.n_moves = 0 @@ -34,14 +34,14 @@ cdef class TransitionSystem: self.c = self.mem.alloc(self._size, sizeof(Transition)) - for action, label_strs in sorted(labels_by_action.items()): 
+ for action, label_strs in labels_by_action.items(): for label_str in label_strs: self.add_action(int(action), label_str) self.root_label = self.strings['ROOT'] self.init_beam_state = _init_state def __reduce__(self): - labels_by_action = {} + labels_by_action = OrderedDict() cdef Transition t for trans in self.c[:self.n_moves]: label_str = self.strings[trans.label] @@ -61,6 +61,29 @@ cdef class TransitionSystem: offset += len(doc) return states + def get_oracle_sequence(self, doc, GoldParse gold): + cdef Pool mem = Pool() + costs = mem.alloc(self.n_moves, sizeof(float)) + is_valid = mem.alloc(self.n_moves, sizeof(int)) + + cdef StateClass state = StateClass(doc, offset=0) + self.initialize_state(state.c) + history = [] + while not state.is_final(): + self.set_costs(is_valid, costs, state, gold) + for i in range(self.n_moves): + if is_valid[i] and costs[i] <= 0: + action = self.c[i] + history.append(i) + action.do(state.c, action.label) + break + else: + print(gold.words) + print(gold.ner) + print(history) + raise ValueError("Could not find gold move") + return history + cdef int initialize_state(self, StateC* state) nogil: pass @@ -92,11 +115,21 @@ cdef class TransitionSystem: StateClass stcls, GoldParse gold) except -1: cdef int i self.set_valid(is_valid, stcls.c) + cdef int n_gold = 0 for i in range(self.n_moves): if is_valid[i]: costs[i] = self.c[i].get_cost(stcls, &gold.c, self.c[i].label) + n_gold += costs[i] <= 0 else: costs[i] = 9000 + if n_gold <= 0: + print(gold.words) + print(gold.ner) + raise ValueError( + "Could not find a gold-standard action to supervise " + "the entity recognizer\n" + "The transition system has %d actions.\n" + "%s" % (self.n_moves)) def add_action(self, int action, label): if not isinstance(label, int): diff --git a/spacy/tests/tokenizer/test_exceptions.py b/spacy/tests/tokenizer/test_exceptions.py index aab27714e..70fb103dc 100644 --- a/spacy/tests/tokenizer/test_exceptions.py +++ b/spacy/tests/tokenizer/test_exceptions.py @@ 
-1,7 +1,4 @@ # coding: utf-8 -"""Test that tokenizer exceptions and emoticons are handled correctly.""" - - from __future__ import unicode_literals import pytest @@ -39,3 +36,12 @@ def test_tokenizer_handles_emoticons(tokenizer): def test_tokenizer_excludes_false_pos_emoticons(tokenizer, text, length): tokens = tokenizer(text) assert len(tokens) == length + + +@pytest.mark.parametrize('text,length', [('can you still dunk?🍕🍔😵LOL', 8), + ('i💙you', 3), ('🤘🤘yay!', 4)]) +def test_tokenizer_handles_emoji(tokenizer, text, length): + exceptions = ["hu"] + tokens = tokenizer(text) + if tokens[0].lang_ not in exceptions: + assert len(tokens) == length diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 0e4faafbe..611a68186 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -598,6 +598,24 @@ cdef class Doc: self.is_tagged = bool(TAG in attrs or POS in attrs) return self + def to_disk(self, path): + """Save the current state to a directory. + + path (unicode or Path): A path to a directory, which will be created if + it doesn't exist. Paths may be either strings or `Path`-like objects. + """ + raise NotImplementedError() + + def from_disk(self, path): + """Loads state from a directory. Modifies the object in place and + returns it. + + path (unicode or Path): A path to a directory. Paths may be either + strings or `Path`-like objects. + RETURNS (Doc): The modified `Doc` object. + """ + raise NotImplementedError() + def to_bytes(self): """Serialize, i.e. export the document contents to a binary string. diff --git a/spacy/util.py b/spacy/util.py index f27df54a8..25fe198f4 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -78,27 +78,86 @@ def ensure_path(path): return path -def resolve_model_path(name): - """Resolve a model name or string to a model path. +def load_model(name): + """Load a model from a shortcut link, package or data path. name (unicode): Package name, shortcut link or model path. - RETURNS (Path): Path to model data directory. 
+ RETURNS (Language): `Language` class with the loaded model. """ data_path = get_data_path() if not data_path or not data_path.exists(): raise IOError("Can't find spaCy data path: %s" % path2str(data_path)) if isinstance(name, basestring_): - if (data_path / name).exists(): # in data dir or shortcut link - return (data_path / name) - if is_package(name): # installed as a package - return get_model_package_path(name) - if Path(name).exists(): # path to model - return Path(name) - elif hasattr(name, 'exists'): # Path or Path-like object - return name + if (data_path / name).exists(): # in data dir or shortcut + return load_model_from_path(data_path / name) + if is_package(name): # installed as package + return load_model_from_pkg(name) + if Path(name).exists(): # path to model data directory + return load_data_from_path(Path(name)) + elif hasattr(name, 'exists'): # Path or Path-like to model data + return load_data_from_path(name) raise IOError("Can't find model '%s'" % name) +def load_model_from_init_py(init_file): + """Helper function to use in the `load()` method of a model package's + __init__.py. + + init_file (unicode): Path to model's __init__.py, i.e. `__file__`. + RETURNS (Language): `Language` class with loaded model. + """ + model_path = Path(init_file).parent + return load_data_from_path(model_path, package=True) + + +def load_model_from_path(model_path): + """Import and load a model package from its file path. + + path (unicode or Path): Path to package directory. + RETURNS (Language): `Language` class with loaded model. + """ + model_path = ensure_path(model_path) + spec = importlib.util.spec_from_file_location('model', model_path) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + return module.load() + + +def load_model_from_pkg(name): + """Import and load a model package. + + name (unicode): Name of model package installed via pip. + RETURNS (Language): `Language` class with loaded model. 
+ """ + module = importlib.import_module(name) + return module.load() + + +def load_data_from_path(model_path, package=False): + """Initialize a `Language` class with a loaded model from a model data path. + + model_path (unicode or Path): Path to model data directory. + package (bool): Does the path point to the parent package directory? + RETURNS (Language): `Language` class with loaded model. + """ + model_path = ensure_path(model_path) + meta_path = model_path / 'meta.json' + if not meta_path.is_file(): + raise IOError("Could not read meta.json from %s" % location) + meta = read_json(location) + for setting in ['lang', 'name', 'version']: + if setting not in meta: + raise IOError('No %s setting found in model meta.json' % setting) + if package: + model_data_path = '%s_%s-%s' % (meta['lang'], meta['name'], meta['version']) + model_path = model_path / model_data_path + if not model_path.exists(): + raise ValueError("Can't find model directory: %s" % path2str(model_path)) + cls = get_lang_class(meta['lang']) + nlp = cls(pipeline=meta.get('pipeline', True)) + return nlp.from_disk(model_path) + + def is_package(name): """Check if string maps to a package installed via pip. @@ -112,36 +171,16 @@ def is_package(name): return False -def get_model_package_path(package_name): - """Get path to a model package installed via pip. +def get_package_path(name): + """Get the path to an installed package. - package_name (unicode): Name of installed package. - RETURNS (Path): Path to model data directory. + name (unicode): Package name. + RETURNS (Path): Path to installed package. """ # Here we're importing the module just to find it. This is worryingly # indirect, but it's otherwise very difficult to find the package. - # Python's installation and import rules are very complicated. 
pkg = importlib.import_module(package_name) - package_path = Path(pkg.__file__).parent.parent - meta = parse_package_meta(package_path / package_name) - model_name = '%s-%s' % (package_name, meta['version']) - return package_path / package_name / model_name - - -def parse_package_meta(package_path, require=True): - """Check if a meta.json exists in a package and return its contents. - - package_path (Path): Path to model package directory. - require (bool): If True, raise error if no meta.json is found. - RETURNS (dict or None): Model meta.json data or None. - """ - location = package_path / 'meta.json' - if location.is_file(): - return read_json(location) - elif require: - raise IOError("Could not read meta.json from %s" % location) - else: - return None + return Path(pkg.__file__).parent def is_in_jupyter(): @@ -174,12 +213,16 @@ def get_async(stream, numpy_array): array.set(numpy_array, stream=stream) return array + def itershuffle(iterable, bufsize=1000): """Shuffle an iterator. This works by holding `bufsize` items back - and yielding them sometime later. Obviously, this is not unbiased -- + and yielding them sometime later. Obviously, this is not unbiased – but should be good enough for batching. Larger bufsize means less bias. - From https://gist.github.com/andres-erbsen/1307752 + + iterable (iterable): Iterator to shuffle. + bufsize (int): Items to hold back. + YIELDS (iterable): The shuffled iterator. """ iterable = iter(iterable) buf = [] @@ -313,10 +356,33 @@ def normalize_slice(length, start, stop, step=None): return start, stop -def check_renamed_kwargs(renamed, kwargs): - for old, new in renamed.items(): - if old in kwargs: - raise TypeError("Keyword argument %s now renamed to %s" % (old, new)) +def compounding(start, stop, compound): + """Yield an infinite series of compounding values. Each time the + generator is called, a value is produced by multiplying the previous + value by the compound rate. 
+ + EXAMPLE: + >>> sizes = compounding(1., 10., 1.5) + >>> assert next(sizes) == 1. + >>> assert next(sizes) == 1 * 1.5 + >>> assert next(sizes) == 1.5 * 1.5 + """ + def clip(value): + return max(value, stop) if (start>stop) else min(value, stop) + curr = float(start) + while True: + yield clip(curr) + curr *= compound + + +def decaying(start, stop, decay): + """Yield an infinite series of linearly decaying values.""" + def clip(value): + return max(value, stop) if (start>stop) else min(value, stop) + nr_upd = 1. + while True: + yield clip(start * 1./(1. + decay * nr_upd)) + nr_upd += 1 def read_json(location): diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index b6418bc43..52fd0b35f 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -44,8 +44,6 @@ cdef class Vocab: vice versa. RETURNS (Vocab): The newly constructed vocab object. """ - util.check_renamed_kwargs({'get_lex_attr': 'lex_attr_getters'}, deprecated_kwargs) - lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {} tag_map = tag_map if tag_map is not None else {} if lemmatizer in (None, True, False): diff --git a/website/_includes/_mixins-base.jade b/website/_includes/_mixins-base.jade index 5a7a535c9..80d63353d 100644 --- a/website/_includes/_mixins-base.jade +++ b/website/_includes/_mixins-base.jade @@ -37,15 +37,17 @@ mixin svg(file, name, width, height) size - [integer] icon width and height (default: 20) mixin icon(name, size) - +svg("icons", name, size || 20).o-icon&attributes(attributes) + - var size = size || 20 + +svg("icons", name, size).o-icon(style="min-width: #{size}px")&attributes(attributes) //- Pro/Con/Neutral icon icon - [string] "pro", "con" or "neutral" (default: "neutral") + size - [integer] icon size (optional) -mixin procon(icon) +mixin procon(icon, size) - colors = { pro: "green", con: "red", neutral: "yellow" } - +icon(icon)(class="u-color-#{colors[icon] || 'subtle'}" aria-label=icon)&attributes(attributes) + +icon(icon, size)(class="u-color-#{colors[icon] || 
'subtle'}" aria-label=icon)&attributes(attributes) //- Headlines Helper Mixin @@ -184,3 +186,14 @@ mixin landing-header() mixin landing-badge(url, graphic, alt, size) +a(url)(aria-label=alt title=alt).c-landing__badge +svg("graphics", graphic, size || 225) + + +//- Under construction (temporary) + Marks sections that still need to be completed for the v2.0 release. + +mixin under-construction() + +infobox("🚧 Under construction") + | This section is still being written and will be updated for the v2.0 + | release. Is there anything that you think should definitely be mentioned or + | explained here? Any examples you'd like to see? #[strong Let us know] + | on the #[+a(gh("spacy") + "/issues") v2.0 alpha thread] on GitHub! diff --git a/website/_includes/_mixins.jade b/website/_includes/_mixins.jade index f9960b71f..fc4d66841 100644 --- a/website/_includes/_mixins.jade +++ b/website/_includes/_mixins.jade @@ -103,9 +103,11 @@ mixin button(url, trusted, ...style) label - [string] aside title (optional or false for no label) language - [string] language for syntax highlighting (default: "python") supports basic relevant languages available for PrismJS + icon - [string] icon to display next to code block, mostly used for old/new + height - [integer] optional height to clip code block to -mixin code(label, language, icon) - pre.c-code-block.o-block(class="lang-#{(language || DEFAULT_SYNTAX)}" class=icon ? "c-code-block--has-icon" : "")&attributes(attributes) +mixin code(label, language, icon, height) + pre.c-code-block.o-block(class="lang-#{(language || DEFAULT_SYNTAX)}" class=icon ? "c-code-block--has-icon" : "" style=height ? 
"height: #{height}px" : "")&attributes(attributes) if label h4.u-text-label.u-text-label--dark=label @@ -176,7 +178,7 @@ mixin label() //- Tag mixin tag() - span.u-text-tag.u-text-tag--spaced(aria-hidden="true") + span.u-text-tag.u-text-tag--spaced(aria-hidden="true")&attributes(attributes) block @@ -190,6 +192,17 @@ mixin tag-model(...capabs) +help(intro + ext + ".").u-color-theme +//- "New" tag to label features new in a specific version + By using a separate mixin with a version ID, it becomes easy to quickly + enable/disable tags without having to modify the markup in the docs. + version - [string or integer] version number, without "v" prefix + +mixin tag-new(version) + - var version = (typeof version == 'number') ? version.toFixed(1) : version + +tag(data-tooltip="This feature is new and was introduced in spaCy v#{version}.") + | v#{version} + + //- List type - [string] "numbers", "letters", "roman" (bulleted list if none set) start - [integer] start number @@ -350,7 +363,22 @@ mixin pos-row(tag, pos, morph, desc) | #[code=m] +cell.u-text-small=desc + mixin dep-row(label, desc) +row +cell #[code=label] +cell=desc + + +//- Table rows for linguistic annotations + annots [array] - array of cell content + style [array] array of 1 (display as code) or 0 (display as text) + +mixin annotation-row(annots, style) + +row + for cell, i in annots + if style && style[i] + - cell = (typeof(cell) != 'boolean') ? cell : cell ? 'True' : 'False' + +cell #[code=cell] + else + +cell=cell diff --git a/website/_includes/_page-docs.jade b/website/_includes/_page-docs.jade index ec2751c4d..26b82381f 100644 --- a/website/_includes/_page-docs.jade +++ b/website/_includes/_page-docs.jade @@ -6,9 +6,17 @@ include _sidebar main.o-main.o-main--sidebar.o-main--aside article.o-content - +h(1)=title - if tag - +tag=tag + +grid.o-no-block + +grid-col(source ? 
"two-thirds" : "full") + +h(1)=title + if tag + +tag=tag + + if source + +grid-col("third").u-text-right + .o-inline-list + +button(gh("spacy", source), false, "secondary").u-text-tag Source #[+icon("code", 14)] + if ALPHA +infobox("⚠️ You are viewing the spaCy v2.0 alpha docs") diff --git a/website/assets/img/docs/architecture.svg b/website/assets/img/docs/architecture.svg index d62d08f88..c1d12d79b 100644 --- a/website/assets/img/docs/architecture.svg +++ b/website/assets/img/docs/architecture.svg @@ -1,128 +1,128 @@ - + - - Language - - + + Language + + - MAKES - - + MAKES + + - nlp.vocab.morphology - - Vocab - - + nlp.vocab.morphology + + Vocab + + - nlp.vocab - - StringStore - - + nlp.vocab + + StringStore + + - nlp.vocab.strings - - + nlp.vocab.strings + + - nlp.tokenizer.vocab - - Tokenizer - - + nlp.tokenizer.vocab + + Tokenizer + + - nlp.make_doc() - - + nlp.make_doc() + + - nlp.pipeline - - + nlp.pipeline + + - nlp.pipeline[i].vocab - - pt - - en - - de - - fr - - es - - it - - nl - - sv - - fi - - nb - - hu - - he - - bn - - ja - - zh - - - - + nlp.pipeline[i].vocab + + pt + + en + + de + + fr + + es + + it + + nl + + sv + + fi + + nb + + hu + + he + + bn + + ja + + zh + + + + - doc.vocab - - + doc.vocab + + - MAKES - - Doc - - + MAKES + + Doc + + - MAKES - - + MAKES + + - token.doc - - Token - - Span - - + token.doc + + Token + + Span + + - lexeme.vocab - - Lexeme - - + lexeme.vocab + + Lexeme + + - MAKES - - + MAKES + + - span.doc - - Dependency Parser - - Entity Recognizer - - Tagger - - Matcher - - Lemmatizer - - Morphology + span.doc + + Dependency Parser + + Entity Recognizer + + Tagger + + Matcher + + Lemmatizer + + Morphology diff --git a/website/assets/img/docs/language_data.svg b/website/assets/img/docs/language_data.svg index 4662d4c01..31e1a1b29 100644 --- a/website/assets/img/docs/language_data.svg +++ b/website/assets/img/docs/language_data.svg @@ -1,13 +1,13 @@ - Tokenizer + Tokenizer @@ -17,7 +17,7 @@ - Base data + Base data @@ -33,50 
+33,50 @@ - Language data + Language data - stop words + stop words - lexical attributes + lexical attributes - tokenizer exceptions + tokenizer exceptions - prefixes, suffixes, infixes + prefixes, suffixes, infixes - lemma data + lemma data - Lemmatizer + Lemmatizer - char classes + char classes - Token + Token - morph rules + morph rules - tag map + tag map - Morphology + Morphology diff --git a/website/assets/img/docs/pipeline.svg b/website/assets/img/docs/pipeline.svg new file mode 100644 index 000000000..8f9dc6dac --- /dev/null +++ b/website/assets/img/docs/pipeline.svg @@ -0,0 +1,30 @@ + + + + + Doc + + + + Text + + + + nlp + + tokenizer + + vectorizer + + + + tagger + + parser + + ner + diff --git a/website/assets/img/docs/tokenization.svg b/website/assets/img/docs/tokenization.svg new file mode 100644 index 000000000..f5b164725 --- /dev/null +++ b/website/assets/img/docs/tokenization.svg @@ -0,0 +1,123 @@ + + + + + “Let’s + + + go + + + to + + + N.Y.!” + + + + + + Let’s + + + go + + + to + + + N.Y.!” + + + + + Let + + + go + + + to + + + N.Y.!” + + + ’s + + + + + + Let + + + go + + + to + + + N.Y.! + + + ’s + + + + + + + + + Let + + + go + + + to + + + N.Y. + + + ’s + + + + + + ! + + + + Let + + go + + to + + N.Y. + + ’s + + + + ! 
+ + EXCEPTION + + PREFIX + + SUFFIX + + SUFFIX + + EXCEPTION + + DONE + diff --git a/website/assets/img/docs/vocab_stringstore.svg b/website/assets/img/docs/vocab_stringstore.svg new file mode 100644 index 000000000..644453737 --- /dev/null +++ b/website/assets/img/docs/vocab_stringstore.svg @@ -0,0 +1,77 @@ + + + + + 3572 + + Lexeme + + 508 + + Lexeme + + 949 + + Lexeme + + + "coffee" + + 3672 + + "I" + + 508 + + "love" + + 949 + + + + + nsubj + + + + dobj + + String + Store + + Vocab + + Doc + + love + VERB + + Token + + I + PRON + + Token + + coffee + NOUN + + Token + + + + + + + + + + + + + diff --git a/website/docs/api/_annotation/_dep-labels.jade b/website/docs/api/_annotation/_dep-labels.jade index 9e1e89324..427b2f53a 100644 --- a/website/docs/api/_annotation/_dep-labels.jade +++ b/website/docs/api/_annotation/_dep-labels.jade @@ -1,10 +1,5 @@ //- 💫 DOCS > API > ANNOTATION > DEPENDENCY LABELS -+infobox("Tip") - | In spaCy v1.8.3+, you can also use #[code spacy.explain()] to get the - | description for the string representation of a label. For example, - | #[code spacy.explain("prt")] will return "particle". - +h(3, "dependency-parsing-english") English dependency labels p diff --git a/website/docs/api/_annotation/_named-entities.jade b/website/docs/api/_annotation/_named-entities.jade index 68b3bd17d..476659d4a 100644 --- a/website/docs/api/_annotation/_named-entities.jade +++ b/website/docs/api/_annotation/_named-entities.jade @@ -1,10 +1,5 @@ //- 💫 DOCS > API > ANNOTATION > NAMED ENTITIES -+infobox("Tip") - | In spaCy v1.8.3+, you can also use #[code spacy.explain()] to get the - | description for the string representation of an entity label. For example, - | #[code spacy.explain("LANGUAGE")] will return "any named language". 
- +table([ "Type", "Description" ]) +row +cell #[code PERSON] diff --git a/website/docs/api/_annotation/_pos-tags.jade b/website/docs/api/_annotation/_pos-tags.jade index d3ceef777..ea3a225bf 100644 --- a/website/docs/api/_annotation/_pos-tags.jade +++ b/website/docs/api/_annotation/_pos-tags.jade @@ -1,10 +1,5 @@ //- 💫 DOCS > API > ANNOTATION > POS TAGS -+infobox("Tip") - | In spaCy v1.8.3+, you can also use #[code spacy.explain()] to get the - | description for the string representation of a tag. For example, - | #[code spacy.explain("RB")] will return "adverb". - +h(3, "pos-tagging-english") English part-of-speech tag scheme p diff --git a/website/docs/api/_data.json b/website/docs/api/_data.json index 443ee9a67..2af9bca1b 100644 --- a/website/docs/api/_data.json +++ b/website/docs/api/_data.json @@ -24,11 +24,11 @@ "Vocab": "vocab", "StringStore": "stringstore", "GoldParse": "goldparse", - "GoldCorpus": "goldcorpus" + "GoldCorpus": "goldcorpus", + "Binder": "binder" }, "Other": { - "Annotation Specs": "annotation", - "Feature Scheme": "features" + "Annotation Specs": "annotation" } }, @@ -48,62 +48,74 @@ "spacy": { "title": "spaCy top-level functions", + "source": "spacy/__init__.py", "next": "displacy" }, "displacy": { "title": "displaCy", "tag": "module", + "source": "spacy/displacy", "next": "util" }, "util": { "title": "Utility Functions", + "source": "spacy/util.py", "next": "cli" }, "cli": { - "title": "Command Line Interface" + "title": "Command Line Interface", + "source": "spacy/cli" }, "language": { "title": "Language", - "tag": "class" + "tag": "class", + "source": "spacy/language.py" }, "doc": { "title": "Doc", - "tag": "class" + "tag": "class", + "source": "spacy/tokens/doc.pyx" }, "token": { "title": "Token", - "tag": "class" + "tag": "class", + "source": "spacy/tokens/token.pyx" }, "span": { "title": "Span", - "tag": "class" + "tag": "class", + "source": "spacy/tokens/span.pyx" }, "lexeme": { "title": "Lexeme", - "tag": "class" + "tag": "class", 
+ "source": "spacy/lexeme.pyx" }, "vocab": { "title": "Vocab", - "tag": "class" + "tag": "class", + "source": "spacy/vocab.pyx" }, "stringstore": { "title": "StringStore", - "tag": "class" + "tag": "class", + "source": "spacy/strings.pyx" }, "matcher": { "title": "Matcher", - "tag": "class" + "tag": "class", + "source": "spacy/matcher.pyx" }, "dependenyparser": { @@ -123,7 +135,8 @@ "tokenizer": { "title": "Tokenizer", - "tag": "class" + "tag": "class", + "source": "spacy/tokenizer.pyx" }, "tagger": { @@ -133,19 +146,23 @@ "goldparse": { "title": "GoldParse", - "tag": "class" + "tag": "class", + "source": "spacy/gold.pyx" }, "goldcorpus": { "title": "GoldCorpus", - "tag": "class" + "tag": "class", + "source": "spacy/gold.pyx" + }, + + "binder": { + "title": "Binder", + "tag": "class", + "source": "spacy/tokens/binder.pyx" }, "annotation": { "title": "Annotation Specifications" - }, - - "features": { - "title": "Linear Model Feature Scheme" } } diff --git a/website/docs/api/annotation.jade b/website/docs/api/annotation.jade index bc723b5c6..048e69897 100644 --- a/website/docs/api/annotation.jade +++ b/website/docs/api/annotation.jade @@ -14,11 +14,12 @@ p | (#[code ' ']) is included as a token. +aside-code("Example"). - from spacy.en import English - nlp = English(parser=False) + from spacy.lang.en import English + nlp = English() tokens = nlp('Some\nspaces and\ttab characters') - print([t.orth_ for t in tokens]) - # ['Some', '\n', 'spaces', ' ', 'and', '\t', 'tab', 'characters'] + tokens_text = [t.text for t in tokens] + assert tokens_text == ['Some', '\n', 'spaces', ' ', 'and', + '\t', 'tab', 'characters'] p | The whitespace tokens are useful for much the same reason punctuation is @@ -38,6 +39,11 @@ p +h(2, "pos-tagging") Part-of-speech Tagging ++aside("Tip: Understanding tags") + | You can also use #[code spacy.explain()] to get the description for the + | string representation of a tag. For example, + | #[code spacy.explain("RB")] will return "adverb".
+ include _annotation/_pos-tags +h(2, "lemmatization") Lemmatization @@ -50,25 +56,35 @@ p A "lemma" is the uninflected form of a word. In English, this means: +item #[strong Nouns]: The form like "dog", not "dogs"; like "child", not "children" +item #[strong Verbs]: The form like "write", not "writes", "writing", "wrote" or "written" -+aside("About spaCy's custom pronoun lemma") - | Unlike verbs and common nouns, there's no clear base form of a personal - | pronoun. Should the lemma of "me" be "I", or should we normalize person - | as well, giving "it" — or maybe "he"? spaCy's solution is to introduce a - | novel symbol, #[code.u-nowrap -PRON-], which is used as the lemma for - | all personal pronouns. - p | The lemmatization data is taken from | #[+a("https://wordnet.princeton.edu") WordNet]. However, we also add a | special case for pronouns: all pronouns are lemmatized to the special | token #[code -PRON-]. ++infobox("About spaCy's custom pronoun lemma") + | Unlike verbs and common nouns, there's no clear base form of a personal + | pronoun. Should the lemma of "me" be "I", or should we normalize person + | as well, giving "it" — or maybe "he"? spaCy's solution is to introduce a + | novel symbol, #[code -PRON-], which is used as the lemma for + | all personal pronouns. + +h(2, "dependency-parsing") Syntactic Dependency Parsing ++aside("Tip: Understanding labels") + | You can also use #[code spacy.explain()] to get the description for the + | string representation of a label. For example, + | #[code spacy.explain("prt")] will return "particle". + include _annotation/_dep-labels +h(2, "named-entities") Named Entity Recognition ++aside("Tip: Understanding entity types") + | You can also use #[code spacy.explain()] to get the description for the + | string representation of an entity label. For example, + | #[code spacy.explain("LANGUAGE")] will return "any named language". 
+ include _annotation/_named-entities +h(3, "biluo") BILUO Scheme diff --git a/website/docs/api/binder.jade b/website/docs/api/binder.jade new file mode 100644 index 000000000..5e3e7d36c --- /dev/null +++ b/website/docs/api/binder.jade @@ -0,0 +1,5 @@ +//- 💫 DOCS > API > BINDER + +include ../../_includes/_mixins + ++under-construction diff --git a/website/docs/api/cli.jade b/website/docs/api/cli.jade index b78d4b7c9..a0acf3e9a 100644 --- a/website/docs/api/cli.jade +++ b/website/docs/api/cli.jade @@ -166,7 +166,7 @@ p | #[+a("/docs/api/annotation#json-input") JSON format]. +code(false, "bash"). - python -m spacy train [lang] [output_dir] [train_data] [dev_data] [--n-iter] [--parser-L1] [--no-tagger] [--no-parser] [--no-ner] + python -m spacy train [lang] [output_dir] [train_data] [dev_data] [--n-iter] [--n-sents] [--use-gpu] [--no-tagger] [--no-parser] [--no-entities] +table(["Argument", "Type", "Description"]) +row @@ -192,18 +192,13 @@ p +row +cell #[code --n-iter], #[code -n] +cell option - +cell Number of iterations (default: #[code 15]). + +cell Number of iterations (default: #[code 20]). +row - +cell #[code --n_sents], #[code -ns] + +cell #[code --n-sents], #[code -ns] +cell option +cell Number of sentences (default: #[code 0]). - +row - +cell #[code --parser-L1], #[code -L] - +cell option - +cell L1 regularization penalty for parser (default: #[code 0.0]). - +row +cell #[code --use-gpu], #[code -G] +cell flag @@ -220,7 +215,7 @@ p +cell Don't train parser. +row - +cell #[code --no-ner], #[code -N] + +cell #[code --no-entities], #[code -N] +cell flag +cell Don't train NER. @@ -229,6 +224,106 @@ p +cell flag +cell Show help message and available arguments. ++h(3, "train-hyperparams") Environment variables for hyperparameters + +p + | spaCy lets you set hyperparameters for training via environment variables. 
+ | This is useful, because it keeps the command simple and allows you to + | #[+a("https://askubuntu.com/questions/17536/how-do-i-create-a-permanent-bash-alias/17537#17537") create an alias] + | for your custom #[code train] command while still being able to easily + | tweak the hyperparameters. For example: + ++code(false, "bash"). + parser_hidden_depth=2 parser_maxout_pieces=1 train-parser + ++under-construction + ++table(["Name", "Description", "Default"]) + +row + +cell #[code dropout_from] + +cell + +cell #[code 0.2] + + +row + +cell #[code dropout_to] + +cell + +cell #[code 0.2] + + +row + +cell #[code dropout_decay] + +cell + +cell #[code 0.0] + + +row + +cell #[code batch_from] + +cell + +cell #[code 1] + + +row + +cell #[code batch_to] + +cell + +cell #[code 64] + + +row + +cell #[code batch_compound] + +cell + +cell #[code 1.001] + + +row + +cell #[code token_vector_width] + +cell + +cell #[code 128] + + +row + +cell #[code embed_size] + +cell + +cell #[code 7500] + + +row + +cell #[code parser_maxout_pieces] + +cell + +cell #[code 2] + + +row + +cell #[code parser_hidden_depth] + +cell + +cell #[code 1] + + +row + +cell #[code hidden_width] + +cell + +cell #[code 128] + + +row + +cell #[code learn_rate] + +cell + +cell #[code 0.001] + + +row + +cell #[code optimizer_B1] + +cell + +cell #[code 0.9] + + +row + +cell #[code optimizer_B2] + +cell + +cell #[code 0.999] + + +row + +cell #[code optimizer_eps] + +cell + +cell #[code 1e-08] + + +row + +cell #[code L2_penalty] + +cell + +cell #[code 1e-06] + + +row + +cell #[code grad_norm_clip] + +cell + +cell #[code 1.0] + +h(2, "package") Package p diff --git a/website/docs/api/displacy.jade b/website/docs/api/displacy.jade index a5352ade8..a96d8a397 100644 --- a/website/docs/api/displacy.jade +++ b/website/docs/api/displacy.jade @@ -10,6 +10,7 @@ p +h(2, "serve") displacy.serve +tag method + +tag-new(2) p | Serve a dependency parse tree or named entity visualization to view it @@ -71,6 +72,7 @@ p +h(2, 
"render") displacy.render +tag method + +tag-new(2) p Render a dependency parse tree or named entity visualization. diff --git a/website/docs/api/doc.jade b/website/docs/api/doc.jade index 6a9faf4b4..9b8392fcb 100644 --- a/website/docs/api/doc.jade +++ b/website/docs/api/doc.jade @@ -253,6 +253,47 @@ p +cell #[code Doc] +cell Itself. ++h(2, "to_disk") Doc.to_disk + +tag method + +tag-new(2) + +p Save the current state to a directory. + ++aside-code("Example"). + doc.to_disk('/path/to/doc') + ++table(["Name", "Type", "Description"]) + +row + +cell #[code path] + +cell unicode or #[code Path] + +cell + | A path to a directory, which will be created if it doesn't exist. + | Paths may be either strings or #[code Path]-like objects. + ++h(2, "from_disk") Doc.from_disk + +tag method + +tag-new(2) + +p Loads state from a directory. Modifies the object in place and returns it. + ++aside-code("Example"). + from spacy.tokens import Doc + from spacy.vocab import Vocab + doc = Doc(Vocab()).from_disk('/path/to/doc') + ++table(["Name", "Type", "Description"]) + +row + +cell #[code path] + +cell unicode or #[code Path] + +cell + | A path to a directory. Paths may be either strings or + | #[code Path]-like objects. + + +footrow + +cell returns + +cell #[code Doc] + +cell The modified #[code Doc] object. + +h(2, "to_bytes") Doc.to_bytes +tag method diff --git a/website/docs/api/features.jade b/website/docs/api/features.jade deleted file mode 100644 index 018790145..000000000 --- a/website/docs/api/features.jade +++ /dev/null @@ -1,138 +0,0 @@ -//- 💫 DOCS > API > LINEAR MOEL FEATURES - -include ../../_includes/_mixins - -p - | There are two popular strategies for putting together machine learning - | models for NLP: sparse linear models, and neural networks. To solve NLP - | problems with linear models, feature templates need to be assembled that - | combine multiple atomic predictors. 
This page documents the atomic - | predictors used in the spaCy 1.0 #[+api("parser") #[code Parser]], - | #[+api("tagger") #[code Tagger]] and - | #[+api("entityrecognizer") #[code EntityRecognizer]]. - -p - | To understand the scheme, recall that spaCy's #[code Parser] and - | #[code EntityRecognizer] are implemented as push-down automata. They - | maintain a "stack" that holds the current entity, and a "buffer" - | consisting of the words to be processed. - -p - | Each state consists of the words on the stack (if any), which consistute - | the current entity being constructed. We also have the current word, and - | the two subsequent words. Finally, we also have the entities previously - | built. - -p - | This gives us a number of tokens to ask questions about, to make the - | features. About each of these tokens, we can ask about a number of - | different properties. Each feature identifier asks about a specific - | property of a specific token of the context. - -+h(2, "tokens") Context tokens - -+table([ "ID", "Description" ]) - +row - +cell #[code S0] - +cell - | The first word on the stack, i.e. the token most recently added - | to the current entity. - - +row - +cell #[code S1] - +cell The second word on the stack, i.e. the second most recently added. - - +row - +cell #[code S2] - +cell The third word on the stack, i.e. the third most recently added. - - +row - +cell #[code N0] - +cell The first word of the buffer, i.e. the current word being tagged. - - +row - +cell #[code N1] - +cell The second word of the buffer. - - +row - +cell #[code N2] - +cell The third word of the buffer. - - +row - +cell #[code P1] - +cell The word immediately before #[code N0]. - - +row - +cell #[code P2] - +cell The second word before #[code N0]. - - +row - +cell #[code E0] - +cell The first word of the previously constructed entity. - - +row - +cell #[code E1] - +cell The first word of the second previously constructed entity. 
- -p About each of these tokens, we can ask: - -+table([ "ID", "Attribute", "Description" ]) - +row - +cell #[code N0w] - +cell #[code token.orth] - +cell The word form. - - +row - +cell #[code N0W] - +cell #[code token.lemma] - +cell The word's lemma. - - +row - +cell #[code N0p] - +cell #[code token.tag] - +cell The word's (full) POS tag. - - +row - +cell #[code N0c] - +cell #[code token.cluster] - +cell The word's (full) Brown cluster. - - +row - +cell #[code N0c4] - +cell - - +cell First four digit prefix of the word's Brown cluster. - - +row - +cell #[code N0c6] - +cell - - +cell First six digit prefix of the word's Brown cluster. - - +row - +cell #[code N0L] - +cell - - +cell The word's dependency label. Not used as a feature in the NER. - - +row - +cell #[code N0_prefix] - +cell #[code token.prefix] - +cell The first three characters of the word. - - +row - +cell #[code N0_suffix] - +cell #[code token.suffix] - +cell The last three characters of the word. - - +row - +cell #[code N0_shape] - +cell #[code token.shape] - +cell The word's shape, i.e. is it alphabetic, numeric, etc. - - +row - +cell #[code N0_ne_iob] - +cell #[code token.ent_iob] - +cell The Inside/Outside/Begin code of the word's NER tag. - - +row - +cell #[code N0_ne_type] - +cell #[code token.ent_type] - +cell The word's NER type. diff --git a/website/docs/api/goldcorpus.jade b/website/docs/api/goldcorpus.jade index bfff92ad5..3b3d92823 100644 --- a/website/docs/api/goldcorpus.jade +++ b/website/docs/api/goldcorpus.jade @@ -8,6 +8,7 @@ p +h(2, "init") GoldCorpus.__init__ +tag method + +tag-new(2) p Create a #[code GoldCorpus]. 
diff --git a/website/docs/api/language-models.jade b/website/docs/api/language-models.jade index 0990de358..74007f228 100644 --- a/website/docs/api/language-models.jade +++ b/website/docs/api/language-models.jade @@ -2,7 +2,10 @@ include ../../_includes/_mixins -p spaCy currently supports the following languages and capabilities: +p + | spaCy currently provides models for the following languages and + | capabilities: + +aside-code("Download language models", "bash"). python -m spacy download en @@ -22,12 +25,16 @@ p spaCy currently supports the following languages and capabilities: +row +cell French #[code fr] - each icon in [ "pro", "pro", "con", "pro", "con", "pro", "pro", "con" ] + each icon in [ "pro", "con", "con", "pro", "con", "pro", "pro", "con" ] +cell.u-text-center #[+procon(icon)] -+h(2, "available") Available models + +row + +cell Spanish #[code es] + each icon in [ "pro", "pro", "con", "pro", "pro", "pro", "pro", "con" ] + +cell.u-text-center #[+procon(icon)] -include ../usage/_models-list +p + +button("/docs/usage/models", true, "primary") See available models +h(2, "alpha-support") Alpha tokenization support @@ -52,9 +59,35 @@ p | #[+a("https://github.com/mocobeta/janome") Janome]. +table([ "Language", "Code", "Source" ]) - each language, code in { es: "Spanish", it: "Italian", pt: "Portuguese", nl: "Dutch", sv: "Swedish", fi: "Finnish", nb: "Norwegian Bokmål", da: "Danish", hu: "Hungarian", pl: "Polish", bn: "Bengali", he: "Hebrew", zh: "Chinese", ja: "Japanese" } + each language, code in { it: "Italian", pt: "Portuguese", nl: "Dutch", sv: "Swedish", fi: "Finnish", nb: "Norwegian Bokmål", da: "Danish", hu: "Hungarian", pl: "Polish", bn: "Bengali", he: "Hebrew", zh: "Chinese", ja: "Japanese" } +row +cell #{language} +cell #[code=code] +cell +src(gh("spaCy", "spacy/lang/" + code)) lang/#{code} + ++h(2, "multi-language") Multi-language support + +tag-new(2) + +p + | As of v2.0, spaCy supports models trained on more than one language. 
This + | is especially useful for named entity recognition. The language ID used + | for multi-language or language-neutral models is #[code xx]. The + | language class, a generic subclass containing only the base language data, + | can be found in #[+src(gh("spaCy", "spacy/lang/xx")) lang/xx]. + +p + | To load your model with the neutral, multi-language class, simply set + | #[code "language": "xx"] in your + | #[+a("/docs/usage/saving-loading#models-generating") model package]'s + | meta.json. You can also import the class directly, or call + | #[+api("util#get_lang_class") #[code util.get_lang_class()]] for + | lazy-loading. + ++code("Standard import"). + from spacy.lang.xx import MultiLanguage + nlp = MultiLanguage() + ++code("With lazy-loading"). + from spacy.util import get_lang_class + nlp = get_lang_class('xx')() diff --git a/website/docs/api/language.jade b/website/docs/api/language.jade index 455165bca..9e45a89d9 100644 --- a/website/docs/api/language.jade +++ b/website/docs/api/language.jade @@ -73,15 +73,26 @@ p +cell The text to be processed. +row - +cell #[code **disabled] - +cell - - +cell Elements of the pipeline that should not be run. + +cell #[code disable] + +cell list + +cell + | Names of pipeline components to + | #[+a("/docs/usage/language-processing-pipeline#disabling") disable]. +footrow +cell returns +cell #[code Doc] +cell A container for accessing the annotations. ++infobox("⚠️ Deprecation note") + .o-block + | Pipeline components to prevent from being loaded can now be added as + | a list to #[code disable], instead of specifying one keyword argument + | per component. + + +code-new doc = nlp(u"I don't want parsed", disable=['parser']) + +code-old doc = nlp(u"I don't want parsed", parse=False) + +h(2, "pipe") Language.pipe +tag method @@ -112,6 +123,13 @@ p +cell int +cell The number of texts to buffer.
+ +row + +cell #[code disable] + +cell list + +cell + | Names of pipeline components to + | #[+a("/docs/usage/language-processing-pipeline#disabling") disable]. + +footrow +cell yields +cell #[code Doc] @@ -227,8 +245,11 @@ p +h(2, "to_disk") Language.to_disk +tag method + +tag-new(2) -p Save the current state to a directory. +p + | Save the current state to a directory. If a model is loaded, this will + | #[strong include the model]. +aside-code("Example"). nlp.to_disk('/path/to/models') @@ -242,14 +263,21 @@ p Save the current state to a directory. | Paths may be either strings or #[code Path]-like objects. +row - +cell #[code **exclude] - +cell - - +cell Named attributes to prevent from being saved. + +cell #[code disable] + +cell list + +cell + | Names of pipeline components to + | #[+a("/docs/usage/language-processing-pipeline#disabling") disable] + | and prevent from being saved. +h(2, "from_disk") Language.from_disk +tag method + +tag-new(2) -p Loads state from a directory. Modifies the object in place and returns it. +p + | Loads state from a directory. Modifies the object in place and returns + | it. If the saved #[code Language] object contains a model, the + | #[strong model will be loaded]. +aside-code("Example"). from spacy.language import Language @@ -264,15 +292,28 @@ p Loads state from a directory. Modifies the object in place and returns it. | #[code Path]-like objects. +row - +cell #[code **exclude] - +cell - - +cell Named attributes to prevent from being loaded. + +cell #[code disable] + +cell list + +cell + | Names of pipeline components to + | #[+a("/docs/usage/language-processing-pipeline#disabling") disable]. +footrow +cell returns +cell #[code Language] +cell The modified #[code Language] object. ++infobox("⚠️ Deprecation note") + .o-block + | As of spaCy v2.0, the #[code save_to_directory] method has been + | renamed to #[code to_disk], to improve consistency across classes. 
+ | Pipeline components to prevent from being loaded can now be added as + | a list to #[code disable], instead of specifying one keyword argument + | per component. + + +code-new nlp = English().from_disk(disable=['tagger', 'ner']) + +code-old nlp = spacy.load('en', tagger=False, entity=False) + +h(2, "to_bytes") Language.to_bytes +tag method @@ -283,9 +324,12 @@ p Serialize the current state to a binary string. +table(["Name", "Type", "Description"]) +row - +cell #[code **exclude] - +cell - - +cell Named attributes to prevent from being serialized. + +cell #[code disable] + +cell list + +cell + | Names of pipeline components to + | #[+a("/docs/usage/language-processing-pipeline#disabling") disable] + | and prevent from being serialized. +footrow +cell returns @@ -310,15 +354,26 @@ p Load state from a binary string. +cell The data to load from. +row - +cell #[code **exclude] - +cell - - +cell Named attributes to prevent from being loaded. + +cell #[code disable] + +cell list + +cell + | Names of pipeline components to + | #[+a("/docs/usage/language-processing-pipeline#disabling") disable]. +footrow +cell returns +cell #[code Language] +cell The #[code Language] object. ++infobox("⚠️ Deprecation note") + .o-block + | Pipeline components to prevent from being loaded can now be added as + | a list to #[code disable], instead of specifying one keyword argument + | per component. + + +code-new nlp = English().from_bytes(bytes, disable=['tagger', 'ner']) + +code-old nlp = English().from_bytes('en', tagger=False, entity=False) + +h(2, "attributes") Attributes +table(["Name", "Type", "Description"]) @@ -327,6 +382,11 @@ p Load state from a binary string. +cell #[code Vocab] +cell A container for the lexical types. + +row + +cell #[code tokenizer] + +cell #[code Tokenizer] + +cell The tokenizer. 
+ +row +cell #[code make_doc] +cell #[code lambda text: Doc] diff --git a/website/docs/api/lexeme.jade b/website/docs/api/lexeme.jade index dba6fdf59..a0487be9b 100644 --- a/website/docs/api/lexeme.jade +++ b/website/docs/api/lexeme.jade @@ -212,62 +212,74 @@ p The L2 norm of the lexeme's vector representation. +row +cell #[code is_alpha] +cell bool - +cell Equivalent to #[code word.orth_.isalpha()]. + +cell + | Does the lexeme consist of alphabetic characters? Equivalent to + | #[code lexeme.text.isalpha()]. +row +cell #[code is_ascii] +cell bool - +cell Equivalent to #[code [any(ord(c) >= 128 for c in word.orth_)]]. + +cell + | Does the lexeme consist of ASCII characters? Equivalent to + | #[code not any(ord(c) >= 128 for c in lexeme.text)]. +row +cell #[code is_digit] +cell bool - +cell Equivalent to #[code word.orth_.isdigit()]. + +cell + | Does the lexeme consist of digits? Equivalent to + | #[code lexeme.text.isdigit()]. +row +cell #[code is_lower] +cell bool - +cell Equivalent to #[code word.orth_.islower()]. + +cell + | Is the lexeme in lowercase? Equivalent to + | #[code lexeme.text.islower()]. +row +cell #[code is_title] +cell bool - +cell Equivalent to #[code word.orth_.istitle()]. + +cell + | Is the lexeme in titlecase? Equivalent to + | #[code lexeme.text.istitle()]. +row +cell #[code is_punct] +cell bool - +cell Equivalent to #[code word.orth_.ispunct()]. + +cell Is the lexeme punctuation? +row +cell #[code is_space] +cell bool - +cell Equivalent to #[code word.orth_.isspace()]. + +cell + | Does the lexeme consist of whitespace characters? Equivalent to + | #[code lexeme.text.isspace()]. +row +cell #[code like_url] +cell bool - +cell Does the word resemble a URL? + +cell Does the lexeme resemble a URL? +row +cell #[code like_num] +cell bool - +cell Does the word represent a number? e.g. “10.9”, “10”, “ten”, etc. + +cell Does the lexeme represent a number? e.g. "10.9", "10", "ten", etc.
+row +cell #[code like_email] +cell bool - +cell Does the word resemble an email address? + +cell Does the lexeme resemble an email address? +row +cell #[code is_oov] +cell bool - +cell Is the word out-of-vocabulary? + +cell Is the lexeme out-of-vocabulary? +row +cell #[code is_stop] +cell bool - +cell Is the word part of a "stop list"? + +cell Is the lexeme part of a "stop list"? +row +cell #[code lang] diff --git a/website/docs/api/matcher.jade b/website/docs/api/matcher.jade index 5d0e8af95..e2972fdc0 100644 --- a/website/docs/api/matcher.jade +++ b/website/docs/api/matcher.jade @@ -5,13 +5,14 @@ include ../../_includes/_mixins p Match sequences of tokens, based on pattern rules. +infobox("⚠️ Deprecation note") - | As of spaCy 2.0, #[code Matcher.add_pattern] and #[code Matcher.add_entity] - | are deprecated and have been replaced with a simpler - | #[+api("matcher#add") #[code Matcher.add]] that lets you add a list of - | patterns and a callback for a given match ID. #[code Matcher.get_entity] - | is now called #[+api("matcher#get") #[code matcher.get]]. - | #[code Matcher.load] (not useful, as it didn't allow specifying callbacks), - | and #[code Matcher.has_entity] (now redundant) have been removed. + .o-block + | As of spaCy 2.0, #[code Matcher.add_pattern] and #[code Matcher.add_entity] + | are deprecated and have been replaced with a simpler + | #[+api("matcher#add") #[code Matcher.add]] that lets you add a list of + | patterns and a callback for a given match ID. #[code Matcher.get_entity] + | is now called #[+api("matcher#get") #[code matcher.get]]. + | #[code Matcher.load] (not useful, as it didn't allow specifying callbacks), + | and #[code Matcher.has_entity] (now redundant) have been removed. +h(2, "init") Matcher.__init__ +tag method @@ -56,17 +57,6 @@ p Find all token sequences matching the supplied patterns on the #[code Doc]. 
doc = nlp(u'hello world!') matches = matcher(doc) -+infobox("Important note") - | By default, the matcher #[strong does not perform any action] on matches, - | like tagging matched phrases with entity types. Instead, actions need to - | be specified when #[strong adding patterns or entities], by - | passing in a callback function as the #[code on_match] argument on - | #[+api("matcher#add") #[code add]]. This allows you to define custom - | actions per pattern within the same matcher. For example, you might only - | want to merge some entity types, and set custom flags for other matched - | patterns. For more details and examples, see the usage workflow on - | #[+a("/docs/usage/rule-based-matching") rule-based matching]. - +table(["Name", "Type", "Description"]) +row +cell #[code doc] @@ -81,6 +71,17 @@ p Find all token sequences matching the supplied patterns on the #[code Doc]. | matches. A match tuple describes a span #[code doc[start:end]]. | The #[code match_id] is the ID of the added match pattern. ++infobox("Important note") + | By default, the matcher #[strong does not perform any action] on matches, + | like tagging matched phrases with entity types. Instead, actions need to + | be specified when #[strong adding patterns or entities], by + | passing in a callback function as the #[code on_match] argument on + | #[+api("matcher#add") #[code add]]. This allows you to define custom + | actions per pattern within the same matcher. For example, you might only + | want to merge some entity types, and set custom flags for other matched + | patterns. For more details and examples, see the usage workflow on + | #[+a("/docs/usage/rule-based-matching") rule-based matching]. + +h(2, "pipe") Matcher.pipe +tag method @@ -118,6 +119,7 @@ p Match a stream of documents, yielding them in turn. +h(2, "len") Matcher.__len__ +tag method + +tag-new(2) p | Get the number of rules added to the matcher. 
Note that this only returns @@ -138,6 +140,7 @@ p +h(2, "contains") Matcher.__contains__ +tag method + +tag-new(2) p Check whether the matcher contains rules for a match ID. @@ -159,6 +162,7 @@ p Check whether the matcher contains rules for a match ID. +h(2, "add") Matcher.add +tag method + +tag-new(2) p | Add a rule to the matcher, consisting of an ID key, one or more patterns, and @@ -198,8 +202,23 @@ p | Match pattern. A pattern consists of a list of dicts, where each | dict describes a token. ++infobox("⚠️ Deprecation note") + .o-block + | As of spaCy 2.0, #[code Matcher.add_pattern] and #[code Matcher.add_entity] + | are deprecated and have been replaced with a simpler + | #[+api("matcher#add") #[code Matcher.add]] that lets you add a list of + | patterns and a callback for a given match ID. + + +code-new. + matcher.add('GoogleNow', merge_phrases, [{ORTH: 'Google'}, {ORTH: 'Now'}]) + + +code-old. + matcher.add_entity('GoogleNow', on_match=merge_phrases) + matcher.add_pattern('GoogleNow', [{ORTH: 'Google'}, {ORTH: 'Now'}]) + +h(2, "remove") Matcher.remove +tag method + +tag-new(2) p | Remove a rule from the matcher. A #[code KeyError] is raised if the match @@ -219,6 +238,7 @@ p +h(2, "get") Matcher.get +tag method + +tag-new(2) p | Retrieve the pattern stored for a key. Returns the rule as an diff --git a/website/docs/api/philosophy.jade b/website/docs/api/philosophy.jade deleted file mode 100644 index eda911045..000000000 --- a/website/docs/api/philosophy.jade +++ /dev/null @@ -1,14 +0,0 @@ -//- 💫 DOCS > API > PHILOSOPHY - -include ../../_includes/_mixins - -p Every product needs to know why it exists. Here's what we're trying to with spaCy and why it's different from other NLP libraries. - -+h(2) 1. No job too big. -p Most programs get cheaper to run over time, but NLP programs often get more expensive. The data often grows faster than the hardware improves. 
For web-scale tasks, Moore's law can't save us — so if we want to read the web, we have to sweat performance. - -+h(2) 2. Take a stand. -p Most NLP toolkits position themselves as platforms, rather than libraries. They offer a pluggable architecture, and leave it to the user to arrange the components they offer into a useful system. This is fine for researchers, but for production users, this does too little. Components go out of date quickly, and configuring a good system takes very detailed knowledge. Compatibility problems can be extremely subtle. spaCy is therefore extremely opinionated. The API does not expose any algorithmic details. You're free to configure another pipeline, but the core library eliminates redundancy, and only offers one choice of each component. - -+h(2) 3. Stay current. -p There's often significant improvement in NLP models year-on-year. This has been especially true recently, given the success of deep learning models. With spaCy, you should be able to build things you couldn't build yesterday. To deliver on that promise, we need to be giving you the latest stuff. diff --git a/website/docs/api/spacy.jade b/website/docs/api/spacy.jade index da8c97b9c..a45307378 100644 --- a/website/docs/api/spacy.jade +++ b/website/docs/api/spacy.jade @@ -11,8 +11,13 @@ p | the name of an installed | #[+a("/docs/usage/saving-loading#generating") model package], a unicode | path or a #[code Path]-like object. spaCy will try resolving the load - | argument in this order. The #[code Language] class to initialise will be - | determined based on the model's settings. + | argument in this order. If a model is loaded from a shortcut link or + | package name, spaCy will assume it's a Python package and import it and + | call the model's own #[code load()] method. If a model is loaded from a + | path, spaCy will assume it's a data directory, read the language and + | pipeline settings off the meta.json and initialise the #[code Language] + | class. 
The data will be loaded in via + | #[+api("language#from_disk") #[code Language.from_disk()]]. +aside-code("Example"). nlp = spacy.load('en') # shortcut link @@ -20,12 +25,7 @@ p nlp = spacy.load('/path/to/en') # unicode path nlp = spacy.load(Path('/path/to/en')) # pathlib Path -+infobox("⚠️ Deprecation note") - | As of spaCy 2.0, the #[code path] keyword argument is deprecated. spaCy - | will also raise an error if no model could be loaded and never just - | return an empty #[code Language] object. If you need a blank language, - | you need to import it explicitly (#[code from spacy.lang.en import English]) - | or use #[+api("util#get_lang_class") #[code util.get_lang_class]]. + nlp = spacy.load('en', disable=['parser', 'tagger']) +table(["Name", "Type", "Description"]) +row @@ -33,11 +33,29 @@ p +cell unicode or #[code Path] +cell Model to load, i.e. shortcut link, package name or path. + +row + +cell #[code disable] + +cell list + +cell + | Names of pipeline components to + | #[+a("/docs/usage/language-processing-pipeline#disabling") disable]. + +footrow +cell returns +cell #[code Language] +cell A #[code Language] object with the loaded model. ++infobox("⚠️ Deprecation note") + .o-block + | As of spaCy 2.0, the #[code path] keyword argument is deprecated. spaCy + | will also raise an error if no model could be loaded and never just + | return an empty #[code Language] object. If you need a blank language, + | you need to import it explicitly (#[code from spacy.lang.en import English]) + | or use #[+api("util#get_lang_class") #[code util.get_lang_class]]. + + +code-new nlp = spacy.load('/model') + +code-old nlp = spacy.load('en', path='/model') + +h(2, "info") spacy.info +tag function @@ -93,3 +111,37 @@ p +cell returns +cell unicode +cell The explanation, or #[code None] if not found in the glossary. 
+ ++h(2, "set_factory") spacy.set_factory + +tag function + +tag-new(2) + +p + | Set a factory that returns a custom + | #[+a("/docs/usage/language-processing-pipeline") processing pipeline] + | component. Factories are useful for creating stateful components, especially ones which depend on shared data. + ++aside-code("Example"). + def my_factory(vocab): + def my_component(doc): + return doc + return my_component + + spacy.set_factory('my_factory', my_factory) + nlp = Language(pipeline=['my_factory']) + ++table(["Name", "Type", "Description"]) + +row + +cell #[code factory_id] + +cell unicode + +cell + | Unique name of factory. If added to a new pipeline, spaCy will + | look up the factory for this ID and use it to create the + | component. + + +row + +cell #[code factory] + +cell callable + +cell + | Callable that takes a #[code Vocab] object and returns a pipeline + | component. diff --git a/website/docs/api/stringstore.jade b/website/docs/api/stringstore.jade index 5f5912edd..f09352c79 100644 --- a/website/docs/api/stringstore.jade +++ b/website/docs/api/stringstore.jade @@ -104,6 +104,7 @@ p +h(2, "to_disk") StringStore.to_disk +tag method + +tag-new(2) p Save the current state to a directory. @@ -118,8 +119,9 @@ p Save the current state to a directory. | A path to a directory, which will be created if it doesn't exist. | Paths may be either strings or #[code Path]-like objects. -+h(2, "from_disk") Tokenizer.from_disk ++h(2, "from_disk") StringStore.from_disk +tag method + +tag-new(2) p Loads state from a directory. Modifies the object in place and returns it. @@ -137,10 +139,10 @@ p Loads state from a directory. Modifies the object in place and returns it. +footrow +cell returns - +cell #[code Tokenizer] - +cell The modified #[code Tokenizer] object. + +cell #[code StringStore] + +cell The modified #[code StringStore] object. 
-+h(2, "to_bytes") Tokenizer.to_bytes ++h(2, "to_bytes") StringStore.to_bytes +tag method p Serialize the current state to a binary string. @@ -157,9 +159,9 @@ p Serialize the current state to a binary string. +footrow +cell returns +cell bytes - +cell The serialized form of the #[code Tokenizer] object. + +cell The serialized form of the #[code StringStore] object. -+h(2, "from_bytes") Tokenizer.from_bytes ++h(2, "from_bytes") StringStore.from_bytes +tag method p Load state from a binary string. diff --git a/website/docs/api/token.jade b/website/docs/api/token.jade index 9be41081c..ee989047c 100644 --- a/website/docs/api/token.jade +++ b/website/docs/api/token.jade @@ -338,8 +338,10 @@ p The L2 norm of the token's vector representation. +cell #[code ent_iob] +cell int +cell - | IOB code of named entity tag. - | #[code 1="I", 2="O", 3="B"]. #[code 0] means no tag is assigned. + | IOB code of named entity tag. #[code "B"] + | means the token begins an entity, #[code "I"] means it is inside + | an entity, #[code "O"] means it is outside an entity, and + | #[code ""] means no entity tag is set. +row +cell #[code ent_iob_] @@ -368,116 +370,131 @@ p The L2 norm of the token's vector representation. +cell #[code lemma] +cell int +cell - | Base form of the word, with no inflectional suffixes. + | Base form of the token, with no inflectional suffixes. +row +cell #[code lemma_] +cell unicode - +cell Base form of the word, with no inflectional suffixes. + +cell Base form of the token, with no inflectional suffixes. +row +cell #[code lower] +cell int - +cell Lower-case form of the word. + +cell Lower-case form of the token. +row +cell #[code lower_] +cell unicode - +cell Lower-case form of the word. + +cell Lower-case form of the token. +row +cell #[code shape] +cell int - +cell Transform of the word's string, to show orthographic features. + +cell + | Transform of the token's string, to show orthographic features. + | For example, "Xxxx" or "dd".
+row +cell #[code shape_] +cell unicode - +cell A transform of the word's string, to show orthographic features. + | Transform of the token's string, to show orthographic features. + | For example, "Xxxx" or "dd". +row +cell #[code prefix] +cell int +cell Integer ID of a length-N substring from the start of the - | word. Defaults to #[code N=1]. + | token. Defaults to #[code N=1]. +row +cell #[code prefix_] +cell unicode +cell - | A length-N substring from the start of the word. Defaults to + | A length-N substring from the start of the token. Defaults to | #[code N=1]. +row +cell #[code suffix] +cell int +cell - | Length-N substring from the end of the word. Defaults to #[code N=3]. + | Length-N substring from the end of the token. Defaults to #[code N=3]. +row +cell #[code suffix_] +cell unicode - +cell Length-N substring from the end of the word. Defaults to #[code N=3]. + +cell Length-N substring from the end of the token. Defaults to #[code N=3]. +row +cell #[code is_alpha] +cell bool - +cell Equivalent to #[code word.orth_.isalpha()]. + +cell + | Does the token consist of alphabetic characters? Equivalent to + | #[code token.text.isalpha()]. +row +cell #[code is_ascii] +cell bool - +cell Equivalent to #[code [any(ord(c) >= 128 for c in word.orth_)]]. + +cell + | Does the token consist of ASCII characters? Equivalent to + | #[code not any(ord(c) >= 128 for c in token.text)]. +row +cell #[code is_digit] +cell bool - +cell Equivalent to #[code word.orth_.isdigit()]. + +cell + | Does the token consist of digits? Equivalent to + | #[code token.text.isdigit()]. +row +cell #[code is_lower] +cell bool - +cell Equivalent to #[code word.orth_.islower()]. + +cell + | Is the token in lowercase? Equivalent to + | #[code token.text.islower()]. +row +cell #[code is_title] +cell bool - +cell Equivalent to #[code word.orth_.istitle()]. + +cell + | Is the token in titlecase? Equivalent to + | #[code token.text.istitle()].
+row +cell #[code is_punct] +cell bool - +cell Equivalent to #[code word.orth_.ispunct()]. + +cell Is the token punctuation? +row +cell #[code is_space] +cell bool - +cell Equivalent to #[code word.orth_.isspace()]. + +cell + | Does the token consist of whitespace characters? Equivalent to + | #[code token.text.isspace()]. +row +cell #[code like_url] +cell bool - +cell Does the word resemble a URL? + +cell Does the token resemble a URL? +row +cell #[code like_num] +cell bool - +cell Does the word represent a number? e.g. “10.9”, “10”, “ten”, etc. + +cell Does the token represent a number? e.g. "10.9", "10", "ten", etc. +row +cell #[code like_email] +cell bool - +cell Does the word resemble an email address? + +cell Does the token resemble an email address? +row +cell #[code is_oov] +cell bool - +cell Is the word out-of-vocabulary? + +cell Is the token out-of-vocabulary? +row +cell #[code is_stop] +cell bool - +cell Is the word part of a "stop list"? + +cell Is the token part of a "stop list"? +row +cell #[code pos] diff --git a/website/docs/api/tokenizer.jade b/website/docs/api/tokenizer.jade index 87929e91b..8d933f75b 100644 --- a/website/docs/api/tokenizer.jade +++ b/website/docs/api/tokenizer.jade @@ -198,91 +198,6 @@ p | attributes. The #[code ORTH] fields of the attributes must | exactly match the string when they are concatenated. -+h(2, "to_disk") Tokenizer.to_disk - +tag method - -p Save the current state to a directory. - -+aside-code("Example"). - tokenizer.to_disk('/path/to/tokenizer') - -+table(["Name", "Type", "Description"]) - +row - +cell #[code path] - +cell unicode or #[code Path] - +cell - | A path to a directory, which will be created if it doesn't exist. - | Paths may be either strings or #[code Path]-like objects. - -+h(2, "from_disk") Tokenizer.from_disk - +tag method - -p Loads state from a directory. Modifies the object in place and returns it. - -+aside-code("Example"). 
- from spacy.tokenizer import Tokenizer - tokenizer = Tokenizer(nlp.vocab) - tokenizer = tokenizer.from_disk('/path/to/tokenizer') - -+table(["Name", "Type", "Description"]) - +row - +cell #[code path] - +cell unicode or #[code Path] - +cell - | A path to a directory. Paths may be either strings or - | #[code Path]-like objects. - - +footrow - +cell returns - +cell #[code Tokenizer] - +cell The modified #[code Tokenizer] object. - -+h(2, "to_bytes") Tokenizer.to_bytes - +tag method - -p Serialize the current state to a binary string. - -+aside-code("Example"). - tokenizer_bytes = tokenizer.to_bytes() - -+table(["Name", "Type", "Description"]) - +row - +cell #[code **exclude] - +cell - - +cell Named attributes to prevent from being serialized. - - +footrow - +cell returns - +cell bytes - +cell The serialized form of the #[code Tokenizer] object. - -+h(2, "from_bytes") Tokenizer.from_bytes - +tag method - -p Load state from a binary string. - -+aside-code("Example"). - fron spacy.tokenizer import Tokenizer - tokenizer_bytes = tokenizer.to_bytes() - new_tokenizer = Tokenizer(nlp.vocab) - new_tokenizer.from_bytes(tokenizer_bytes) - -+table(["Name", "Type", "Description"]) - +row - +cell #[code bytes_data] - +cell bytes - +cell The data to load from. - - +row - +cell #[code **exclude] - +cell - - +cell Named attributes to prevent from being loaded. - - +footrow - +cell returns - +cell #[code Tokenizer] - +cell The #[code Tokenizer] object. - +h(2, "attributes") Attributes +table(["Name", "Type", "Description"]) diff --git a/website/docs/api/util.jade b/website/docs/api/util.jade index ed8b5d8e5..3e132b7b4 100644 --- a/website/docs/api/util.jade +++ b/website/docs/api/util.jade @@ -1,12 +1,10 @@ -//- 💫 DOCS > API > ANNOTATION SPECS +//- 💫 DOCS > API > UTIL include ../../_includes/_mixins p | spaCy comes with a small collection of utility functions located in | #[+src(gh("spaCy", "spacy/util.py")) spacy/util.py]. 
- -+infobox("Important note") | Because utility functions are mostly intended for | #[strong internal use within spaCy], their behaviour may change with | future releases. The functions documented on this page should be safe @@ -74,14 +72,23 @@ p +cell #[code Language] +cell Language class. -+h(2, "resolve_model_path") util.resolve_model_path ++h(2, "load_model") util.load_model +tag function + +tag-new(2) -p Resolve a model name or string to a model path. +p + | Load a model from a shortcut link, package or data path. If called with a + | shortcut link or package name, spaCy will assume the model is a Python + | package and import and call its #[code load()] method. If called with a + | path, spaCy will assume it's a data directory, read the language and + | pipeline settings from the meta.json and initialise a #[code Language] + | class. The model data will then be loaded in via + | #[+api("language#from_disk") #[code Language.from_disk()]]. +aside-code("Example"). - model_path = util.resolve_model_path('en') - model_path = util.resolve_model_path('/path/to/en') + nlp = util.load_model('en') + nlp = util.load_model('en_core_web_sm') + nlp = util.load_model('/path/to/data') +table(["Name", "Type", "Description"]) +row @@ -91,8 +98,33 @@ p Resolve a model name or string to a model path. +footrow +cell returns - +cell #[code Path] - +cell Path to model data directory. + +cell #[code Language] + +cell #[code Language] class with the loaded model. + ++h(2, "load_model_from_init_py") util.load_model_from_init_py + +tag function + +tag-new(2) + +p + | A helper function to use in the #[code load()] method of a model package's + | #[+src(gh("spacy-dev-resources", "templates/model/en_model_name/__init__.py")) __init__.py]. + ++aside-code("Example"). 
+ from spacy.util import load_model_from_init_py + + def load(): + return load_model_from_init_py(__file__) + ++table(["Name", "Type", "Description"]) + +row + +cell #[code init_file] + +cell unicode + +cell Path to model's __init__.py, i.e. #[code __file__]. + + +footrow + +cell returns + +cell #[code Language] + +cell #[code Language] class with the loaded model. +h(2, "is_package") util.is_package +tag function @@ -116,16 +148,18 @@ p +cell #[code bool] +cell #[code True] if installed package, #[code False] if not. -+h(2, "get_model_package_path") util.get_model_package_path ++h(2, "get_package_path") util.get_package_path +tag function + +tag-new(2) p - | Get path to a #[+a("/docs/usage/models") model package] installed via pip. - | Currently imports the package to find it and parse its meta data. + | Get path to an installed package. Mainly used to resolve the location of + | #[+a("/docs/usage/models") model packages]. Currently imports the package + | to find its path. +aside-code("Example"). - util.get_model_package_path('en_core_web_sm') - # /usr/lib/python3.6/site-packages/en_core_web_sm/en_core_web_sm-1.2.0 + util.get_package_path('en_core_web_sm') + # /usr/lib/python3.6/site-packages/en_core_web_sm +table(["Name", "Type", "Description"]) +row @@ -136,39 +170,11 @@ p +footrow +cell returns +cell #[code Path] - +cell Path to model data directory. - -+h(2, "parse_package_meta") util.parse_package_meta - +tag function - -p - | Check if a #[code meta.json] exists in a model package and return its - | contents. - -+aside-code("Example"). - if util.is_package('en_core_web_sm'): - path = util.get_model_package_path('en_core_web_sm') - meta = util.parse_package_meta(path, require=True) - # {'name': 'core_web_sm', 'lang': 'en', ...} - -+table(["Name", "Type", "Description"]) - +row - +cell #[code package_path] - +cell #[code Path] +cell Path to model package directory. 
- +row - +cell #[code require] - +cell #[code bool] - +cell If #[code True], raise error if no #[code meta.json] is found. - - +footrow - +cell returns - +cell dict / #[code None] - +cell Model meta data or #[code None]. - +h(2, "is_in_jupyter") util.is_in_jupyter +tag function + +tag-new(2) p | Check if user is running spaCy from a #[+a("https://jupyter.org") Jupyter] @@ -221,11 +227,12 @@ p +h(2, "prints") util.prints +tag function + +tag-new(2) p | Print a formatted, text-wrapped message with optional title. If a text | argument is a #[code Path], it's converted to a string. Should only - | be used for interactive components like the #[+a("/docs/usage/cli") CLI]. + | be used for interactive components like the #[+api("cli") cli]. +aside-code("Example"). data_path = Path('/some/path') diff --git a/website/docs/api/vocab.jade b/website/docs/api/vocab.jade index bd18a17da..277fed5d3 100644 --- a/website/docs/api/vocab.jade +++ b/website/docs/api/vocab.jade @@ -159,6 +159,7 @@ p +h(2, "to_disk") Vocab.to_disk +tag method + +tag-new(2) p Save the current state to a directory. @@ -175,6 +176,7 @@ p Save the current state to a directory. +h(2, "from_disk") Vocab.from_disk +tag method + +tag-new(2) p Loads state from a directory. Modifies the object in place and returns it. 
diff --git a/website/docs/usage/_data.json b/website/docs/usage/_data.json index 8eca16a8c..59057b0bb 100644 --- a/website/docs/usage/_data.json +++ b/website/docs/usage/_data.json @@ -3,28 +3,25 @@ "Get started": { "Installation": "./", "Models": "models", + "spaCy 101": "spacy-101", "Lightning tour": "lightning-tour", - "Visualizers": "visualizers", - "Troubleshooting": "troubleshooting", "What's new in v2.0": "v2" }, - "Workflows": { - "spaCy 101": "spacy-101", - "Loading the pipeline": "language-processing-pipeline", - "Processing text": "processing-text", - "spaCy's data model": "data-model", + "Guides": { "POS tagging": "pos-tagging", "Using the parse": "dependency-parse", "Entity recognition": "entity-recognition", - "Custom pipelines": "customizing-pipeline", - "Rule-based matching": "rule-based-matching", "Word vectors": "word-vectors-similarities", - "Deep learning": "deep-learning", "Custom tokenization": "customizing-tokenizer", + "Rule-based matching": "rule-based-matching", "Adding languages": "adding-languages", + "Processing pipelines": "language-processing-pipeline", + "Deep learning": "deep-learning", + "Production use": "production-use", "Training": "training", "Training NER": "training-ner", - "Saving & loading": "saving-loading" + "Saving & loading": "saving-loading", + "Visualizers": "visualizers" }, "Examples": { "Tutorials": "tutorials", @@ -38,55 +35,33 @@ "quickstart": true }, - "v2": { - "title": "What's new in v2.0" - }, - "models": { "title": "Models", - "next": "lightning-tour", + "next": "spacy-101", "quickstart": true }, + "spacy-101": { + "title": "spaCy 101", + "next": "lightning-tour" + }, + "lightning-tour": { "title": "Lightning tour", - "next": "spacy-101" + "next": "v2" }, "visualizers": { "title": "Visualizers" }, - "troubleshooting": { - "title": "Troubleshooting", - "next": "resources" + "v2": { + "title": "What's new in v2.0" }, - "resources": { - "title": "Resources" - }, - - "spacy-101": { - "title": "spaCy 101" - }, - - 
"language-processing-pipeline": { - "title": "Loading a language processing pipeline", - "next": "processing-text" - }, - - "customizing-pipeline": { - "title": "Customizing the pipeline", - "next": "customizing-tokenizer" - }, - - "processing-text": { - "title": "Processing text", - "next": "data-model" - }, - - "data-model": { - "title": "Understanding spaCy's data model" + "pos-tagging": { + "title": "Part-of-speech tagging", + "next": "dependency-parse" }, "dependency-parse": { @@ -95,25 +70,43 @@ }, "entity-recognition": { - "title": "Entity recognition", + "title": "Named Entity Recognition", + "next": "training-ner" + }, + + "word-vectors-similarities": { + "title": "Using word vectors and semantic similarities", + "next": "customizing-tokenizer" + }, + + "customizing-tokenizer": { + "title": "Customising the tokenizer", "next": "rule-based-matching" }, "rule-based-matching": { - "title": "Rule-based matching" + "title": "Rule-based matching", + "next": "adding-languages" }, - "word-vectors-similarities": { - "title": "Using word vectors and semantic similarities" + "adding-languages": { + "title": "Adding languages", + "next": "training" + }, + + "language-processing-pipeline": { + "title": "Language processing pipelines", + "next": "deep-learning" }, "deep-learning": { - "title": "Hooking a deep learning model into spaCy" + "title": "Hooking a deep learning model into spaCy", + "next": "production-use" }, - "customizing-tokenizer": { - "title": "Customizing the tokenizer", - "next": "adding-languages" + "production-use": { + "title": "Production use", + "next": "training" }, "training": { @@ -127,17 +120,7 @@ }, "saving-loading": { - "title": "Saving and loading models" - }, - - "pos-tagging": { - "title": "Part-of-speech tagging", - "next": "dependency-parse" - }, - - "adding-languages": { - "title": "Adding languages", - "next": "training" + "title": "Saving, loading and data serialization" }, "showcase": { diff --git
a/website/docs/usage/_spacy-101/_named-entities.jade b/website/docs/usage/_spacy-101/_named-entities.jade new file mode 100644 index 000000000..a3c539564 --- /dev/null +++ b/website/docs/usage/_spacy-101/_named-entities.jade @@ -0,0 +1,38 @@ +//- 💫 DOCS > USAGE > SPACY 101 > NAMED ENTITIES + +p + | A named entity is a "real-world object" that's assigned a name – for + | example, a person, a country, a product or a book title. spaCy can + | #[strong recognise] #[+a("/docs/api/annotation#named-entities") various types] + | of named entities in a document, by asking the model for a + | #[strong prediction]. Because models are statistical and strongly depend + | on the examples they were trained on, this doesn't always work + | #[em perfectly] and might need some tuning later, depending on your use + | case. + +p + | Named entities are available as the #[code ents] property of a #[code Doc]: + ++code. + doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion') + + for ent in doc.ents: + print(ent.text, ent.start_char, ent.end_char, ent.label_) + ++aside + | #[strong Text]: The original entity text.#[br] + | #[strong Start]: Index of start of entity in the #[code Doc].#[br] + | #[strong End]: Index of end of entity in the #[code Doc].#[br] + | #[strong Label]: Entity label, i.e. type. + ++table(["Text", "Start", "End", "Label", "Description"]) + - var style = [0, 1, 1, 1, 0] + +annotation-row(["Apple", 0, 5, "ORG", "Companies, agencies, institutions."], style) + +annotation-row(["U.K.", 27, 31, "GPE", "Geopolitical entity, i.e. 
countries, cities, states."], style) + +annotation-row(["$1 billion", 44, 54, "MONEY", "Monetary values, including unit."], style) + +p + | Using spaCy's built-in #[+a("/docs/usage/visualizers") displaCy visualizer], + | here's what our example sentence and its named entities look like: + ++codepen("2f2ad1408ff79fc6a326ea3aedbb353b", 160) diff --git a/website/docs/usage/_spacy-101/_pipelines.jade b/website/docs/usage/_spacy-101/_pipelines.jade new file mode 100644 index 000000000..edf553805 --- /dev/null +++ b/website/docs/usage/_spacy-101/_pipelines.jade @@ -0,0 +1,60 @@ +//- 💫 DOCS > USAGE > SPACY 101 > PIPELINES + +p + | When you call #[code nlp] on a text, spaCy first tokenizes the text to + | produce a #[code Doc] object. The #[code Doc] is then processed in several + | different steps – this is also referred to as the + | #[strong processing pipeline]. The pipeline used by the + | #[+a("/docs/usage/models") default models] consists of a + | vectorizer, a tagger, a parser and an entity recognizer. Each pipeline + | component returns the processed #[code Doc], which is then passed on to + | the next component. + ++image + include ../../../assets/img/docs/pipeline.svg + .u-text-right + +button("/assets/img/docs/pipeline.svg", false, "secondary").u-text-tag View large graphic + ++aside + | #[strong Name:] ID of the pipeline component.#[br] + | #[strong Component:] spaCy's implementation of the component.#[br] + | #[strong Creates:] Objects, attributes and properties modified and set by + | the component. 
+ ++table(["Name", "Component", "Creates"]) + +row + +cell tokenizer + +cell #[+api("tokenizer") #[code Tokenizer]] + +cell #[code Doc] + + +row("divider") + +cell vectorizer + +cell #[code Vectorizer] + +cell #[code Doc.tensor] + + +row + +cell tagger + +cell #[+api("tagger") #[code Tagger]] + +cell #[code Doc[i].tag] + + +row + +cell parser + +cell #[+api("dependencyparser") #[code DependencyParser]] + +cell + | #[code Doc[i].head], #[code Doc[i].dep], #[code Doc.sents], + | #[code Doc.noun_chunks] + + +row + +cell ner + +cell #[+api("entityrecognizer") #[code EntityRecognizer]] + +cell #[code Doc.ents], #[code Doc[i].ent_iob], #[code Doc[i].ent_type] + +p + | The processing pipeline always #[strong depends on the statistical model] + | and its capabilities. For example, a pipeline can only include an entity + | recognizer component if the model includes data to make predictions of + | entity labels. This is why each model will specify the pipeline to use + | in its meta data, as a simple list containing the component names: + ++code(false, "json"). + "pipeline": ["vectorizer", "tagger", "parser", "ner"] diff --git a/website/docs/usage/_spacy-101/_pos-deps.jade b/website/docs/usage/_spacy-101/_pos-deps.jade new file mode 100644 index 000000000..b42847aee --- /dev/null +++ b/website/docs/usage/_spacy-101/_pos-deps.jade @@ -0,0 +1,62 @@ +//- 💫 DOCS > USAGE > SPACY 101 > POS TAGGING AND DEPENDENCY PARSING + +p + | After tokenization, spaCy can also #[strong parse] and #[strong tag] a + | given #[code Doc]. This is where the statistical model comes in, which + | enables spaCy to #[strong make a prediction] of which tag or label most + | likely applies in this context. A model consists of binary data and is + | produced by showing a system enough examples for it to make predictions + | that generalise across the language – for example, a word following "the" + | in English is most likely a noun. 
+ +p + | Linguistic annotations are available as + | #[+api("token#attributes") #[code Token] attributes]. Like many NLP + | libraries, spaCy #[strong encodes all strings to integers] to reduce + | memory usage and improve efficiency. So to get the readable string + | representation of an attribute, we need to add an underscore #[code _] + | to its name: + ++code. + doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion') + + for token in doc: + print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_, + token.shape_, token.is_alpha, token.is_stop) + ++aside + | #[strong Text:] The original word text.#[br] + | #[strong Lemma:] The base form of the word.#[br] + | #[strong POS:] The simple part-of-speech tag.#[br] + | #[strong Tag:] The detailed part-of-speech tag.#[br] + | #[strong Dep:] Syntactic dependency, i.e. the relation between tokens.#[br] + | #[strong Shape:] The word shape – capitalisation, punctuation, digits.#[br] + | #[strong is alpha:] Is the token an alpha character?#[br] + | #[strong is stop:] Is the token part of a stop list, i.e. 
the most common + | words of the language?#[br] + ++table(["Text", "Lemma", "POS", "Tag", "Dep", "Shape", "alpha", "stop"]) + - var style = [0, 0, 1, 1, 1, 1, 1, 1] + +annotation-row(["Apple", "apple", "PROPN", "NNP", "nsubj", "Xxxxx", true, false], style) + +annotation-row(["is", "be", "VERB", "VBZ", "aux", "xx", true, true], style) + +annotation-row(["looking", "look", "VERB", "VBG", "ROOT", "xxxx", true, false], style) + +annotation-row(["at", "at", "ADP", "IN", "prep", "xx", true, true], style) + +annotation-row(["buying", "buy", "VERB", "VBG", "pcomp", "xxxx", true, false], style) + +annotation-row(["U.K.", "u.k.", "PROPN", "NNP", "compound", "X.X.", false, false], style) + +annotation-row(["startup", "startup", "NOUN", "NN", "dobj", "xxxx", true, false], style) + +annotation-row(["for", "for", "ADP", "IN", "prep", "xxx", true, true], style) + +annotation-row(["$", "$", "SYM", "$", "quantmod", "$", false, false], style) + +annotation-row(["1", "1", "NUM", "CD", "compound", "d", false, false], style) + +annotation-row(["billion", "billion", "NUM", "CD", "pobj", "xxxx", true, false], style) + ++aside("Tip: Understanding tags and labels") + | Most of the tags and labels look pretty abstract, and they vary between + | languages. #[code spacy.explain()] will show you a short description – + | for example, #[code spacy.explain("VBZ")] returns "verb, 3rd person + | singular present". 
+ +p + | Using spaCy's built-in #[+a("/docs/usage/visualizers") displaCy visualizer], + | here's what our example sentence and its dependencies look like: + ++codepen("030d1e4dfa6256cad8fdd59e6aefecbe", 460) diff --git a/website/docs/usage/_spacy-101/_serialization.jade b/website/docs/usage/_spacy-101/_serialization.jade new file mode 100644 index 000000000..a763f422b --- /dev/null +++ b/website/docs/usage/_spacy-101/_serialization.jade @@ -0,0 +1,68 @@ +//- 💫 DOCS > USAGE > SPACY 101 > SERIALIZATION + +p + | If you've been modifying the pipeline, vocabulary, vectors and entities, or made + | updates to the model, you'll eventually want + | to #[strong save your progress] – for example, everything that's in your #[code nlp] + | object. This means you'll have to translate its contents and structure + | into a format that can be saved, like a file or a byte string. This + | process is called serialization. spaCy comes with + | #[strong built-in serialization methods] and supports the + | #[+a("http://www.diveintopython3.net/serializing.html#dump") Pickle protocol]. + ++aside("What's pickle?") + | Pickle is Python's built-in object persistence system. It lets you + | transfer arbitrary Python objects between processes. This is usually used + | to load an object to and from disk, but it's also used for distributed + | computing, e.g. with + | #[+a("https://spark.apache.org/docs/0.9.0/python-programming-guide.html") PySpark] + | or #[+a("http://dask.pydata.org/en/latest/") Dask]. When you unpickle an + | object, you're agreeing to execute whatever code it contains. It's like + | calling #[code eval()] on a string – so don't unpickle objects from + | untrusted sources. + +p + | All container classes, i.e.
#[+api("language") #[code Language]], + | #[+api("doc") #[code Doc]], #[+api("vocab") #[code Vocab]] and + | #[+api("stringstore") #[code StringStore]] have the following methods + | available: + ++table(["Method", "Returns", "Example"]) + - style = [1, 0, 1] + +annotation-row(["to_bytes", "bytes", "nlp.to_bytes()"], style) + +annotation-row(["from_bytes", "object", "nlp.from_bytes(bytes)"], style) + +annotation-row(["to_disk", "-", "nlp.to_disk('/path')"], style) + +annotation-row(["from_disk", "object", "nlp.from_disk('/path')"], style) + +p + | For example, if you've processed a very large document, you can use + | #[+api("doc#to_disk") #[code Doc.to_disk]] to save it to a file on your + | local machine. This will save the document and its tokens, as well as + | the vocabulary associated with the #[code Doc]. + ++aside("Why save the vocab?") + | Saving the vocabulary with the #[code Doc] is important, because the + | #[code Vocab] holds the context-independent information about the words, + | tags and labels, and their #[strong integer IDs]. If the #[code Vocab] + | wasn't saved with the #[code Doc], spaCy wouldn't know how to resolve + | those IDs – for example, the word text or the dependency labels. You + | might be saving #[code 446] for "whale", but in a different vocabulary, + | this ID could map to "VERB". Similarly, if your document was processed by + | a German model, its vocab will include the specific + | #[+a("/docs/api/annotation#dependency-parsing-german") German dependency labels]. + ++code. + moby_dick = open('moby_dick.txt', 'r').read() # read a large document + doc = nlp(moby_dick) # process it + doc.to_disk('/moby_dick.bin') # save the processed Doc + +p + | If you need it again later, you can load it back into an empty #[code Doc] + | with an empty #[code Vocab] by calling + | #[+api("doc#from_disk") #[code from_disk()]]: + ++code.
+ from spacy.tokens import Doc # to create empty Doc + from spacy.vocab import Vocab # to create empty Vocab + + doc = Doc(Vocab()).from_disk('/moby_dick.bin') # load processed Doc diff --git a/website/docs/usage/_spacy-101/_similarity.jade b/website/docs/usage/_spacy-101/_similarity.jade new file mode 100644 index 000000000..6eed1eb7f --- /dev/null +++ b/website/docs/usage/_spacy-101/_similarity.jade @@ -0,0 +1,44 @@ +//- 💫 DOCS > USAGE > SPACY 101 > SIMILARITY + +p + | spaCy is able to compare two objects, and make a prediction of + | #[strong how similar they are]. Predicting similarity is useful for + | building recommendation systems or flagging duplicates. For example, you + | can suggest a user content that's similar to what they're currently + | looking at, or label a support ticket as a duplicate if it's very + | similar to an already existing one. + +p + | Each #[code Doc], #[code Span] and #[code Token] comes with a + | #[+api("token#similarity") #[code .similarity()]] method that lets you + | compare it with another object, and determine the similarity. Of course + | similarity is always subjective – whether "dog" and "cat" are similar + | really depends on how you're looking at it. spaCy's similarity model + | usually assumes a pretty general-purpose definition of similarity. + ++code. + tokens = nlp(u'dog cat banana') + + for token1 in tokens: + for token2 in tokens: + print(token1.similarity(token2)) + ++aside + | #[strong #[+procon("neutral", 16)] similarity:] identical#[br] + | #[strong #[+procon("pro", 16)] similarity:] similar (higher is more similar) #[br] + | #[strong #[+procon("con", 16)] similarity:] dissimilar (lower is less similar) + ++table(["", "dog", "cat", "banana"]) + each cells, label in {"dog": [1.00, 0.80, 0.24], "cat": [0.80, 1.00, 0.28], "banana": [0.24, 0.28, 1.00]} + +row + +cell.u-text-label.u-color-theme=label + for cell in cells + +cell #[code=cell.toFixed(2)] + | #[+procon(cell < 0.5 ? "con" : cell != 1 ? 
"pro" : "neutral")] + +p + | In this case, the model's predictions are pretty on point. A dog is very + | similar to a cat, whereas a banana is not very similar to either of them. + | Identical tokens are obviously 100% similar to each other (just not always + | exactly #[code 1.0], because of vector math and floating point + | imprecisions). diff --git a/website/docs/usage/_spacy-101/_tokenization.jade b/website/docs/usage/_spacy-101/_tokenization.jade new file mode 100644 index 000000000..95a9cc520 --- /dev/null +++ b/website/docs/usage/_spacy-101/_tokenization.jade @@ -0,0 +1,62 @@ +//- 💫 DOCS > USAGE > SPACY 101 > TOKENIZATION + +p + | During processing, spaCy first #[strong tokenizes] the text, i.e. + | segments it into words, punctuation and so on. This is done by applying + | rules specific to each language. For example, punctuation at the end of a + | sentence should be split off – whereas "U.K." should remain one token. + | Each #[code Doc] consists of individual tokens, and we can simply iterate + | over them: + ++code. + for token in doc: + print(token.text) + ++table([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).u-text-center + +row + for cell in ["Apple", "is", "looking", "at", "buying", "U.K.", "startup", "for", "$", "1", "billion"] + +cell=cell + +p + | First, the raw text is split on whitespace characters, similar to + | #[code text.split(' ')]. Then, the tokenizer processes the text from + | left to right. On each substring, it performs two checks: + ++list("numbers") + +item + | #[strong Does the substring match a tokenizer exception rule?] For + | example, "don't" does not contain whitespace, but should be split + | into two tokens, "do" and "n't", while "U.K." should always + | remain one token. + +item + | #[strong Can a prefix, suffix or infix be split off?] For example, + | punctuation like commas, periods, hyphens or quotes. 
+ +p + | If there's a match, the rule is applied and the tokenizer continues its + | loop, starting with the newly split substrings. This way, spaCy can split + | #[strong complex, nested tokens] like combinations of abbreviations and + | multiple punctuation marks. + ++aside + | #[strong Tokenizer exception:] Special-case rule to split a string into + | several tokens or prevent a token from being split when punctuation rules + | are applied.#[br] + | #[strong Prefix:] Character(s) at the beginning, e.g. + | #[code $], #[code (], #[code “], #[code ¿].#[br] + | #[strong Suffix:] Character(s) at the end, e.g. + | #[code km], #[code )], #[code ”], #[code !].#[br] + | #[strong Infix:] Character(s) in between, e.g. + | #[code -], #[code --], #[code /], #[code …].#[br] + ++image + include ../../../assets/img/docs/tokenization.svg + .u-text-right + +button("/assets/img/docs/tokenization.svg", false, "secondary").u-text-tag View large graphic + +p + | While punctuation rules are usually pretty general, tokenizer exceptions + | strongly depend on the specifics of the individual language. This is + | why each #[+a("/docs/api/language-models") available language] has its + | own subclass like #[code English] or #[code German], that loads in lists + | of hard-coded data and exception rules. 
diff --git a/website/docs/usage/_spacy-101/_training.jade b/website/docs/usage/_spacy-101/_training.jade new file mode 100644 index 000000000..f4a0c7194 --- /dev/null +++ b/website/docs/usage/_spacy-101/_training.jade @@ -0,0 +1,3 @@ +//- 💫 DOCS > USAGE > SPACY 101 > TRAINING + ++under-construction diff --git a/website/docs/usage/_spacy-101/_vocab-stringstore.jade b/website/docs/usage/_spacy-101/_vocab-stringstore.jade new file mode 100644 index 000000000..dd300b5b9 --- /dev/null +++ b/website/docs/usage/_spacy-101/_vocab-stringstore.jade @@ -0,0 +1,94 @@ +//- 💫 DOCS > USAGE > SPACY 101 > VOCAB & STRINGSTORE + +p + | Whenever possible, spaCy tries to store data in a vocabulary, the + | #[+api("vocab") #[code Vocab]], that will be + | #[strong shared by multiple documents]. To save memory, spaCy also + | encodes all strings to #[strong integer IDs] – in this case for example, + | "coffee" has the ID #[code 3572]. Entity labels like "ORG" and + | part-of-speech tags like "VERB" are also encoded. Internally, spaCy + | only "speaks" in integer IDs. + ++aside + | #[strong Token]: A word, punctuation mark etc. #[em in context], including + | its attributes, tags and dependencies.#[br] + | #[strong Lexeme]: A "word type" with no context. Includes the word shape + | and flags, e.g. if it's lowercase, a digit or punctuation.#[br] + | #[strong Doc]: A processed container of tokens in context.#[br] + | #[strong Vocab]: The collection of lexemes.#[br] + | #[strong StringStore]: The dictionary mapping integer IDs to strings, for + | example #[code 3572] → "coffee". + ++image + include ../../../assets/img/docs/vocab_stringstore.svg + .u-text-right + +button("/assets/img/docs/vocab_stringstore.svg", false, "secondary").u-text-tag View large graphic + +p + | If you process lots of documents containing the word "coffee" in all + | kinds of different contexts, storing the exact string "coffee" every time + | would take up way too much space. 
So instead, spaCy assigns it an ID + | and stores it in the #[+api("stringstore") #[code StringStore]]. You can + | think of the #[code StringStore] as a + | #[strong lookup table that works in both directions] – you can look up a + | string to get its ID, or an ID to get its string: + ++code. + doc = nlp(u'I love coffee') + assert doc.vocab.strings[u'coffee'] == 3572 + assert doc.vocab.strings[3572] == u'coffee' + +p + | Now that all strings are encoded, the entries in the vocabulary + | #[strong don't need to include the word text] themselves. Instead, + | they can look it up in the #[code StringStore] via its integer ID. Each + | entry in the vocabulary, also called #[+api("lexeme") #[code Lexeme]], + | contains the #[strong context-independent] information about a word. + | For example, no matter if "love" is used as a verb or a noun in some + | context, its spelling and whether it consists of alphabetic characters + | won't ever change. + ++code. + for word in doc: + lexeme = doc.vocab[word.text] + print(lexeme.text, lexeme.orth, lexeme.shape_, lexeme.prefix_, lexeme.suffix_, + lexeme.is_alpha, lexeme.is_digit, lexeme.is_title, lexeme.lang_) + ++aside + | #[strong Text]: The original text of the lexeme.#[br] + | #[strong Orth]: The integer ID of the lexeme.#[br] + | #[strong Shape]: The abstract word shape of the lexeme.#[br] + | #[strong Prefix]: By default, the first letter of the word string.#[br] + | #[strong Suffix]: By default, the last three letters of the word string.#[br] + | #[strong is alpha]: Does the lexeme consist of alphabetic characters?#[br] + | #[strong is digit]: Does the lexeme consist of digits?#[br] + | #[strong is title]: Is the lexeme in titlecase?#[br] + | #[strong Lang]: The language of the parent vocabulary. 
+ ++table(["text", "orth", "shape", "prefix", "suffix", "is_alpha", "is_digit", "is_title", "lang"]) + - var style = [0, 1, 1, 0, 0, 1, 1, 1, 0] + +annotation-row(["I", 508, "X", "I", "I", true, false, true, "en"], style) + +annotation-row(["love", 949, "xxxx", "l", "ove", true, false, false, "en"], style) + +annotation-row(["coffee", 3572, "xxxx", "c", "fee", true, false, false, "en"], style) + +p + | The specific entries in the vocabulary and their IDs don't really matter – + | #[strong as long as they match]. That's why you always need to make sure + | all objects you create have access to the same vocabulary. If they don't, + | the IDs won't match and spaCy will either produce very confusing results, + | or fail altogether. + ++code. + from spacy.tokens import Doc + from spacy.vocab import Vocab + + doc = nlp(u'I like coffee') # original Doc + new_doc = Doc(Vocab(), words=['I', 'like', 'coffee']) # new Doc with empty Vocab + assert doc.vocab.strings[u'coffee'] == 3572 # ID in vocab of Doc + assert new_doc.vocab.strings[u'coffee'] == 446 # ID in vocab of new Doc + +p + | Even though both #[code Doc] objects contain the same words, the internal + | integer IDs are very different. The same applies for all other strings, + | like the annotation scheme. To avoid mismatched IDs, spaCy will always + | export the vocab if you save a #[code Doc] or #[code nlp] object. diff --git a/website/docs/usage/_spacy-101/_word-vectors.jade b/website/docs/usage/_spacy-101/_word-vectors.jade new file mode 100644 index 000000000..cbb9d06f2 --- /dev/null +++ b/website/docs/usage/_spacy-101/_word-vectors.jade @@ -0,0 +1,152 @@ +//- 💫 DOCS > USAGE > SPACY 101 > WORD VECTORS + +p + | Similarity is determined by comparing #[strong word vectors] or "word + | embeddings", multi-dimensional meaning representations of a word. Word + | vectors can be generated using an algorithm like + | #[+a("https://en.wikipedia.org/wiki/Word2vec") word2vec]. 
Most of spaCy's + | #[+a("/docs/usage/models") default models] come with + | #[strong 300-dimensional vectors] that look like this: + ++code("banana.vector", false, false, 250). + array([2.02280000e-01, -7.66180009e-02, 3.70319992e-01, + 3.28450017e-02, -4.19569999e-01, 7.20689967e-02, + -3.74760002e-01, 5.74599989e-02, -1.24009997e-02, + 5.29489994e-01, -5.23800015e-01, -1.97710007e-01, + -3.41470003e-01, 5.33169985e-01, -2.53309999e-02, + 1.73800007e-01, 1.67720005e-01, 8.39839995e-01, + 5.51070012e-02, 1.05470002e-01, 3.78719985e-01, + 2.42750004e-01, 1.47449998e-02, 5.59509993e-01, + 1.25210002e-01, -6.75960004e-01, 3.58420014e-01, + -4.00279984e-02, 9.59490016e-02, -5.06900012e-01, + -8.53179991e-02, 1.79800004e-01, 3.38669986e-01, + 1.32300004e-01, 3.10209990e-01, 2.18779996e-01, + 1.68530002e-01, 1.98740005e-01, -5.73849976e-01, + -1.06490001e-01, 2.66689986e-01, 1.28380001e-01, + -1.28030002e-01, -1.32839993e-01, 1.26570001e-01, + 8.67229998e-01, 9.67210010e-02, 4.83060002e-01, + 2.12709993e-01, -5.49900010e-02, -8.24249983e-02, + 2.24079996e-01, 2.39749998e-01, -6.22599982e-02, + 6.21940017e-01, -5.98999977e-01, 4.32009995e-01, + 2.81430006e-01, 3.38420011e-02, -4.88150001e-01, + -2.13589996e-01, 2.74010003e-01, 2.40950003e-01, + 4.59500015e-01, -1.86049998e-01, -1.04970002e+00, + -9.73049998e-02, -1.89080000e-01, -7.09290028e-01, + 4.01950002e-01, -1.87680006e-01, 5.16870022e-01, + 1.25200003e-01, 8.41499984e-01, 1.20970003e-01, + 8.82389992e-02, -2.91959997e-02, 1.21510006e-03, + 5.68250008e-02, -2.74210006e-01, 2.55640000e-01, + 6.97930008e-02, -2.22580001e-01, -3.60060006e-01, + -2.24020004e-01, -5.36990017e-02, 1.20220006e+00, + 5.45350015e-01, -5.79980016e-01, 1.09049998e-01, + 4.21669990e-01, 2.06619993e-01, 1.29360005e-01, + -4.14570011e-02, -6.67770028e-01, 4.04670000e-01, + -1.52179999e-02, -2.76400000e-01, -1.56110004e-01, + -7.91980028e-02, 4.00369987e-02, -1.29439995e-01, + -2.40900001e-04, -2.67850012e-01, -3.81150007e-01, + -9.72450018e-01, 
3.17259997e-01, -4.39509988e-01, + 4.19340014e-01, 1.83530003e-01, -1.52600005e-01, + -1.08080000e-01, -1.03579998e+00, 7.62170032e-02, + 1.65189996e-01, 2.65259994e-04, 1.66160002e-01, + -1.52810007e-01, 1.81229994e-01, 7.02740014e-01, + 5.79559989e-03, 5.16639985e-02, -5.97449988e-02, + -2.75510013e-01, -3.90489995e-01, 6.11319989e-02, + 5.54300010e-01, -8.79969969e-02, -4.16810006e-01, + 3.28260005e-01, -5.25489986e-01, -4.42880005e-01, + 8.21829960e-03, 2.44859993e-01, -2.29819998e-01, + -3.49810004e-01, 2.68940002e-01, 3.91660005e-01, + -4.19039994e-01, 1.61909997e-01, -2.62630010e+00, + 6.41340017e-01, 3.97430003e-01, -1.28680006e-01, + -3.19460005e-01, -2.56330013e-01, -1.22199997e-01, + 3.22750002e-01, -7.99330026e-02, -1.53479993e-01, + 3.15050006e-01, 3.05909991e-01, 2.60120004e-01, + 1.85530007e-01, -2.40429997e-01, 4.28860001e-02, + 4.06219989e-01, -2.42559999e-01, 6.38700008e-01, + 6.99829996e-01, -1.40430003e-01, 2.52090007e-01, + 4.89840001e-01, -6.10670000e-02, -3.67659986e-01, + -5.50890028e-01, -3.82649988e-01, -2.08430007e-01, + 2.28320003e-01, 5.12179971e-01, 2.78679997e-01, + 4.76520002e-01, 4.79510017e-02, -3.40079993e-01, + -3.28729987e-01, -4.19669986e-01, -7.54989982e-02, + -3.89539987e-01, -2.96219997e-02, -3.40700001e-01, + 2.21699998e-01, -6.28560036e-02, -5.19029975e-01, + -3.77739996e-01, -4.34770016e-03, -5.83010018e-01, + -8.75459984e-02, -2.39289999e-01, -2.47109994e-01, + -2.58870006e-01, -2.98940003e-01, 1.37150005e-01, + 2.98919994e-02, 3.65439989e-02, -4.96650010e-01, + -1.81600004e-01, 5.29389977e-01, 2.19919994e-01, + -4.45140004e-01, 3.77979994e-01, -5.70620000e-01, + -4.69460003e-02, 8.18059966e-02, 1.92789994e-02, + 3.32459986e-01, -1.46200001e-01, 1.71560004e-01, + 3.99809986e-01, 3.62170011e-01, 1.28160000e-01, + 3.16439986e-01, 3.75690013e-01, -7.46899992e-02, + -4.84800003e-02, -3.14009994e-01, -1.92860007e-01, + -3.12940001e-01, -1.75529998e-02, -1.75139993e-01, + -2.75870003e-02, -1.00000000e+00, 1.83870003e-01, + 
8.14339995e-01, -1.89129993e-01, 5.09989977e-01, + -9.19600017e-03, -1.92950002e-03, 2.81890005e-01, + 2.72470005e-02, 4.34089988e-01, -5.49669981e-01, + -9.74259973e-02, -2.45399997e-01, -1.72030002e-01, + -8.86500031e-02, -3.02980006e-01, -1.35910004e-01, + -2.77649999e-01, 3.12860007e-03, 2.05559999e-01, + -1.57720000e-01, -5.23079991e-01, -6.47010028e-01, + -3.70139986e-01, 6.93930015e-02, 1.14009999e-01, + 2.75940001e-01, -1.38750002e-01, -2.72680014e-01, + 6.68910027e-01, -5.64539991e-02, 2.40170002e-01, + -2.67300010e-01, 2.98599988e-01, 1.00830004e-01, + 5.55920005e-01, 3.28489989e-01, 7.68579990e-02, + 1.55279994e-01, 2.56359994e-01, -1.07720003e-01, + -1.23590000e-01, 1.18270002e-01, -9.90289971e-02, + -3.43279988e-01, 1.15019999e-01, -3.78080010e-01, + -3.90120000e-02, -3.45930010e-01, -1.94040000e-01, + -3.35799992e-01, -6.23340011e-02, 2.89189994e-01, + 2.80319989e-01, -5.37410021e-01, 6.27939999e-01, + 5.69549985e-02, 6.21469975e-01, -2.52819985e-01, + 4.16700006e-01, -1.01079997e-02, -2.54339993e-01, + 4.00029987e-01, 4.24320012e-01, 2.26720005e-01, + 1.75530002e-01, 2.30489999e-01, 2.83230007e-01, + 1.38820007e-01, 3.12180002e-03, 1.70570001e-01, + 3.66849989e-01, 2.52470002e-03, -6.40089989e-01, + -2.97650009e-01, 7.89430022e-01, 3.31680000e-01, + -1.19659996e+00, -4.71559986e-02, 5.31750023e-01], dtype=float32) + +p + | The #[code .vector] attribute will return an object's vector. + | #[+api("doc#vector") #[code Doc.vector]] and + | #[+api("span#vector") #[code Span.vector]] will default to an average + | of their token vectors. You can also check if a token has a vector + | assigned, and get the L2 norm, which can be used to normalise + | vectors. + ++code. 
+ tokens = nlp(u'dog cat banana sasquatch') + + for token in tokens: + print(token.text, token.has_vector, token.vector_norm, token.is_oov) + ++aside + | #[strong Text]: The original token text.#[br] + | #[strong has vector]: Does the token have a vector representation?#[br] + | #[strong Vector norm]: The L2 norm of the token's vector (the square root + | of the sum of the values squared)#[br] + | #[strong is OOV]: Is the word out-of-vocabulary? + ++table(["Text", "Has vector", "Vector norm", "OOV"]) + - var style = [0, 1, 1, 1] + +annotation-row(["dog", true, 7.033672992262838, false], style) + +annotation-row(["cat", true, 6.68081871208896, false], style) + +annotation-row(["banana", true, 6.700014292148571, false], style) + +annotation-row(["sasquatch", false, 0, true], style) + +p + | The words "dog", "cat" and "banana" are all pretty common in English, so + | they're part of the model's vocabulary, and come with a vector. The word + | "sasquatch" on the other hand is a lot less common and out-of-vocabulary + | – so its vector representation consists of 300 dimensions of #[code 0], + | which means it's practically nonexistent. + +p + | If your application will benefit from a large vocabulary with more + | vectors, you should consider using one of the + | #[+a("/docs/usage/models#available") larger models] instead of the default, + | smaller ones, which usually come with a clipped vocabulary. 
diff --git a/website/docs/usage/adding-languages.jade b/website/docs/usage/adding-languages.jade index d1cb1887c..779e2e100 100644 --- a/website/docs/usage/adding-languages.jade +++ b/website/docs/usage/adding-languages.jade @@ -104,6 +104,8 @@ p +image include ../../assets/img/docs/language_data.svg + .u-text-right + +button("/assets/img/docs/language_data.svg", false, "secondary").u-text-tag View large graphic +table(["File name", "Variables", "Description"]) +row @@ -436,6 +438,8 @@ p +h(3, "morph-rules") Morph rules ++under-construction + +h(2, "testing") Testing the new language tokenizer p @@ -533,8 +537,8 @@ p | #[+src(gh("spacy-dev-resources", "training/word_freqs.py")) word_freqs.py] | script from the spaCy developer resources. Note that your corpus should | not be preprocessed (i.e. you need punctuation for example). The - | #[+a("/docs/usage/cli#model") #[code model] command] expects a - | tab-separated word frequencies file with three columns: + | #[+api("cli#model") #[code model]] command expects a tab-separated word + | frequencies file with three columns: +list("numbers") +item The number of times the word occurred in your language sample. @@ -626,37 +630,20 @@ p | trains the model using #[+a("https://radimrehurek.com/gensim/") Gensim]. | The #[code vectors.bin] file should consist of one word and vector per line. -+h(2, "model-directory") Setting up a model directory - -p - | Once you've collected the word frequencies, Brown clusters and word - | vectors files, you can use the - | #[+a("/docs/usage/cli#model") #[code model] command] to create a data - | directory: - -+code(false, "bash"). - python -m spacy model [lang] [model_dir] [freqs_data] [clusters_data] [vectors_data] - -+aside-code("your_data_directory", "yaml"). +//-+aside-code("your_data_directory", "yaml"). 
├── vocab/ - | ├── lexemes.bin # via nlp.vocab.dump(path) - | ├── strings.json # via nlp.vocab.strings.dump(file_) - | └── oov_prob # optional - ├── pos/ # optional - | ├── model # via nlp.tagger.model.dump(path) - | └── config.json # via Langage.train - ├── deps/ # optional - | ├── model # via nlp.parser.model.dump(path) - | └── config.json # via Langage.train - └── ner/ # optional - ├── model # via nlp.entity.model.dump(path) - └── config.json # via Langage.train - -p - | This creates a spaCy data directory with a vocabulary model, ready to be - | loaded. By default, the command expects to be able to find your language - | class using #[code spacy.util.get_lang_class(lang_id)]. - + | ├── lexemes.bin + | ├── strings.json + | └── oov_prob + ├── pos/ + | ├── model + | └── config.json + ├── deps/ + | ├── model + | └── config.json + └── ner/ + ├── model + └── config.json +h(2, "train-tagger-parser") Training the tagger and parser @@ -666,13 +653,12 @@ p | If your corpus uses the | #[+a("http://universaldependencies.org/docs/format.html") CoNLL-U] format, | i.e. files with the extension #[code .conllu], you can use the - | #[+a("/docs/usage/cli#convert") #[code convert] command] to convert it to - | spaCy's #[+a("/docs/api/annotation#json-input") JSON format] for training. + | #[+api("cli#convert") #[code convert]] command to convert it to spaCy's + | #[+a("/docs/api/annotation#json-input") JSON format] for training. p | Once you have your UD corpus transformed into JSON, you can train your - | model use the using spaCy's - | #[+a("/docs/usage/cli#train") #[code train] command]: + | model using spaCy's #[+api("cli#train") #[code train]] command: +code(false, "bash"). 
- python -m spacy train [lang] [output_dir] [train_data] [dev_data] [--n_iter] [--parser_L1] [--no_tagger] [--no_parser] [--no_ner] + python -m spacy train [lang] [output_dir] [train_data] [dev_data] [--n-iter] [--n-sents] [--use-gpu] [--no-tagger] [--no-parser] [--no-entities] diff --git a/website/docs/usage/customizing-pipeline.jade b/website/docs/usage/customizing-pipeline.jade deleted file mode 100644 index a4846d02e..000000000 --- a/website/docs/usage/customizing-pipeline.jade +++ /dev/null @@ -1,38 +0,0 @@ -//- 💫 DOCS > USAGE > CUSTOMIZING THE PIPELINE - -include ../../_includes/_mixins - -p - | spaCy provides several linguistic annotation functions by default. Each - | function takes a Doc object, and modifies it in-place. The default - | pipeline is #[code [nlp.tagger, nlp.entity, nlp.parser]]. spaCy 1.0 - | introduced the ability to customise this pipeline with arbitrary - | functions. - -+code. - def arbitrary_fixup_rules(doc): - for token in doc: - if token.text == u'bill' and token.tag_ == u'NNP': - token.tag_ = u'NN' - - def custom_pipeline(nlp): - return (nlp.tagger, arbitrary_fixup_rules, nlp.parser, nlp.entity) - - nlp = spacy.load('en', create_pipeline=custom_pipeline) - -p - | The easiest way to customise the pipeline is to pass a - | #[code create_pipeline] callback to the #[code spacy.load()] function. - -p - | The callback you pass to #[code create_pipeline] should take a single - | argument, and return a sequence of callables. Each callable in the - | sequence should accept a #[code Doc] object and modify it in place. - -p - | Instead of passing a callback, you can also write to the - | #[code .pipeline] attribute directly. - -+code. 
- nlp = spacy.load('en') - nlp.pipeline = [nlp.tagger] diff --git a/website/docs/usage/customizing-tokenizer.jade b/website/docs/usage/customizing-tokenizer.jade index d43fb438f..86040a4eb 100644 --- a/website/docs/usage/customizing-tokenizer.jade +++ b/website/docs/usage/customizing-tokenizer.jade @@ -11,18 +11,50 @@ p | #[code spaces] booleans, which allow you to maintain alignment of the | tokens into the original string. -+aside("See Also") - | If you haven't read up on spaCy's #[+a("data-model") data model] yet, - | you should probably have a look. The main point to keep in mind is that - | spaCy's #[code Doc] doesn't copy or refer to the original string. The - | string is reconstructed from the tokens when required. ++h(2, "101") Tokenizer 101 +include _spacy-101/_tokenization + ++h(3, "101-data") Tokenizer data + +p + | #[strong Global] and #[strong language-specific] tokenizer data is + | supplied via the language data in #[+src(gh("spaCy", "spacy/lang")) spacy/lang]. + | The tokenizer exceptions define special cases like "don't" in English, + | which needs to be split into two tokens: #[code {ORTH: "do"}] and + | #[code {ORTH: "n't", LEMMA: "not"}]. The prefixes, suffixes and infixes + | mostly define punctuation rules – for example, when to split off periods + | (at the end of a sentence), and when to leave tokens containing periods + | intact (abbreviations like "U.S."). + ++image + include ../../assets/img/docs/language_data.svg + .u-text-right + +button("/assets/img/docs/language_data.svg", false, "secondary").u-text-tag View large graphic + ++infobox + | For more details on the language-specific data, see the + | usage workflow on #[+a("/docs/usage/adding-languages") adding languages]. +h(2, "special-cases") Adding special case tokenization rules p | Most domains have at least some idiosyncracies that require custom - | tokenization rules. Here's how to add a special case rule to an existing + | tokenization rules. 
This could be very certain expressions, or + | abbreviations only used in this specific field. + ++aside("Language data vs. custom tokenization") + | Tokenization rules that are specific to one language, but can be + | #[strong generalised across that language] should ideally live in the + | language data in #[+src(gh("spaCy", "spacy/lang")) spacy/lang] – we + | always appreciate pull requests! Anything that's specific to a domain or + | text type – like financial trading abbreviations, or Bavarian youth slang + | – should be added as a special case rule to your tokenizer instance. If + | you're dealing with a lot of customisations, it might make sense to create + | an entirely custom subclass. + +p + | Here's how to add a special case rule to an existing | #[+api("tokenizer") #[code Tokenizer]] instance: +code. @@ -30,15 +62,12 @@ p from spacy.symbols import ORTH, LEMMA, POS nlp = spacy.load('en') - assert [w.text for w in nlp(u'gimme that')] == [u'gimme', u'that'] - nlp.tokenizer.add_special_case(u'gimme', - [ - { - ORTH: u'gim', - LEMMA: u'give', - POS: u'VERB'}, - { - ORTH: u'me'}]) + doc = nlp(u'gimme that') # phrase to tokenize + assert [w.text for w in doc] == [u'gimme', u'that'] # current tokenization + + # add special case rule + special_case = [{ORTH: u'gim', LEMMA: u'give', POS: u'VERB'}, {ORTH: u'me'}] + nlp.tokenizer.add_special_case(u'gimme', special_case) assert [w.text for w in nlp(u'gimme that')] == [u'gim', u'me', u'that'] assert [w.lemma_ for w in nlp(u'gimme that')] == [u'give', u'me', u'that'] @@ -55,9 +84,8 @@ p | The special case rules have precedence over the punctuation splitting: +code. 
- nlp.tokenizer.add_special_case(u'...gimme...?', - [{ - ORTH: u'...gimme...?', LEMMA: u'give', TAG: u'VB'}]) + special_case = [{ORTH: u'...gimme...?', LEMMA: u'give', TAG: u'VB'}] + nlp.tokenizer.add_special_case(u'...gimme...?', special_case) assert len(nlp(u'...gimme...?')) == 1 p @@ -137,8 +165,8 @@ p +h(2, "native-tokenizers") Customizing spaCy's Tokenizer class p - | Let's imagine you wanted to create a tokenizer for a new language. There - | are four things you would need to define: + | Let's imagine you wanted to create a tokenizer for a new language or + | specific domain. There are four things you would need to define: +list("numbers") +item @@ -170,14 +198,14 @@ p import re from spacy.tokenizer import Tokenizer - prefix_re = re.compile(r'''[\[\("']''') - suffix_re = re.compile(r'''[\]\)"']''') - def create_tokenizer(nlp): - return Tokenizer(nlp.vocab, - prefix_search=prefix_re.search, - suffix_search=suffix_re.search) + prefix_re = re.compile(r'''[\[\("']''') + suffix_re = re.compile(r'''[\]\)"']''') - nlp = spacy.load('en', tokenizer=create_make_doc) + def create_tokenizer(nlp): + return Tokenizer(nlp.vocab, prefix_search=prefix_re.search, + suffix_search=suffix_re.search) + + nlp = spacy.load('en', tokenizer=create_tokenizer) p | If you need to subclass the tokenizer instead, the relevant methods to @@ -187,29 +215,68 @@ p +h(2, "custom-tokenizer") Hooking an arbitrary tokenizer into the pipeline p - | You can pass a custom tokenizer using the #[code make_doc] keyword, when - | you're creating the pipeline: + | The tokenizer is the first component of the processing pipeline and the + | only one that can't be replaced by writing to #[code nlp.pipeline]. This + | is because it has a different signature from all the other components: + | it takes a text and returns a #[code Doc], whereas all other components + | expect to already receive a tokenized #[code Doc]. -+code. 
- import spacy ++image + include ../../assets/img/docs/pipeline.svg + .u-text-right + +button("/assets/img/docs/pipeline.svg", false, "secondary").u-text-tag View large graphic - nlp = spacy.load('en', make_doc=my_tokenizer) p - | However, this approach often leaves us with a chicken-and-egg problem. - | To construct the tokenizer, we usually want attributes of the #[code nlp] - | pipeline. Specifically, we want the tokenizer to hold a reference to the - | pipeline's vocabulary object. Let's say we have the following class as - | our tokenizer: - + | To overwrite the existing tokenizer, you need to replace + | #[code nlp.tokenizer] with a custom function that takes a text, and + | returns a #[code Doc]. + ++code. + nlp = spacy.load('en') + nlp.tokenizer = my_tokenizer + ++table(["Argument", "Type", "Description"]) + +row + +cell #[code text] + +cell unicode + +cell The raw text to tokenize. + + +footrow + +cell returns + +cell #[code Doc] + +cell The tokenized document. + ++infobox("Important note: using a custom tokenizer") + .o-block + | In spaCy v1.x, you had to add a custom tokenizer by passing it to the + | #[code make_doc] keyword argument, or by passing a tokenizer "factory" + | to #[code create_make_doc]. This was unnecessarily complicated. Since + | spaCy v2.0, you can simply write to #[code nlp.tokenizer]. If your + | tokenizer needs the vocab, you can write a function and use + | #[code nlp.vocab]. + + +code-new. + nlp.tokenizer = my_tokenizer + nlp.tokenizer = my_tokenizer_factory(nlp.vocab) + +code-old. + nlp = spacy.load('en', make_doc=my_tokenizer) + nlp = spacy.load('en', create_make_doc=my_tokenizer_factory) + ++h(3, "custom-tokenizer-example") Example: A custom whitespace tokenizer + +p + | To construct the tokenizer, we usually want attributes of the #[code nlp] + | pipeline. Specifically, we want the tokenizer to hold a reference to the + | vocabulary object. Let's say we have the following class as + | our tokenizer: +code. 
- import spacy from spacy.tokens import Doc class WhitespaceTokenizer(object): - def __init__(self, nlp): - self.vocab = nlp.vocab + def __init__(self, vocab): + self.vocab = vocab def __call__(self, text): words = text.split(' ') @@ -218,28 +285,12 @@ p return Doc(self.vocab, words=words, spaces=spaces) p - | As you can see, we need a #[code vocab] instance to construct this — but - | we won't get the #[code vocab] instance until we get back the #[code nlp] - | object from #[code spacy.load()]. The simplest solution is to build the - | object in two steps: + | As you can see, we need a #[code Vocab] instance to construct this — but + | we won't have it until we get back the loaded #[code nlp] object. The + | simplest solution is to build the tokenizer in two steps. This also means + | that you can reuse the "tokenizer factory" and initialise it with + | different instances of #[code Vocab]. +code. nlp = spacy.load('en') - nlp.make_doc = WhitespaceTokenizer(nlp) - -p - | You can instead pass the class to the #[code create_make_doc] keyword, - | which is invoked as callback once the #[code nlp] object is ready: - -+code. - nlp = spacy.load('en', create_make_doc=WhitespaceTokenizer) - -p - | Finally, you can of course create your own subclasses, and create a bound - | #[code make_doc] method. The disadvantage of this approach is that spaCy - | uses inheritance to give each language-specific pipeline its own class. - | If you're working with multiple languages, a naive solution will - | therefore require one custom class per language you're working with. - | This might be at least annoying. You may be able to do something more - | generic by doing some clever magic with metaclasses or mixins, if that's - | the sort of thing you're into. 
+ nlp.tokenizer = WhitespaceTokenizer(nlp.vocab) diff --git a/website/docs/usage/data-model.jade b/website/docs/usage/data-model.jade deleted file mode 100644 index 6be205178..000000000 --- a/website/docs/usage/data-model.jade +++ /dev/null @@ -1,264 +0,0 @@ -//- 💫 DOCS > USAGE > SPACY'S DATA MODEL - -include ../../_includes/_mixins - -p After reading this page, you should be able to: - -+list - +item Understand how spaCy's Doc, Span, Token and Lexeme object work - +item Start using spaCy's Cython API - +item Use spaCy more efficiently - -+h(2, "architecture") Architecture - -+image - include ../../assets/img/docs/architecture.svg - -+h(2, "design-considerations") Design considerations - -+h(3, "no-job-too-big") No job too big - -p - | When writing spaCy, one of my mottos was #[em no job too big]. I wanted - | to make sure that if Google or Facebook were founded tomorrow, spaCy - | would be the obvious choice for them. I wanted spaCy to be the obvious - | choice for web-scale NLP. This meant sweating about performance, because - | for web-scale tasks, Moore's law can't save you. - -p - | Most computational work gets less expensive over time. If you wrote a - | program to solve fluid dynamics in 2008, and you ran it again in 2014, - | you would expect it to be cheaper. For NLP, it often doesn't work out - | that way. The problem is that we're writing programs where the task is - | something like "Process all articles in the English Wikipedia". Sure, - | compute prices dropped from $0.80 per hour to $0.20 per hour on AWS in - | 2008-2014. But the size of Wikipedia grew from 3GB to 11GB. Maybe the - | job is a #[em little] cheaper in 2014 — but not by much. 
- -+h(3, "annotation-layers") Multiple layers of annotation - -p - | When I tell a certain sort of person that I'm a computational linguist, - | this comic is often the first thing that comes to their mind: - -+image("http://i.imgur.com/n3DTzqx.png", 450) - +image-caption © #[+a("http://xkcd.com") xkcd] - -p - | I've thought a lot about what this comic is really trying to say. It's - | probably not talking about #[em data models] — but in that sense at - | least, it really rings true. - -p - | You'll often need to model a document as a sequence of sentences. Other - | times you'll need to model it as a sequence of words. Sometimes you'll - | care about paragraphs, other times you won't. Sometimes you'll care - | about extracting quotes, which can cross paragraph boundaries. A quote - | can also occur within a sentence. When we consider sentence structure, - | things get even more complicated and contradictory. We have syntactic - | trees, sequences of entities, sequences of phrases, sub-word units, - | multi-word units... - -p - | Different applications are going to need to query different, - | overlapping, and often contradictory views of the document. They're - | often going to need to query them jointly. You need to be able to get - | the syntactic head of a named entity, or the sentiment of a paragraph. - -+h(2, "solutions") Solutions - -+h(3) Fat types, thin tokens - -+h(3) Static model, dynamic views - -p - | Different applications are going to need to query different, - | overlapping, and often contradictory views of the document. For this - | reason, I think it's a bad idea to have too much of the document - | structure reflected in the data model. If you structure the data - | according to the needs of one layer of annotation, you're going to need - | to copy the data and transform it in order to use a different layer of - | annotation. You'll soon have lots of copies, and no single source of - | truth. 
- -+h(3) Never go full stand-off - -+h(3) Implementation - -+h(3) Cython 101 - -+h(3) #[code cdef class Doc] - -p - | Let's start at the top. Here's the memory layout of the - | #[+api("doc") #[code Doc]] class, minus irrelevant details: - -+code. - from cymem.cymem cimport Pool - from ..vocab cimport Vocab - from ..structs cimport TokenC - - cdef class Doc: - cdef Pool mem - cdef Vocab vocab - - cdef TokenC* c - - cdef int length - cdef int max_length - -p - | So, our #[code Doc] class is a wrapper around a TokenC* array — that's - | where the actual document content is stored. Here's the #[code TokenC] - | struct, in its entirety: - -+h(3) #[code cdef struct TokenC] - -+code. - cdef struct TokenC: - const LexemeC* lex - uint64_t morph - univ_pos_t pos - bint spacy - int tag - int idx - int lemma - int sense - int head - int dep - bint sent_start - - uint32_t l_kids - uint32_t r_kids - uint32_t l_edge - uint32_t r_edge - - int ent_iob - int ent_type # TODO: Is there a better way to do this? Multiple sources of truth.. - hash_t ent_id - -p - | The token owns all of its linguistic annotations, and holds a const - | pointer to a #[code LexemeC] struct. The #[code LexemeC] struct owns all - | of the #[em vocabulary] data about the word — all the dictionary - | definition stuff that we want to be shared by all instances of the type. - | Here's the #[code LexemeC] struct, in its entirety: - -+h(3) #[code cdef struct LexemeC] - -+code. - cdef struct LexemeC: - - int32_t id - - int32_t orth # Allows the string to be retrieved - int32_t length # Length of the string - - uint64_t flags # These are the most useful parts. - int32_t cluster # Distributional similarity cluster - float prob # Probability - float sentiment # Slot for sentiment - - int32_t lang - - int32_t lower # These string views made sense - int32_t norm # when NLP meant linear models. - int32_t shape # Now they're less relevant, and - int32_t prefix # will probably be revised. 
- int32_t suffix - - float* vector # <-- This was a design mistake, and will change. - -+h(2, "dynamic-views") Dynamic views - -+h(3) Text - -p - | You might have noticed that in all of the structs above, there's not a - | string to be found. The strings are all stored separately, in the - | #[+api("stringstore") #[code StringStore]] class. The lexemes don't know - | the strings — they only know their integer IDs. The document string is - | never stored anywhere, either. Instead, it's reconstructed by iterating - | over the tokens, which look up the #[code orth] attribute of their - | underlying lexeme. Once we have the orth ID, we can fetch the string - | from the vocabulary. Finally, each token knows whether a single - | whitespace character (#[code ' ']) should be used to separate it from - | the subsequent tokens. This allows us to preserve whitespace. - -+code. - cdef print_text(Vocab vocab, const TokenC* tokens, int length): - for i in range(length): - word_string = vocab.strings[tokens.lex.orth] - if tokens.lex.spacy: - word_string += ' ' - print(word_string) - -p - | This is why you get whitespace tokens in spaCy — we need those tokens, - | so that we can reconstruct the document string. I also think you should - | have those tokens anyway. Most NLP libraries strip them, making it very - | difficult to recover the paragraph information once you're at the token - | level. You'll never have that sort of problem with spaCy — because - | there's a single source of truth. - -+h(3) #[code cdef class Token] - -p When you do... - -+code. - doc[i] - -p - | ...you get back an instance of class #[code spacy.tokens.token.Token]. - | This instance owns no data. Instead, it holds the information - | #[code (doc, i)], and uses these to retrieve all information via the - | parent container. - -+h(3) #[code cdef class Span] - -p When you do... - -+code. - doc[i : j] - -p - | ...you get back an instance of class #[code spacy.tokens.span.Span]. 
- | #[code Span] instances are also returned by the #[code .sents], - | #[code .ents] and #[code .noun_chunks] iterators of the #[code Doc] - | object. A #[code Span] is a slice of tokens, with an optional label - | attached. Its data model is: - -+code. - cdef class Span: - cdef readonly Doc doc - cdef int start - cdef int end - cdef int start_char - cdef int end_char - cdef int label - -p - | Once again, the #[code Span] owns almost no data. Instead, it refers - | back to the parent #[code Doc] container. - -p - | The #[code start] and #[code end] attributes refer to token positions, - | while #[code start_char] and #[code end_char] record the character - | positions of the span. By recording the character offsets, we can still - | use the #[code Span] object if the tokenization of the document changes. - -+h(3) #[code cdef class Lexeme] - -p When you do... - -+code. - vocab[u'the'] - -p - | ...you get back an instance of class #[code spacy.lexeme.Lexeme]. The - | #[code Lexeme]'s data model is: - -+code. - cdef class Lexeme: - cdef LexemeC* c - cdef readonly Vocab vocab diff --git a/website/docs/usage/deep-learning.jade b/website/docs/usage/deep-learning.jade index fec01b4ba..18f33c900 100644 --- a/website/docs/usage/deep-learning.jade +++ b/website/docs/usage/deep-learning.jade @@ -17,6 +17,8 @@ p | #[+a("http://deeplearning.net/software/theano/") Theano] is also | supported. ++under-construction + +code("Runtime usage"). def count_entity_sentiment(nlp, texts): '''Compute the net document sentiment for each entity in the texts.''' @@ -153,7 +155,9 @@ p | adding another LSTM layer, using attention mechanism, using character | features, etc. 
-+h(2, "attribute-hooks") Attribute hooks (experimental) ++h(2, "attribute-hooks") Attribute hooks + ++under-construction p | Earlier, we saw how to store data in the new generic #[code user_data] diff --git a/website/docs/usage/dependency-parse.jade b/website/docs/usage/dependency-parse.jade index 904522bd4..dfb37f786 100644 --- a/website/docs/usage/dependency-parse.jade +++ b/website/docs/usage/dependency-parse.jade @@ -6,57 +6,85 @@ p | spaCy features a fast and accurate syntactic dependency parser, and has | a rich API for navigating the tree. The parser also powers the sentence | boundary detection, and lets you iterate over base noun phrases, or - | "chunks". + | "chunks". You can check whether a #[+api("doc") #[code Doc]] object has + | been parsed with the #[code doc.is_parsed] attribute, which returns a + | boolean value. If this attribute is #[code False], the default sentence + | iterator will raise an exception. -+aside-code("Example"). - import spacy ++h(2, "noun-chunks") Noun chunks + +tag-model("dependency parse") + +p + | Noun chunks are "base noun phrases" – flat phrases that have a noun as + | their head. You can think of noun chunks as a noun plus the words describing + | the noun – for example, "the lavish green grass" or "the world’s largest + | tech fund". To get the noun chunks in a document, simply iterate over + | #[+api("doc#noun_chunks") #[code Doc.noun_chunks]]. + ++code("Example"). 
nlp = spacy.load('en') - doc = nlp(u'I like green eggs and ham.') - for np in doc.noun_chunks: - print(np.text, np.root.text, np.root.dep_, np.root.head.text) - # I I nsubj like - # green eggs eggs dobj like - # ham ham conj eggs + doc = nlp(u'Autonomous cars shift insurance liability toward manufacturers') + for chunk in doc.noun_chunks: + print(chunk.text, chunk.root.text, chunk.root.dep_, + chunk.root.head.text) -p - | You can check whether a #[+api("doc") #[code Doc]] object has been - | parsed with the #[code doc.is_parsed] attribute, which returns a boolean - | value. If this attribute is #[code False], the default sentence iterator - | will raise an exception. +aside + | #[strong Text:] The original noun chunk text.#[br] + | #[strong Root text:] The original text of the word connecting the noun + | chunk to the rest of the parse.#[br] + | #[strong Root dep:] Dependency relation connecting the root to its head.#[br] + | #[strong Root head text:] The text of the root token's head.#[br] -+h(2, "displacy") The displaCy visualizer - -p - | The best way to understand spaCy's dependency parser is interactively, - | through the #[+a(DEMOS_URL + "/displacy", true) displaCy visualizer]. If - | you want to know how to write rules that hook into some type of syntactic - | construction, just plug the sentence into the visualizer and see how - | spaCy annotates it. ++table(["Text", "root.text", "root.dep_", "root.head.text"]) + - var style = [0, 0, 1, 0] + +annotation-row(["Autonomous cars", "cars", "nsubj", "shift"], style) + +annotation-row(["insurance liability", "liability", "dobj", "shift"], style) + +annotation-row(["manufacturers", "manufacturers", "pobj", "toward"], style) +h(2, "navigating") Navigating the parse tree p - | spaCy uses the terms #[em head] and #[em child] to describe the words - | connected by a single arc in the dependency tree.
The term #[em dep] is - | used for the arc label, which describes the type of syntactic relation - | that connects the child to the head. As with other attributes, the value - | of #[code token.dep] is an integer. You can get the string value with - | #[code token.dep_]. + | spaCy uses the terms #[strong head] and #[strong child] to describe the words + | #[strong connected by a single arc] in the dependency tree. The term + | #[strong dep] is used for the arc label, which describes the type of + | syntactic relation that connects the child to the head. As with other + | attributes, the value of #[code .dep] is an integer. You can get + | the string value with #[code .dep_]. -+aside-code("Example"). - from spacy.symbols import det - the, dog = nlp(u'the dog') - assert the.dep == det - assert the.dep_ == 'det' ++code("Example"). + doc = nlp(u'Autonomous cars shift insurance liability toward manufacturers') + for token in doc: + print(token.text, token.dep_, token.head.text, token.head.pos_, + [child for child in token.children]) + ++aside + | #[strong Text]: The original token text.#[br] + | #[strong Dep]: The syntactic relation connecting child to head.#[br] + | #[strong Head text]: The original text of the token head.#[br] + | #[strong Head POS]: The part-of-speech tag of the token head.#[br] + | #[strong Children]: The immediate syntactic dependents of the token. 
+ ++table(["Text", "Dep", "Head text", "Head POS", "Children"]) + - var style = [0, 1, 0, 1, 0] + +annotation-row(["Autonomous", "amod", "cars", "NOUN", ""], style) + +annotation-row(["cars", "nsubj", "shift", "VERB", "Autonomous"], style) + +annotation-row(["shift", "ROOT", "shift", "VERB", "cars, liability"], style) + +annotation-row(["insurance", "compound", "liability", "NOUN", ""], style) + +annotation-row(["liability", "dobj", "shift", "VERB", "insurance, toward"], style) + +annotation-row(["toward", "prep", "liability", "NOUN", "manufacturers"], style) + +annotation-row(["manufacturers", "pobj", "toward", "ADP", ""], style) + ++codepen("dcf8d293367ca185b935ed2ca11ebedd", 370) p - | Because the syntactic relations form a tree, every word has exactly one - | head. You can therefore iterate over the arcs in the tree by iterating - | over the words in the sentence. This is usually the best way to match an - | arc of interest — from below: + | Because the syntactic relations form a tree, every word has + | #[strong exactly one head]. You can therefore iterate over the arcs in + | the tree by iterating over the words in the sentence. This is usually + | the best way to match an arc of interest — from below: +code. from spacy.symbols import nsubj, VERB + # Finding a verb with a subject from below — good verbs = set() for possible_subject in doc: @@ -82,6 +110,8 @@ p | attribute, which provides a sequence of #[+api("token") #[code Token]] | objects. ++h(3, "navigating-around") Iterating around the local tree + p | A few more convenience attributes are provided for iterating around the | local tree from the token. The #[code .lefts] and #[code .rights] @@ -90,75 +120,118 @@ p | two integer-typed attributes, #[code .n_rights] and #[code .n_lefts], | that give the number of left and right children. -+aside-code("Examples"). 
- apples = nlp(u'bright red apples on the tree')[2] - print([w.text for w in apples.lefts]) - # ['bright', 'red'] - print([w.text for w in apples.rights]) - # ['on'] - assert apples.n_lefts == 2 - assert apples.n_rights == 1 - - from spacy.symbols import nsubj - doc = nlp(u'Credit and mortgage account holders must submit their requests within 30 days.') - root = [w for w in doc if w.head is w][0] - subject = list(root.lefts)[0] - for descendant in subject.subtree: - assert subject.is_ancestor_of(descendant) - - from spacy.symbols import nsubj - doc = nlp(u'Credit and mortgage account holders must submit their requests.') - holders = doc[4] - span = doc[holders.left_edge.i : holders.right_edge.i + 1] - span.merge() - for word in doc: - print(word.text, word.pos_, word.dep_, word.head.text) - # Credit and mortgage account holders nsubj NOUN submit - # must VERB aux submit - # submit VERB ROOT submit - # their DET det requests - # requests NOUN dobj submit ++code. + doc = nlp(u'bright red apples on the tree') + assert [token.text for token in doc[2].lefts] == [u'bright', u'red'] + assert [token.text for token in doc[2].rights] == ['on'] + assert doc[2].n_lefts == 2 + assert doc[2].n_rights == 1 p | You can get a whole phrase by its syntactic head using the | #[code .subtree] attribute. This returns an ordered sequence of tokens. - | For the default English model, the parse tree is #[em projective], which - | means that there are no crossing brackets. The tokens returned by - | #[code .subtree] are therefore guaranteed to be contiguous. This is not - | true for the German model, which has many - | #[+a("https://explosion.ai/blog/german-model#word-order", true) non-projective dependencies]. | You can walk up the tree with the #[code .ancestors] attribute, and - | check dominance with the #[code .is_ancestor()] method. + | check dominance with the #[+api("token#is_ancestor") #[code .is_ancestor()]] + | method. + ++aside("Projective vs.
non-projective") + | For the #[+a("/docs/usage/models#available") default English model], the + | parse tree is #[strong projective], which means that there are no crossing + | brackets. The tokens returned by #[code .subtree] are therefore guaranteed + | to be contiguous. This is not true for the German model, which has many + | #[+a(COMPANY_URL + "/blog/german-model#word-order", true) non-projective dependencies]. + ++code. + doc = nlp(u'Credit and mortgage account holders must submit their requests') + root = [token for token in doc if token.head is token][0] + subject = list(root.lefts)[0] + for descendant in subject.subtree: + assert subject.is_ancestor(descendant) + print(descendant.text, descendant.dep_, descendant.n_lefts, descendant.n_rights, + [ancestor.text for ancestor in descendant.ancestors]) + ++table(["Text", "Dep", "n_lefts", "n_rights", "ancestors"]) + - var style = [0, 1, 1, 1, 0] + +annotation-row(["Credit", "nmod", 0, 2, "holders, submit"], style) + +annotation-row(["and", "cc", 0, 0, "Credit, holders, submit"], style) + +annotation-row(["mortgage", "compound", 0, 0, "account, Credit, holders, submit"], style) + +annotation-row(["account", "conj", 1, 0, "Credit, holders, submit"], style) + +annotation-row(["holders", "nsubj", 1, 0, "submit"], style) p - | Finally, I often find the #[code .left_edge] and #[code right_edge] - | attributes especially useful. They give you the first and last token + | Finally, the #[code .left_edge] and #[code .right_edge] attributes + | can be especially useful, because they give you the first and last token | of the subtree. This is the easiest way to create a #[code Span] object - | for a syntactic phrase — a useful operation. + | for a syntactic phrase. Note that #[code .right_edge] gives a token + | #[strong within] the subtree — so if you use it as the end-point of a + | range, don't forget to #[code +1]! + ++code. 
+ doc = nlp(u'Credit and mortgage account holders must submit their requests') + span = doc[doc[4].left_edge.i : doc[4].right_edge.i+1] + span.merge() + for token in doc: + print(token.text, token.pos_, token.dep_, token.head.text) + ++table(["Text", "POS", "Dep", "Head text"]) + - var style = [0, 1, 1, 0] + +annotation-row(["Credit and mortgage account holders", "NOUN", "nsubj", "submit"], style) + +annotation-row(["must", "VERB", "aux", "submit"], style) + +annotation-row(["submit", "VERB", "ROOT", "submit"], style) + +annotation-row(["their", "ADJ", "poss", "requests"], style) + +annotation-row(["requests", "NOUN", "dobj", "submit"], style) + ++h(2, "displacy") Visualizing dependencies p - | Note that #[code .right_edge] gives a token #[em within] the subtree — - | so if you use it as the end-point of a range, don't forget to #[code +1]! + | The best way to understand spaCy's dependency parser is interactively. + | To make this easier, spaCy v2.0+ comes with a visualization module. Simply + | pass a #[code Doc] or a list of #[code Doc] objects to + | displaCy and run #[+api("displacy#serve") #[code displacy.serve]] to + | run the web server, or #[+api("displacy#render") #[code displacy.render]] + | to generate the raw markup. If you want to know how to write rules that + | hook into some type of syntactic construction, just plug the sentence into + | the visualizer and see how spaCy annotates it. + ++code. + from spacy import displacy + + doc = nlp(u'Autonomous cars shift insurance liability toward manufacturers') + displacy.serve(doc, style='dep') + ++infobox + | For more details and examples, see the + | #[+a("/docs/usage/visualizers") usage workflow on visualizing spaCy]. You + | can also test displaCy in our #[+a(DEMOS_URL + "/displacy", true) online demo]. +h(2, "disabling") Disabling the parser p - | The parser is loaded and enabled by default. If you don't need any of - | the syntactic information, you should disable the parser. 
Disabling the - | parser will make spaCy load and run much faster. Here's how to prevent - | the parser from being loaded: + | In the #[+a("/docs/usage/models#available") default models], the parser + | is loaded and enabled as part of the + | #[+a("/docs/usage/language-processing-pipeline") standard processing pipeline]. + | If you don't need any of the syntactic information, you should disable + | the parser. Disabling the parser will make spaCy load and run much faster. + | If you want to load the parser, but need to disable it for specific + | documents, you can also control its use on the #[code nlp] object. +code. - import spacy + nlp = spacy.load('en', disable=['parser']) + nlp = English().from_disk('/model', disable=['parser']) + doc = nlp(u"I don't want parsed", disable=['parser']) - nlp = spacy.load('en', parser=False) - -p - | If you need to load the parser, but need to disable it for specific - | documents, you can control its use with the #[code parse] keyword - | argument: - -+code. - nlp = spacy.load('en') - doc1 = nlp(u'Text I do want parsed.') - doc2 = nlp(u"Text I don't want parsed", parse=False) ++infobox("Important note: disabling pipeline components") + .o-block + | Since spaCy v2.0 comes with better support for customising the + | processing pipeline components, the #[code parser] keyword argument + | has been replaced with #[code disable], which takes a list of + | #[+a("/docs/usage/language-processing-pipeline") pipeline component names]. + | This lets you disable both default and custom components when loading + | a model, or initialising a Language class via + | #[+api("language#from_disk") #[code from_disk]]. + +code-new. + nlp = spacy.load('en', disable=['parser']) + doc = nlp(u"I don't want parsed", disable=['parser']) + +code-old.
+ nlp = spacy.load('en', parser=False) + doc = nlp(u"I don't want parsed", parse=False) diff --git a/website/docs/usage/entity-recognition.jade b/website/docs/usage/entity-recognition.jade index 2c3116b82..527c14dde 100644 --- a/website/docs/usage/entity-recognition.jade +++ b/website/docs/usage/entity-recognition.jade @@ -9,14 +9,12 @@ p | locations, organizations and products. You can add arbitrary classes to | the entity recognition system, and update the model with new examples. -+aside-code("Example"). - import spacy - nlp = spacy.load('en') - doc = nlp(u'London is a big city in the United Kingdom.') - for ent in doc.ents: - print(ent.label_, ent.text) - # GPE London - # GPE United Kingdom ++h(2, "101") Named Entity Recognition 101 + +tag-model("named entities") + +include _spacy-101/_named-entities + ++h(2, "accessing") Accessing entity annotations p | The standard way to access entity annotations is the @@ -26,56 +24,89 @@ p | #[code ent.label] and #[code ent.label_]. The #[code Span] object acts | as a sequence of tokens, so you can iterate over the entity or index into | it. You can also get the text form of the whole entity, as though it were - | a single token. See the #[+api("span") API reference] for more details. + | a single token. p - | You can access token entity annotations using the #[code token.ent_iob] - | and #[code token.ent_type] attributes. The #[code token.ent_iob] - | attribute indicates whether an entity starts, continues or ends on the - | tag (In, Begin, Out). + | You can also access token entity annotations using the + | #[+api("token#attributes") #[code token.ent_iob]] and + | #[+api("token#attributes") #[code token.ent_type]] attributes. + | #[code token.ent_iob] indicates whether an entity starts, continues or + | ends on the tag. If no entity type is set on a token, it will return an + | empty string. 
+ ++aside("IOB Scheme") + | #[code I] – Token is inside an entity.#[br] + | #[code O] – Token is outside an entity.#[br] + | #[code B] – Token is the beginning of an entity.#[br] +code("Example"). - doc = nlp(u'London is a big city in the United Kingdom.') - print(doc[0].text, doc[0].ent_iob, doc[0].ent_type_) - # (u'London', 2, u'GPE') - print(doc[1].text, doc[1].ent_iob, doc[1].ent_type_) - # (u'is', 3, u'') + doc = nlp(u'San Francisco considers banning sidewalk delivery robots') + + # document level + ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents] + assert ents == [(u'San Francisco', 0, 13, u'GPE')] + + # token level + ent_san = [doc[0].text, doc[0].ent_iob_, doc[0].ent_type_] + ent_francisco = [doc[1].text, doc[1].ent_iob_, doc[1].ent_type_] + assert ent_san == [u'San', u'B', u'GPE'] + assert ent_francisco == [u'Francisco', u'I', u'GPE'] + ++table(["Text", "ent_iob", "ent_iob_", "ent_type", "ent_type_", "Description"]) + - var style = [0, 1, 1, 1, 1, 0] + +annotation-row(["San", 3, "B", 381, "GPE", "beginning of an entity"], style) + +annotation-row(["Francisco", 1, "I", 381, "GPE", "inside an entity"], style) + +annotation-row(["considers", 2, "O", 0, '""', "outside an entity"], style) + +annotation-row(["banning", 2, "O", 0, '""', "outside an entity"], style) + +annotation-row(["sidewalk", 2, "O", 0, '""', "outside an entity"], style) + +annotation-row(["delivery", 2, "O", 0, '""', "outside an entity"], style) + +annotation-row(["robots", 2, "O", 0, '""', "outside an entity"], style) +h(2, "setting") Setting entity annotations p | To ensure that the sequence of token annotations remains consistent, you - | have to set entity annotations at the document level — you can't write - | directly to the #[code token.ent_iob] or #[code token.ent_type] - | attributes. The easiest way to set entities is to assign to the - | #[code doc.ents] attribute. + | have to set entity annotations #[strong at the document level]. 
However, + | you can't write directly to the #[code token.ent_iob] or + | #[code token.ent_type] attributes, so the easiest way to set entities is + | to assign to the #[+api("doc#ents") #[code doc.ents]] attribute + | and create the new entity as a #[+api("span") #[code Span]]. +code("Example"). - doc = nlp(u'London is a big city in the United Kingdom.') - doc.ents = [] - assert doc[0].ent_type_ == '' - doc.ents = [Span(doc, 0, 1, label=doc.vocab.strings['GPE'])] - assert doc[0].ent_type_ == 'GPE' - doc.ents = [] - doc.ents = [(u'LondonCity', doc.vocab.strings['GPE'], 0, 1)] + from spacy.tokens import Span + + doc = nlp(u'Netflix is hiring a new VP of global policy') + # the model didn't recognise any entities :( + + ORG = doc.vocab.strings[u'ORG'] # get integer ID of entity label + netflix_ent = Span(doc, 0, 1, label=ORG) # create a Span for the new entity + doc.ents = [netflix_ent] + + ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents] + assert ents == [(u'Netflix', 0, 7, u'ORG')] p - | The value you assign should be a sequence, the values of which - | can either be #[code Span] objects, or #[code (ent_id, ent_type, start, end)] - | tuples, where #[code start] and #[code end] are token offsets that - | describe the slice of the document that should be annotated. + | Keep in mind that you need to create a #[code Span] with the start and + | end index of the #[strong token], not the start and end index of the + | entity in the document. In this case, "Netflix" is token #[code (0, 1)] – + | but at the document level, the entity will have the start and end + | indices #[code (0, 7)]. + ++h(3, "setting-from-array") Setting entity annotations from array p - | You can also assign entity annotations using the #[code doc.from_array()] - | method. To do this, you should include both the #[code ENT_TYPE] and the - | #[code ENT_IOB] attributes in the array you're importing from.
+ | You can also assign entity annotations using the + | #[+api("doc#from_array") #[code doc.from_array()]] method. To do this, + | you should include both the #[code ENT_TYPE] and the #[code ENT_IOB] + | attributes in the array you're importing from. -+code("Example"). - from spacy.attrs import ENT_IOB, ENT_TYPE ++code. import numpy + from spacy.attrs import ENT_IOB, ENT_TYPE doc = nlp.make_doc(u'London is a big city in the United Kingdom.') assert list(doc.ents) == [] + header = [ENT_IOB, ENT_TYPE] attr_array = numpy.zeros((len(doc), len(header))) attr_array[0, 0] = 2 # B @@ -83,12 +114,14 @@ p doc.from_array(header, attr_array) assert list(doc.ents)[0].text == u'London' ++h(3, "setting-cython") Setting entity annotations in Cython + p | Finally, you can always write to the underlying struct, if you compile - | a Cython function. This is easy to do, and allows you to write efficient - | native code. + | a #[+a("http://cython.org/") Cython] function. This is easy to do, and + | allows you to write efficient native code. -+code("Example"). ++code. # cython: infer_types=True from spacy.tokens.doc cimport Doc @@ -104,67 +137,30 @@ p | you'll have responsibility for ensuring that the data is left in a | consistent state. - -+h(2, "displacy") Visualizing named entities - -p - | The #[+a(DEMOS_URL + "/displacy-ent/") displaCy #[sup ENT] visualizer] - | lets you explore an entity recognition model's behaviour interactively. - | If you're training a model, it's very useful to run the visualization - | yourself. To help you do that, spaCy v2.0+ comes with a visualization - | module. Simply pass a #[code Doc] or a list of #[code Doc] objects to - | displaCy and run #[+api("displacy#serve") #[code displacy.serve]] to - | run the web server, or #[+api("displacy#render") #[code displacy.render]] - | to generate the raw markup. - -p - | For more details and examples, see the - | #[+a("/docs/usage/visualizers") usage workflow on visualizing spaCy]. 
- -+code("Named Entity example"). - import spacy - from spacy import displacy - - text = """But Google is starting from behind. The company made a late push - into hardware, and Apple’s Siri, available on iPhones, and Amazon’s Alexa - software, which runs on its Echo and Dot devices, have clear leads in - consumer adoption.""" - - nlp = spacy.load('custom_ner_model') - doc = nlp(text) - displacy.serve(doc, style='ent') - -+codepen("a73f8b68f9af3157855962b283b364e4", 345) - +h(2, "entity-types") Built-in entity types -include ../api/_annotation/_named-entities ++aside("Tip: Understanding entity types") + | You can also use #[code spacy.explain()] to get the description for the + | string representation of an entity label. For example, + | #[code spacy.explain("LANGUAGE")] will return "any named language". -+aside("Install") - | The #[+api("load") #[code spacy.load()]] function configures a pipeline that - | includes all of the available annotators for the given ID. In the example - | above, the #[code 'en'] ID tells spaCy to load the default English - | pipeline. If you have installed the data with - | #[code python -m spacy download en], this will include the entity - | recognition model. +include ../api/_annotation/_named-entities +h(2, "updating") Training and updating p | To provide training examples to the entity recogniser, you'll first need - | to create an instance of the #[code GoldParse] class. You can specify - | your annotations in a stand-off format or as token tags. + | to create an instance of the #[+api("goldparse") #[code GoldParse]] class. + | You can specify your annotations in a stand-off format or as token tags. +code. 
- import spacy import random + import spacy from spacy.gold import GoldParse - from spacy.language import EntityRecognizer + from spacy.pipeline import EntityRecognizer - train_data = [ - ('Who is Chaka Khan?', [(7, 17, 'PERSON')]), - ('I like London and Berlin.', [(7, 13, 'LOC'), (18, 24, 'LOC')]) - ] + train_data = [('Who is Chaka Khan?', [(7, 17, 'PERSON')]), + ('I like London and Berlin.', [(7, 13, 'LOC'), (18, 24, 'LOC')])] nlp = spacy.load('en', entity=False, parser=False) ner = EntityRecognizer(nlp.vocab, entity_types=['PERSON', 'LOC']) @@ -237,3 +233,34 @@ p | loss, via the #[+a("http://www.aclweb.org/anthology/C12-1059") dynamic oracle] | imitation learning strategy. The transition system is equivalent to the | BILOU tagging scheme. + ++h(2, "displacy") Visualizing named entities + +p + | The #[+a(DEMOS_URL + "/displacy-ent/") displaCy #[sup ENT] visualizer] + | lets you explore an entity recognition model's behaviour interactively. + | If you're training a model, it's very useful to run the visualization + | yourself. To help you do that, spaCy v2.0+ comes with a visualization + | module. Simply pass a #[code Doc] or a list of #[code Doc] objects to + | displaCy and run #[+api("displacy#serve") #[code displacy.serve]] to + | run the web server, or #[+api("displacy#render") #[code displacy.render]] + | to generate the raw markup. + +p + | For more details and examples, see the + | #[+a("/docs/usage/visualizers") usage workflow on visualizing spaCy]. + ++code("Named Entity example"). + import spacy + from spacy import displacy + + text = """But Google is starting from behind. 
The company made a late push + into hardware, and Apple’s Siri, available on iPhones, and Amazon’s Alexa + software, which runs on its Echo and Dot devices, have clear leads in + consumer adoption.""" + + nlp = spacy.load('custom_ner_model') + doc = nlp(text) + displacy.serve(doc, style='ent') + ++codepen("a73f8b68f9af3157855962b283b364e4", 345) diff --git a/website/docs/usage/index.jade b/website/docs/usage/index.jade index da13f4d81..cb1ab5754 100644 --- a/website/docs/usage/index.jade +++ b/website/docs/usage/index.jade @@ -175,6 +175,136 @@ p +cell Python 3.5+ +cell Visual Studio 2015 ++h(2, "troubleshooting") Troubleshooting guide + +p + | This section collects some of the most common errors you may come + | across when installing, loading and using spaCy, as well as their solutions. + ++aside("Help us improve this guide") + | Did you come across a problem like the ones listed here and want to + | share the solution? You can find the "Suggest edits" button at the + | bottom of this page that points you to the source. We always + | appreciate #[+a(gh("spaCy") + "/pulls") pull requests]! + ++h(3, "compatible-model") No compatible model found + ++code(false, "text"). + No compatible model found for [lang] (spaCy v#{SPACY_VERSION}). + +p + | This usually means that the model you're trying to download does not + | exist, or isn't available for your version of spaCy. Check the + | #[+a(gh("spacy-models", "compatibility.json")) compatibility table] + | to see which models are available for your spaCy version. If you're using + | an old version, consider upgrading to the latest release. Note that while + | spaCy supports tokenization for + | #[+a("/docs/api/language-models/#alpha-support") a variety of languages], + | not all of them come with statistical models. To only use the tokenizer, + | import the language's #[code Language] class instead, for example + | #[code from spacy.fr import French]. 
+ ++h(3, "symlink-privilege") Symbolic link privilege not held + ++code(false, "text"). + OSError: symbolic link privilege not held + +p + | To create #[+a("/docs/usage/models/#usage") shortcut links] that let you + | load models by name, spaCy creates a symbolic link in the + | #[code spacy/data] directory. This means your user needs permission to do + | this. The above error mostly occurs when doing a system-wide installation, + | which will create the symlinks in a system directory. Run the + | #[code download] or #[code link] command as administrator, or use a + | #[code virtualenv] to install spaCy in a user directory, instead + | of doing a system-wide installation. + ++h(3, "no-cache-dir") No such option: --no-cache-dir + ++code(false, "text"). + no such option: --no-cache-dir + +p + | The #[code download] command uses pip to install the models and sets the + | #[code --no-cache-dir] flag to prevent it from requiring too much memory. + | #[+a("https://pip.pypa.io/en/stable/reference/pip_install/#caching") This setting] + | requires pip v6.0 or newer. Run #[code pip install -U pip] to upgrade to + | the latest version of pip. To see which version you have installed, + | run #[code pip --version]. + ++h(3, "import-error") Import error + ++code(false, "text"). + Import Error: No module named spacy + +p + | This error means that the spaCy module can't be located on your system, or in + | your environment. Make sure you have spaCy installed. If you're using a + | #[code virtualenv], make sure it's activated and check that spaCy is + | installed in that environment – otherwise, you're trying to load a system + | installation. You can also run #[code which python] to find out where + | your Python executable is located. + ++h(3, "import-error-models") Import error: models + ++code(false, "text"). + ImportError: No module named 'en_core_web_sm' + +p + | As of spaCy v1.7, all models can be installed as Python packages. 
This means + | that they'll become importable modules of your application. When creating + | #[+a("/docs/usage/models/#usage") shortcut links], spaCy will also try + | to import the model to load its meta data. If this fails, it's usually a + | sign that the package is not installed in the current environment. + | Run #[code pip list] or #[code pip freeze] to check which model packages + | you have installed, and install the + | #[+a("/docs/usage/models#available") correct models] if necessary. If you're + | importing a model manually at the top of a file, make sure to use the name + | of the package, not the shortcut link you've created. + ++h(3, "vocab-strings") File not found: vocab/strings.json + ++code(false, "text"). + FileNotFoundError: No such file or directory: [...]/vocab/strings.json + +p + | This error may occur when using #[code spacy.load()] to load + | a language model – either because you haven't set up a + | #[+a("/docs/usage/models/#usage") shortcut link] for it, or because it + | doesn't actually exist. Set up a + | #[+a("/docs/usage/models/#usage") shortcut link] for the model + | you want to load. This can either be an installed model package, or a + | local directory containing the model data. If you want to use one of the + | #[+a("/docs/api/language-models/#alpha-support") alpha tokenizers] for + | languages that don't yet have a statistical model, you should import its + | #[code Language] class instead, for example + | #[code from spacy.lang.bn import Bengali]. + ++h(3, "command-not-found") Command not found + ++code(false, "text"). + command not found: spacy + +p + | This error may occur when running the #[code spacy] command from the + | command line. spaCy does not currently add an entry to our #[code PATH] + | environment variable, as this can lead to unexpected results, especially + | when using #[code virtualenv]. Run the command with #[code python -m], + | for example #[code python -m spacy download en]. 
For more info on this, + | see #[+api("cli#download") download]. + ++h(3, "module-load") 'module' object has no attribute 'load' + ++code(false, "text"). + AttributeError: 'module' object has no attribute 'load' + +p + | While this could technically have many causes, including spaCy being + | broken, the most likely one is that your script's file or directory name + | is "shadowing" the module – e.g. your file is called #[code spacy.py], + | or a directory you're importing from is called #[code spacy]. So, when + | using spaCy, never call anything else #[code spacy]. + +h(2, "tests") Run tests p diff --git a/website/docs/usage/language-processing-pipeline.jade b/website/docs/usage/language-processing-pipeline.jade index c372dfbf4..ffad01ead 100644 --- a/website/docs/usage/language-processing-pipeline.jade +++ b/website/docs/usage/language-processing-pipeline.jade @@ -2,127 +2,352 @@ include ../../_includes/_mixins -p - | The standard entry point into spaCy is the #[code spacy.load()] - | function, which constructs a language processing pipeline. The standard - | variable name for the language processing pipeline is #[code nlp], for - | Natural Language Processing. The #[code nlp] variable is usually an - | instance of class #[code spacy.language.Language]. For English, the - | #[code spacy.en.English] class is the default. ++h(2, "101") Pipelines 101 + +include _spacy-101/_pipelines + ++h(2, "pipelines") How pipelines work p - | You'll use the nlp instance to produce #[+api("doc") #[code Doc]] - | objects. You'll then use the #[code Doc] object to access linguistic - | annotations to help you with whatever text processing task you're - | trying to do. + | spaCy makes it very easy to create your own pipelines consisting of + | reusable components – this includes spaCy's default vectorizer, tagger, + | parser and entity recognizer, but also your own custom processing + | functions.
A pipeline component can be added to an already existing + | #[code nlp] object, specified when initialising a #[code Language] class, + | or defined within a + | #[+a("/docs/usage/saving-loading#models-generating") model package]. + +p + | When you load a model, spaCy first consults the model's + | #[+a("/docs/usage/saving-loading#models-generating") meta.json]. The + | meta typically includes the model details, the ID of a language class, + | and an optional list of pipeline components. spaCy then does the + | following: + ++aside-code("meta.json (excerpt)", "json"). + { + "name": "example_model", + "lang": "en", + "description": "Example model for spaCy", + "pipeline": ["token_vectors", "tagger"] + } + ++list("numbers") + +item + | Look up #[strong pipeline IDs] in the available + | #[strong pipeline factories]. + +item + | Initialise the #[strong pipeline components] by calling their + | factories with the #[code Vocab] as an argument. This gives each + | factory and component access to the pipeline's shared data, like + | strings, morphology and annotation scheme. + +item + | Load the #[strong language class and data] for the given ID via + | #[+api("util.get_lang_class") #[code get_lang_class]]. + +item + | Pass the path to the #[strong model data] to the #[code Language] + | class and return it. + +p + | So when you call this... +code. - import spacy # See "Installing spaCy" - nlp = spacy.load('en') # You are here. - doc = nlp(u'Hello, spacy!') # See "Using the pipeline" - print((w.text, w.pos_) for w in doc) # See "Doc, Span and Token" - -+aside("Why do we have to preload?") - | Loading the models takes ~200x longer than - | processing a document. We therefore want to amortize the start-up cost - | across multiple invocations. It's often best to wrap the pipeline as a - | singleton. The library avoids doing that for you, because it's a - | difficult design to back out of.
- -p The #[code load] function takes the following positional arguments: - -+table([ "Name", "Description" ]) - +row - +cell #[code lang_id] - +cell - | An ID that is resolved to a class or factory function by - | #[code spacy.util.get_lang_class()]. Common values are - | #[code 'en'] for the English pipeline, or #[code 'de'] for the - | German pipeline. You can register your own factory function or - | class with #[code spacy.util.set_lang_class()]. + nlp = spacy.load('en') p - | All keyword arguments are passed forward to the pipeline factory. No - | keyword arguments are required. The built-in factories (e.g. - | #[code spacy.en.English], #[code spacy.de.German]), which are subclasses - | of #[+api("language") #[code Language]], respond to the following - | keyword arguments: + | ... the model tells spaCy to use the pipeline + | #[code ["vectorizer", "tagger", "parser", "ner"]]. spaCy will then look + | up each string in its internal factories registry and initialise the + | individual components. It'll then load #[code spacy.lang.en.English], + | pass it the path to the model's data directory, and return it for you + | to use as the #[code nlp] object. -+table([ "Name", "Description"]) +p + | When you call #[code nlp] on a text, spaCy will #[strong tokenize] it and + | then #[strong call each component] on the #[code Doc], in order. + | Components all return the modified document, which is then processed by + | the component next in the pipeline. + ++code("The pipeline under the hood"). + doc = nlp.make_doc(u'This is a sentence') + for proc in nlp.pipeline: + doc = proc(doc) + ++h(2, "creating") Creating pipeline components and factories + +p + | spaCy lets you customise the pipeline with your own components. Components + | are functions that receive a #[code Doc] object, modify and return it. + | If your component is stateful, you'll want to create a new one for each + | pipeline. 
You can do that by defining and registering a factory which + | receives the shared #[code Vocab] object and returns a component. + ++h(3, "creating-component") Creating a component + +p + | A component receives a #[code Doc] object and + | #[strong performs the actual processing] – for example, using the current + | weights to make a prediction and set some annotation on the document. By + | adding a component to the pipeline, you'll get access to the #[code Doc] + | at any point #[strong during] processing – instead of only being able to + | modify it afterwards. + ++aside-code("Example"). + def my_component(doc): + # do something to the doc here + return doc + ++table(["Argument", "Type", "Description"]) +row - +cell #[code path] - +cell - | Where to load the data from. If None, the default data path is - | fetched via #[code spacy.util.get_data_path()]. You can - | configure this default using #[code spacy.util.set_data_path()]. - | The data path is expected to be either a string, or an object - | responding to the #[code pathlib.Path] interface. If the path is - | a string, it will be immediately transformed into a - | #[code pathlib.Path] object. spaCy promises to never manipulate - | or open file-system paths as strings. All access to the - | file-system is done via the #[code pathlib.Path] interface. - | spaCy also promises to never check the type of path objects. - | This allows you to customize the loading behaviours in arbitrary - | ways, by creating your own object that implements the - | #[code pathlib.Path] interface. + +cell #[code doc] + +cell #[code Doc] + +cell The #[code Doc] object processed by the previous component. - +row - +cell #[code pipeline] - +cell - | A sequence of functions that take the Doc object and modify it - | in-place. See - | #[+a("customizing-pipeline") Customizing the pipeline]. + +footrow + +cell returns + +cell #[code Doc] + +cell The #[code Doc] object processed by this pipeline component. 
- +row - +cell #[code create_pipeline] - +cell - | Callback to construct the pipeline sequence. It should accept - | the #[code nlp] instance as its only argument, and return a - | sequence of functions that take the #[code Doc] object and - | modify it in-place. - | See #[+a("customizing-pipeline") Customizing the pipeline]. If - | a value is supplied to the pipeline keyword argument, the - | #[code create_pipeline] keyword argument is ignored. +p + | When creating a new #[code Language] class, you can pass it a list of + | pipeline component functions to execute in that order. You can also + | add it to an existing pipeline by modifying #[code nlp.pipeline] – just + | be careful not to overwrite a pipeline or its components by accident! - +row - +cell #[code make_doc] - +cell A function that takes the input and returns a document object. ++code. + # Create a new Language object with a pipeline + from spacy.language import Language + nlp = Language(pipeline=[my_component]) - +row - +cell #[code create_make_doc] - +cell - | Callback to construct the #[code make_doc] function. It should - | accept the #[code nlp] instance as its only argument. To use the - | built-in annotation processes, it should return an object of - | type #[code Doc]. If a value is supplied to the #[code make_doc] - | keyword argument, the #[code create_make_doc] keyword argument - | is ignored. + # Modify an existing pipeline + nlp = spacy.load('en') + nlp.pipeline.append(my_component) ++h(3, "creating-factory") Creating a factory + +p + | A factory is a #[strong function that returns a pipeline component]. + | It's called with the #[code Vocab] object, to give it access to the + | shared data between components – for example, the strings, morphology, + | vectors or annotation scheme. Factories are useful for creating + | #[strong stateful components], especially ones which + | #[strong depend on shared data]. + ++aside-code("Example"). 
+ def my_factory(vocab): + # load some state + def my_component(doc): + # process the doc + return doc + return my_component + ++table(["Argument", "Type", "Description"]) +row +cell #[code vocab] - +cell Supply a pre-built Vocab instance, instead of constructing one. - - +row - +cell #[code add_vectors] + +cell #[code Vocab] +cell - | Callback that installs word vectors into the Vocab instance. The - | #[code add_vectors] callback should take a - | #[+api("vocab") #[code Vocab]] instance as its only argument, - | and set the word vectors and #[code vectors_length] in-place. See - | #[+a("word-vectors-similarities") Word Vectors and Similarities]. + | Shared data between components, including strings, morphology, + | vectors etc. - +row - +cell #[code tagger] - +cell Supply a pre-built tagger, instead of creating one. + +footrow + +cell returns + +cell callable + +cell The pipeline component. - +row - +cell #[code parser] - +cell Supply a pre-built parser, instead of creating one. +p + | By creating a factory, you're essentially telling spaCy how to get the + | pipeline component #[strong once the vocab is available]. Factories need to + | be registered via #[+api("spacy#set_factory") #[code set_factory()]] and + | by assigning them a unique ID. This ID can be added to the pipeline as a + | string. When creating a pipeline, you're free to mix strings and + | callable components: - +row - +cell #[code entity] - +cell Supply a pre-built entity recognizer, instead of creating one. ++code. + spacy.set_factory('my_factory', my_factory) + nlp = Language(pipeline=['my_factory', my_other_component]) - +row - +cell #[code matcher] - +cell Supply a pre-built matcher, instead of creating one. +p + | If spaCy comes across a string in the pipeline, it will try to resolve it + | by looking it up in the available factories. The factory will then be + | initialised with the #[code Vocab]. 
Providing factory names instead of + | callables also makes it easy to specify them in the model's + | #[+a("/docs/usage/saving-loading#models-generating") meta.json]. If you're + | training your own model and want to use one of spaCy's default components, + | you won't have to worry about finding and implementing it either – to use + | the default tagger, simply add #[code "tagger"] to the pipeline, and + | #[strong spaCy will know what to do]. + + ++infobox("Important note") + | Because factories are #[strong resolved on initialisation] of the + | #[code Language] class, it's #[strong not possible] to add them to the + | pipeline afterwards, e.g. by modifying #[code nlp.pipeline]. This only + | works with individual component functions. To use factories, you need to + | create a new #[code Language] object, or generate a + | #[+a("/docs/usage/saving-loading#models-generating") model package] with + | a custom pipeline. + ++h(2, "example1") Example: Custom sentence segmentation logic + ++aside("Real-world examples") + | To see real-world examples of pipeline factories and components in action, + | you can have a look at the source of spaCy's built-in components, e.g. + | the #[+src(gh("spacy")) tagger], #[+src(gh("spacy")) parser] or + | #[+src(gh("spacy")) entity recognizer]. + +p + | Let's say you want to implement custom logic to improve spaCy's sentence + | boundary detection. Currently, sentence segmentation is based on the + | dependency parse, which doesn't always produce ideal results. The custom + | logic should therefore be applied #[strong after] tokenization, but + | #[strong before] the dependency parsing – this way, the parser can also + | take advantage of the sentence boundaries. + ++code. + def sbd_component(doc): + for i, token in enumerate(doc[:-2]): + # define sentence start if period + titlecase token + if token.text == '.' 
and doc[i+1].is_title: + doc[i+1].sent_start = True + return doc + +p + | In this case, we simply want to add the component to the existing + | pipeline of the English model. We can do this by inserting it at index 0 + | of #[code nlp.pipeline]: + ++code. + nlp = spacy.load('en') + nlp.pipeline.insert(0, sbd_component) + +p + | When you call #[code nlp] on some text, spaCy will tokenize it to create + | a #[code Doc] object, and first call #[code sbd_component] on it, followed + | by the model's default pipeline. + ++h(2, "example2") Example: Sentiment model + +p + | Let's say you have trained your own document sentiment model on English + | text. After tokenization, you want spaCy to first execute the + | #[strong default vectorizer], followed by a custom + | #[strong sentiment component] that adds a #[code .sentiment] + | property to the #[code Doc], containing your model's sentiment prediction. + +p + | Your component class will have a #[code from_disk()] method that spaCy + | calls to load the model data. When called, the component will compute + | the sentiment score, add it to the #[code Doc] and return the modified + | document. Optionally, the component can include an #[code update()] method + | to allow training the model. + ++code. + import pickle + from pathlib import Path + + class SentimentComponent(object): + def __init__(self, vocab): + self.weights = None + + def __call__(self, doc): + doc.sentiment = sum(self.weights*doc.vector) # set sentiment property + return doc + + def from_disk(self, path): # path = model path + factory ID ('sentiment') + self.weights = pickle.load(Path(path) / 'weights.bin') # load weights + return self + + def update(self, doc, gold): # update weights – allows training! + prediction = sum(self.weights*doc.vector) + self.weights -= 0.001*doc.vector*(prediction-gold.sentiment) + +p + | The factory will initialise the component with the #[code Vocab] object.
+ | To be able to add it to your model's pipeline as #[code 'sentiment'], + | it also needs to be registered via + | #[+api("spacy#set_factory") #[code set_factory()]]. + ++code. + def sentiment_factory(vocab): + component = SentimentComponent(vocab) # initialise component + return component + + spacy.set_factory('sentiment', sentiment_factory) + +p + | The above code should be #[strong shipped with your model]. You can use + | the #[+api("cli#package") #[code package]] command to create all required + | files and directories. The model package will include an + | #[+src(gh("spacy-dev-resources", "templates/model/en_model_name/__init__.py")) __init__.py] + | with a #[code load()] method, that will initialise the language class with + | the model's pipeline and call the #[code from_disk()] method to load + | the model data. + +p + | In the model package's meta.json, specify the language class and pipeline + | IDs: + ++code("meta.json (excerpt)", "json"). + { + "name": "sentiment_model", + "lang": "en", + "version": "1.0.0", + "spacy_version": ">=2.0.0,<3.0.0", + "pipeline": ["vectorizer", "sentiment"] + } + +p + | When you load your new model, spaCy will call the model's #[code load()] + | method. This will return a #[code Language] object with a pipeline + | containing the default vectorizer, and the sentiment component returned + | by your custom #[code "sentiment"] factory. + ++code. + nlp = spacy.load('en_sentiment_model') + doc = nlp(u'I love pizza') + assert doc.sentiment + ++infobox("Saving and loading models") + | For more information and a detailed guide on how to package your model, + | see the documentation on + | #[+a("/docs/usage/saving-loading#models") saving and loading models]. + ++h(2, "disabling") Disabling pipeline components + +p + | If you don't need a particular component of the pipeline – for + | example, the tagger or the parser, you can disable loading it. This can + | sometimes make a big difference and improve loading speed. 
Disabled + | component names can be provided to #[+api("spacy#load") #[code spacy.load]], + | #[+api("language#from_disk") #[code Language.from_disk]] or the + | #[code nlp] object itself as a list: + ++code. + nlp = spacy.load('en', disable=['parser', 'tagger']) + nlp = English().from_disk('/model', disable=['vectorizer', 'ner']) + doc = nlp(u"I don't want parsed", disable=['parser']) + +p + | Note that you can't write directly to #[code nlp.pipeline], as this list + | holds the #[em actual components], not the IDs. However, if you know the + | order of the components, you can still slice the list: + ++code. + nlp = spacy.load('en') + nlp.pipeline = nlp.pipeline[:2] # only use the first two components + ++infobox("Important note: disabling pipeline components") + .o-block + | Since spaCy v2.0 comes with better support for customising the + | processing pipeline components, the #[code parser], #[code tagger] + | and #[code entity] keyword arguments have been replaced with + | #[code disable], which takes a list of pipeline component names. + | This lets you disable both default and custom components when loading + | a model, or initialising a Language class via + | #[+api("language#from_disk") #[code from_disk]]. + +code-new. + nlp = spacy.load('en', disable=['tagger', 'ner']) + doc = nlp(u"I don't want parsed", disable=['parser']) + +code-old. + nlp = spacy.load('en', tagger=False, entity=False) + doc = nlp(u"I don't want parsed", parse=False) diff --git a/website/docs/usage/lightning-tour.jade b/website/docs/usage/lightning-tour.jade index 967d0c61e..8cf651be0 100644 --- a/website/docs/usage/lightning-tour.jade +++ b/website/docs/usage/lightning-tour.jade @@ -6,23 +6,164 @@ p | The following examples and code snippets give you an overview of spaCy's | functionality and its usage. -+h(2, "models") Install and load models ++h(2, "models") Install models and process text +code(false, "bash"). python -m spacy download en + python -m spacy download de +code.
import spacy nlp = spacy.load('en') + doc = nlp(u'Hello, world. Here are two sentences.') -+h(2, "examples-resources") Load resources and process text + nlp_de = spacy.load('de') + doc_de = nlp_de(u'Ich bin ein Berliner.') + ++infobox + | #[strong API:] #[+api("spacy#load") #[code spacy.load()]] + | #[strong Usage:] #[+a("/docs/usage/models") Models], + | #[+a("/docs/usage/spacy-101") spaCy 101] + ++h(2, "examples-tokens-sentences") Get tokens, noun chunks & sentences + +tag-model("dependency parse") + ++code. + doc = nlp(u"Peach emoji is where it has always been. Peach is the superior " + u"emoji. It's outranking eggplant 🍑 ") + + assert doc[0].text == u'Peach' + assert doc[1].text == u'emoji' + assert doc[-1].text == u'🍑' + assert doc[17:19].text == u'outranking eggplant' + assert doc.noun_chunks[0].text == u'Peach emoji' + + sentences = list(doc.sents) + assert len(sentences) == 3 + assert sentences[0].text == u'Peach is the superior emoji.' + ++infobox + | #[strong API:] #[+api("doc") #[code Doc]], #[+api("token") #[code Token]] + | #[strong Usage:] #[+a("/docs/usage/spacy-101") spaCy 101] + ++h(2, "examples-pos-tags") Get part-of-speech tags and flags + +tag-model("tagger") + ++code. + doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion') + apple = doc[0] + assert [apple.pos_, apple.pos] == [u'PROPN', 94] + assert [apple.tag_, apple.tag] == [u'NNP', 475] + assert [apple.shape_, apple.shape] == [u'Xxxxx', 684] + assert apple.is_alpha == True + assert apple.is_punct == False + + billion = doc[10] + assert billion.is_digit == False + assert billion.like_num == True + assert billion.like_email == False + ++infobox + | #[strong API:] #[+api("token") #[code Token]] + | #[strong Usage:] #[+a("/docs/usage/pos-tagging") Part-of-speech tagging] + ++h(2, "examples-integer-ids") Use integer IDs for any string + ++code. 
+ hello_id = nlp.vocab.strings['Hello'] + hello_str = nlp.vocab.strings[hello_id] + assert token.orth == hello_id == 3125 + assert token.text == hello_str == 'Hello' + ++h(2, "examples-entities") Recognise and update named entities + +tag-model("NER") + ++code. + doc = nlp(u'San Francisco considers banning sidewalk delivery robots') + ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents] + assert ents == [(u'San Francisco', 0, 13, u'GPE')] + + from spacy.tokens import Span + doc = nlp(u'Netflix is hiring a new VP of global policy') + doc.ents = [Span(doc, 0, 1, label=doc.vocab.strings[u'ORG'])] + ents = [(e.start_char, e.end_char, e.label_) for e in doc.ents] + assert ents == [(0, 7, u'ORG')] + ++infobox + | #[strong Usage:] #[+a("/docs/usage/entity-recognition") Named entity recognition] + ++h(2, "displacy") Visualize a dependency parse and named entities in your browser + +tag-model("dependency parse", "NER") + ++code. + from spacy import displacy + + doc_dep = nlp(u'This is a sentence.') + displacy.serve(doc_dep, style='dep') + + doc_ent = nlp(u'When Sebastian Thrun started working on self-driving cars at Google ' + u'in 2007, few people outside of the company took him seriously.') + displacy.serve(doc_ent, style='ent') + ++infobox + | #[strong API:] #[+api("displacy") #[code displacy]] + | #[strong Usage:] #[+a("/docs/usage/visualizers") Visualizers] + ++h(2, "examples-word-vectors") Get word vectors and similarity + +tag-model("word vectors") + ++code. + doc = nlp(u"Apple and banana are similar. Pasta and hippo aren't.") + apple = doc[0] + banana = doc[2] + pasta = doc[6] + hippo = doc[8] + assert apple.similarity(banana) > pasta.similarity(hippo) + assert apple.has_vector and banana.has_vector and pasta.has_vector and hippo.has_vector + ++infobox + | #[strong Usage:] #[+a("/docs/usage/word-vectors-similarities") Word vectors and similarity] + ++h(2, "examples-serialization") Simple and efficient serialization +code.
import spacy - en_nlp = spacy.load('en') - de_nlp = spacy.load('de') - en_doc = en_nlp(u'Hello, world. Here are two sentences.') - de_doc = de_nlp(u'ich bin ein Berliner.') + from spacy.tokens.doc import Doc + from spacy.vocab import Vocab + + nlp = spacy.load('en') + moby_dick = open('moby_dick.txt', 'r').read() + doc = nlp(moby_dick) + doc.to_disk('/moby_dick.bin') + + new_doc = Doc(Vocab()).from_disk('/moby_dick.bin') + ++infobox + | #[strong API:] #[+api("language") #[code Language]], + | #[+api("doc") #[code Doc]] + | #[strong Usage:] #[+a("/docs/usage/saving-loading") Saving and loading] + ++h(2, "rule-matcher") Match text with token rules + ++code. + import spacy + from spacy.matcher import Matcher + + nlp = spacy.load('en') + matcher = Matcher(nlp.vocab) + + def set_sentiment(matcher, doc, i, matches): + doc.sentiment += 0.1 + + pattern1 = [{'ORTH': 'Google'}, {'UPPER': 'I'}, {'ORTH': '/'}, {'UPPER': 'O'}] + pattern2 = [[{'ORTH': emoji, 'OP': '+'}] for emoji in ['😀', '😂', '🤣', '😍']] + matcher.add('GoogleIO', None, pattern1) # match "Google I/O" or "Google i/o" + matcher.add('HAPPY', set_sentiment, *pattern2) # match one or more happy emoji + matches = matcher(nlp(LOTS_OF_TEXT)) + ++infobox + | #[strong API:] #[+api("matcher") #[code Matcher]] + | #[strong Usage:] #[+a("/docs/usage/rule-based-matching") Rule-based matching] +h(2, "multi-threaded") Multi-threaded generator @@ -35,37 +176,25 @@ p if i == 100: break -+h(2, "examples-tokens-sentences") Get tokens and sentences ++infobox + | #[strong API:] #[+api("doc") #[code Doc]] + | #[strong Usage:] #[+a("/docs/usage/production-use") Production usage] + ++h(2, "examples-dependencies") Get syntactic dependencies + +tag-model("dependency parse") +code. - token = doc[0] - sentence = next(doc.sents) - assert token is sentence[0] - assert sentence.text == 'Hello, world.'
+ def dependency_labels_to_root(token): + """Walk up the syntactic tree, collecting the arc labels.""" + dep_labels = [] + while token.head is not token: + dep_labels.append(token.dep) + token = token.head + return dep_labels -+h(2, "examples-integer-ids") Use integer IDs for any string - -+code. - hello_id = nlp.vocab.strings['Hello'] - hello_str = nlp.vocab.strings[hello_id] - - assert token.orth == hello_id == 3125 - assert token.orth_ == hello_str == 'Hello' - -+h(2, "examples-string-views-flags") Get and set string views and flags - -+code. - assert token.shape_ == 'Xxxxx' - for lexeme in nlp.vocab: - if lexeme.is_alpha: - lexeme.shape_ = 'W' - elif lexeme.is_digit: - lexeme.shape_ = 'D' - elif lexeme.is_punct: - lexeme.shape_ = 'P' - else: - lexeme.shape_ = 'M' - assert token.shape_ == 'W' ++infobox + | #[strong API:] #[+api("token") #[code Token]] + | #[strong Usage:] #[+a("/docs/usage/dependency-parse") Using the dependency parse] +h(2, "examples-numpy-arrays") Export to numpy arrays @@ -80,107 +209,25 @@ p assert doc[0].like_url == doc_array[0, 1] assert list(doc_array[:, 1]) == [t.like_url for t in doc] -+h(2, "examples-word-vectors") Word vectors - -+code. - doc = nlp("Apples and oranges are similar. Boots and hippos aren't.") - - apples = doc[0] - oranges = doc[2] - boots = doc[6] - hippos = doc[8] - - assert apples.similarity(oranges) > boots.similarity(hippos) - -+h(2, "examples-pos-tags") Part-of-speech tags - -+code. - from spacy.parts_of_speech import ADV - - def is_adverb(token): - return token.pos == spacy.parts_of_speech.ADV - - # These are data-specific, so no constants are provided. You have to look - # up the IDs from the StringStore. 
- NNS = nlp.vocab.strings['NNS'] - NNPS = nlp.vocab.strings['NNPS'] - def is_plural_noun(token): - return token.tag == NNS or token.tag == NNPS - - def print_coarse_pos(token): - print(token.pos_) - - def print_fine_pos(token): - print(token.tag_) - -+h(2, "examples-dependencies") Syntactic dependencies - -+code. - def dependency_labels_to_root(token): - '''Walk up the syntactic tree, collecting the arc labels.''' - dep_labels = [] - while token.head is not token: - dep_labels.append(token.dep) - token = token.head - return dep_labels - -+h(2, "examples-entities") Named entities - -+code. - def iter_products(docs): - for doc in docs: - for ent in doc.ents: - if ent.label_ == 'PRODUCT': - yield ent - - def word_is_in_entity(word): - return word.ent_type != 0 - - def count_parent_verb_by_person(docs): - counts = defaultdict(lambda: defaultdict(int)) - for doc in docs: - for ent in doc.ents: - if ent.label_ == 'PERSON' and ent.root.head.pos == VERB: - counts[ent.orth_][ent.root.head.lemma_] += 1 - return counts - -+h(2, "examples-inline") Calculate inline mark-up on original string ++h(2, "examples-inline") Calculate inline markup on original string +code. def put_spans_around_tokens(doc, get_classes): - '''Given some function to compute class names, put each token in a - span element, with the appropriate classes computed. - - All whitespace is preserved, outside of the spans. (Yes, I know HTML - won't display it. But the point is no information is lost, so you can - calculate what you need, e.g.
tags,

tags, etc.) - ''' + """Given some function to compute class names, put each token in a + span element, with the appropriate classes computed. All whitespace is + preserved, outside of the spans. (Of course, HTML won't display more than + one whitespace character it – but the point is, no information is lost + and you can calculate what you need, e.g. <br />, <p> etc.) + """ output = [] - template = '{word}{space}' + html = '<span class="{classes}">{word}</span>{space}' for token in doc: if token.is_space: - output.append(token.orth_) + output.append(token.text) else: - output.append( - template.format( - classes=' '.join(get_classes(token)), - word=token.orth_, - space=token.whitespace_)) + classes = ' '.join(get_classes(token)) + output.append(html.format(classes=classes, word=token.text, space=token.whitespace_)) string = ''.join(output) string = string.replace('\n', '') string = string.replace('\t', ' ') return string - -+h(2, "examples-binary") Efficient binary serialization - -+code. - import spacy - from spacy.tokens.doc import Doc - - byte_string = doc.to_bytes() - open('moby_dick.bin', 'wb').write(byte_string) - - nlp = spacy.load('en') - for byte_string in Doc.read_bytes(open('moby_dick.bin', 'rb')): - doc = Doc(nlp.vocab) - doc.from_bytes(byte_string) diff --git a/website/docs/usage/models.jade b/website/docs/usage/models.jade index 2dec5197e..a837b4d29 100644 --- a/website/docs/usage/models.jade +++ b/website/docs/usage/models.jade @@ -195,7 +195,7 @@ p | privileges, the #[code spacy link] command may fail. The easiest solution | is to re-run the command as admin, or use a #[code virtualenv]. For more | info on this, see the - | #[+a("/docs/usage/troubleshooting#symlink-privilege") troubleshooting guide]. + | #[+a("/docs/usage/#symlink-privilege") troubleshooting guide]. 
+h(3, "usage-import") Importing models as modules @@ -233,4 +233,4 @@ p +infobox("Saving and loading models") | For more information and a detailed guide on how to package your model, | see the documentation on - | #[+a("/docs/usage/saving-loading") saving and loading models]. + | #[+a("/docs/usage/saving-loading#models") saving and loading models]. diff --git a/website/docs/usage/pos-tagging.jade b/website/docs/usage/pos-tagging.jade index cded00b6c..dd72efeba 100644 --- a/website/docs/usage/pos-tagging.jade +++ b/website/docs/usage/pos-tagging.jade @@ -7,22 +7,12 @@ p | assigned to each token in the document. They're useful in rule-based | processes. They can also be useful features in some statistical models. -p - | To use spaCy's tagger, you need to have a data pack installed that - | includes a tagging model. Tagging models are included in the data - | downloads for English and German. After you load the model, the tagger - | is applied automatically, as part of the default pipeline. You can then - | access the tags using the #[+api("token") #[code Token.tag]] and - | #[+api("token") #[code token.pos]] attributes. For English, the tagger - | also triggers some simple rule-based morphological processing, which - | gives you the lemma as well. ++h(2, "101") Part-of-speech tagging 101 + +tag-model("tagger", "dependency parse") -+code("Usage"). - import spacy - nlp = spacy.load('en') - doc = nlp(u'They told us to duck.') - for word in doc: - print(word.text, word.lemma, word.lemma_, word.tag, word.tag_, word.pos, word.pos_) +include _spacy-101/_pos-deps + ++aside("Help – spaCy's output is wrong!") +h(2, "rule-based-morphology") Rule-based morphology @@ -63,7 +53,8 @@ p +list("numbers") +item - | The tokenizer consults a #[strong mapping table] + | The tokenizer consults a + | #[+a("/docs/usage/adding-languages#tokenizer-exceptions") mapping table] | #[code TOKENIZER_EXCEPTIONS], which allows sequences of characters | to be mapped to multiple tokens. 
Each token may be assigned a part | of speech and one or more morphological features. @@ -77,8 +68,9 @@ p +item | For words whose POS is not set by a prior process, a - | #[strong mapping table] #[code TAG_MAP] maps the tags to a - | part-of-speech and a set of morphological features. + | #[+a("/docs/usage/adding-languages#tag-map") mapping table] + | #[code TAG_MAP] maps the tags to a part-of-speech and a set of + | morphological features. +item | Finally, a #[strong rule-based deterministic lemmatizer] maps the diff --git a/website/docs/usage/processing-text.jade b/website/docs/usage/processing-text.jade deleted file mode 100644 index 4bd6132d2..000000000 --- a/website/docs/usage/processing-text.jade +++ /dev/null @@ -1,134 +0,0 @@ -//- 💫 DOCS > USAGE > PROCESSING TEXT - -include ../../_includes/_mixins - -p - | Once you have loaded the #[code nlp] object, you can call it as though - | it were a function. This allows you to process a single unicode string. - -+code. - doc = nlp(u'Hello, world! A three sentence document.\nWith new lines...') - -p - | The library should perform equally well with short or long documents. - | All algorithms are linear-time in the length of the string, and once the - | data is loaded, there's no significant start-up cost to consider. This - | means that you don't have to strategically merge or split your text — - | you should feel free to feed in either single tweets or whole novels. - -p - | If you run #[code nlp = spacy.load('en')], the #[code nlp] object will - | be an instance of #[code spacy.en.English]. This means that when you run - | #[code doc = nlp(text)], you're executing - | #[code spacy.en.English.__call__], which is implemented on its parent - | class, #[+api("language") #[code Language]]. - -+code. 
- doc = nlp.make_doc(text) - for proc in nlp.pipeline: - proc(doc) - -p - | I've tried to make sure that the #[code Language.__call__] function - | doesn't do any "heavy lifting", so that you won't have complicated logic - | to replicate if you need to make your own pipeline class. This is all it - | does. - -p - | The #[code .make_doc()] method and #[code .pipeline] attribute make it - | easier to customise spaCy's behaviour. If you're using the default - | pipeline, we can desugar one more time. - -+code. - doc = nlp.tokenizer(text) - nlp.tagger(doc) - nlp.parser(doc) - nlp.entity(doc) - -p Finally, here's where you can find out about each of those components: - -+table(["Name", "Source"]) - +row - +cell #[code tokenizer] - +cell #[+src(gh("spacy", "spacy/tokenizer.pyx")) spacy.tokenizer.Tokenizer] - - +row - +cell #[code tagger] - +cell #[+src(gh("spacy", "spacy/tagger.pyx")) spacy.pipeline.Tagger] - - +row - +cell #[code parser] - +cell #[+src(gh("spacy", "spacy/syntax/parser.pyx")) spacy.pipeline.DependencyParser] - - +row - +cell #[code entity] - +cell #[+src(gh("spacy", "spacy/syntax/parser.pyx")) spacy.pipeline.EntityRecognizer] - -+h(2, "multithreading") Multi-threading with #[code .pipe()] - -p - | If you have a sequence of documents to process, you should use the - | #[+api("language#pipe") #[code .pipe()]] method. The #[code .pipe()] - | method takes an iterator of texts, and accumulates an internal buffer, - | which it works on in parallel. It then yields the documents in order, - | one-by-one. After a long and bitter struggle, the global interpreter - | lock was freed around spaCy's main parsing loop in v0.100.3. This means - | that the #[code .pipe()] method will be significantly faster in most - | practical situations, because it allows shared memory parallelism. - -+code. - for doc in nlp.pipe(texts, batch_size=10000, n_threads=3): - pass - -p - | To make full use of the #[code .pipe()] function, you might want to - | brush up on Python generators. 
Here are a few quick hints: - -+list - +item - | Generator comprehensions can be written - | (#[code item for item in sequence]) - - +item - | The #[code itertools] built-in library and the #[code cytoolz] - | package provide a lot of handy generator tools - - +item - | Often you'll have an input stream that pairs text with some - | important metadata, e.g. a JSON document. To pair up the metadata - | with the processed #[code Doc] object, you should use the tee - | function to split the generator in two, and then #[code izip] the - | extra stream to the document stream. - -+h(2, "own-annotations") Bringing your own annotations - -p - | spaCy generally assumes by default that your data is raw text. However, - | sometimes your data is partially annotated, e.g. with pre-existing - | tokenization, part-of-speech tags, etc. The most common situation is - | that you have pre-defined tokenization. If you have a list of strings, - | you can create a #[code Doc] object directly. Optionally, you can also - | specify a list of boolean values, indicating whether each word has a - | subsequent space. - -+code. - doc = Doc(nlp.vocab, words=[u'Hello', u',', u'world', u'!'], spaces=[False, True, False, False]) - -p - | If provided, the spaces list must be the same length as the words list. - | The spaces list affects the #[code doc.text], #[code span.text], - | #[code token.idx], #[code span.start_char] and #[code span.end_char] - | attributes. If you don't provide a #[code spaces] sequence, spaCy will - | assume that all words are whitespace delimited. - -+code. - good_spaces = Doc(nlp.vocab, words=[u'Hello', u',', u'world', u'!'], spaces=[False, True, False, False]) - bad_spaces = Doc(nlp.vocab, words=[u'Hello', u',', u'world', u'!']) - assert bad_spaces.text == u'Hello , world !' - assert good_spaces.text == u'Hello, world!' 
- -p - | Once you have a #[+api("doc") #[code Doc]] object, you can write to its - | attributes to set the part-of-speech tags, syntactic dependencies, named - | entities and other attributes. For details, see the respective usage - | pages. diff --git a/website/docs/usage/production-use.jade b/website/docs/usage/production-use.jade new file mode 100644 index 000000000..e9fd4a30f --- /dev/null +++ b/website/docs/usage/production-use.jade @@ -0,0 +1,78 @@ +//- 💫 DOCS > USAGE > PROCESSING TEXT + +include ../../_includes/_mixins + ++under-construction + ++h(2, "multithreading") Multi-threading with #[code .pipe()] + +p + | If you have a sequence of documents to process, you should use the + | #[+api("language#pipe") #[code Language.pipe()]] method. The method takes + | an iterator of texts, and accumulates an internal buffer, + | which it works on in parallel. It then yields the documents in order, + | one-by-one. After a long and bitter struggle, the global interpreter + | lock was freed around spaCy's main parsing loop in v0.100.3. This means + | that #[code .pipe()] will be significantly faster in most + | practical situations, because it allows shared memory parallelism. + ++code. + for doc in nlp.pipe(texts, batch_size=10000, n_threads=3): + pass + +p + | To make full use of the #[code .pipe()] function, you might want to + | brush up on #[strong Python generators]. Here are a few quick hints: + ++list + +item + | Generator comprehensions can be written as + | #[code (item for item in sequence)]. + + +item + | The + | #[+a("https://docs.python.org/2/library/itertools.html") #[code itertools] built-in library] + | and the + | #[+a("https://github.com/pytoolz/cytoolz") #[code cytoolz] package] + | provide a lot of handy #[strong generator tools]. + + +item + | Often you'll have an input stream that pairs text with some + | important meta data, e.g. a JSON document. 
To + | #[strong pair up the meta data] with the processed #[code Doc] + | object, you should use the #[code itertools.tee] function to split + | the generator in two, and then #[code izip] the extra stream to the + | document stream. + ++h(2, "own-annotations") Bringing your own annotations + +p + | spaCy generally assumes by default that your data is raw text. However, + | sometimes your data is partially annotated, e.g. with pre-existing + | tokenization, part-of-speech tags, etc. The most common situation is + | that you have pre-defined tokenization. If you have a list of strings, + | you can create a #[code Doc] object directly. Optionally, you can also + | specify a list of boolean values, indicating whether each word has a + | subsequent space. + ++code. + doc = Doc(nlp.vocab, words=[u'Hello', u',', u'world', u'!'], spaces=[False, True, False, False]) + +p + | If provided, the spaces list must be the same length as the words list. + | The spaces list affects the #[code doc.text], #[code span.text], + | #[code token.idx], #[code span.start_char] and #[code span.end_char] + | attributes. If you don't provide a #[code spaces] sequence, spaCy will + | assume that all words are whitespace delimited. + ++code. + good_spaces = Doc(nlp.vocab, words=[u'Hello', u',', u'world', u'!'], spaces=[False, True, False, False]) + bad_spaces = Doc(nlp.vocab, words=[u'Hello', u',', u'world', u'!']) + assert bad_spaces.text == u'Hello , world !' + assert good_spaces.text == u'Hello, world!' + +p + | Once you have a #[+api("doc") #[code Doc]] object, you can write to its + | attributes to set the part-of-speech tags, syntactic dependencies, named + | entities and other attributes. For details, see the respective usage + | pages. 
diff --git a/website/docs/usage/resources.jade b/website/docs/usage/resources.jade deleted file mode 100644 index 56e92a1e7..000000000 --- a/website/docs/usage/resources.jade +++ /dev/null @@ -1,118 +0,0 @@ -//- 💫 DOCS > USAGE > RESOURCES - -include ../../_includes/_mixins - -p Many of the associated tools and resources that we're developing alongside spaCy can be found in their own repositories. - -+h(2, "developer") Developer tools - -+table(["Name", "Description"]) - +row - +cell - +src(gh("spacy-models")) spaCy Models - - +cell - | Model releases for spaCy. - - +row - +cell - +src(gh("spacy-dev-resources")) spaCy Dev Resources - - +cell - | Scripts, tools and resources for developing spaCy, adding new - | languages and training new models. - - +row - +cell - +src("spacy-benchmarks") spaCy Benchmarks - - +cell - | Runtime performance comparison of spaCy against other NLP - | libraries. - - +row - +cell - +src(gh("spacy-services")) spaCy Services - - +cell - | REST microservices for spaCy demos and visualisers. - - +row - +cell - +src(gh("spacy-notebooks")) spaCy Notebooks - - +cell - | Jupyter notebooks for spaCy examples and tutorials. - -+h(2, "libraries") Libraries and projects -+table(["Name", "Description"]) - +row - +cell - +src(gh("sense2vec")) sense2vec - - +cell - | Use spaCy to go beyond vanilla - | #[+a("https://en.wikipedia.org/wiki/Word2vec") Word2vec]. - -+h(2, "utility") Utility libraries and dependencies - -+table(["Name", "Description"]) - +row - +cell - +src(gh("thinc")) Thinc - - +cell - | spaCy's Machine Learning library for NLP in Python. - - +row - +cell - +src(gh("cymem")) Cymem - - +cell - | Gate Cython calls to malloc/free behind Python ref-counted - | objects. - - +row - +cell - +src(gh("preshed")) Preshed - - +cell - | Cython hash tables that assume keys are pre-hashed - - +row - +cell - +src(gh("murmurhash")) MurmurHash - - +cell - | Cython bindings for - | #[+a("https://en.wikipedia.org/wiki/MurmurHash") MurmurHash2]. 
- -+h(2, "visualizers") Visualisers and demos - -+table(["Name", "Description"]) - +row - +cell - +src(gh("displacy")) displaCy.js - - +cell - | A lightweight dependency visualisation library for the modern - | web, built with JavaScript, CSS and SVG. - | #[+a(DEMOS_URL + "/displacy") Demo here]. - - +row - +cell - +src(gh("displacy-ent")) displaCy#[sup ENT] - - +cell - | A lightweight and modern named entity visualisation library - | built with JavaScript and CSS. - | #[+a(DEMOS_URL + "/displacy-ent") Demo here]. - - +row - +cell - +src(gh("sense2vec-demo")) sense2vec Demo - - +cell - | Source of our Semantic Analysis of the Reddit Hivemind - | #[+a(DEMOS_URL + "/sense2vec") demo] using - | #[+a(gh("sense2vec")) sense2vec]. diff --git a/website/docs/usage/rule-based-matching.jade b/website/docs/usage/rule-based-matching.jade index a54b70b89..1fd398ad9 100644 --- a/website/docs/usage/rule-based-matching.jade +++ b/website/docs/usage/rule-based-matching.jade @@ -11,7 +11,7 @@ p | You can also associate patterns with entity IDs, to allow some basic | entity linking or disambiguation. -+aside("What about \"real\" regular expressions?") +//-+aside("What about \"real\" regular expressions?") +h(2, "adding-patterns") Adding patterns @@ -119,7 +119,7 @@ p +code. # Add a new custom flag to the vocab, which is always False by default. # BAD_HTML_FLAG will be the flag ID, which we can use to set it to True on the span. 
- BAD_HTML_FLAG = doc.vocab.add_flag(lambda text: False) + BAD_HTML_FLAG = nlp.vocab.add_flag(lambda text: False) def merge_and_flag(matcher, doc, i, matches): match_id, start, end = matches[i] @@ -221,7 +221,7 @@ p +cell match 0 or 1 times +cell optional, max one -+h(3, "quantifiers-example1") Quantifiers example: Using linguistic annotations ++h(2, "example1") Example: Using linguistic annotations p | Let's say you're analysing user comments and you want to find out what @@ -283,7 +283,7 @@ p # set manual=True to make displaCy render straight from a dictionary displacy.serve(matched_sents, style='ent', manual=True) -+h(3, "quantifiers-example2") Quantifiers example: Phone numbers ++h(2, "example2") Example: Phone numbers p | Phone numbers can have many different formats and matching them is often @@ -320,3 +320,114 @@ p | It'll produce more predictable results, is much easier to modify and | extend, and doesn't require any training data – only a set of | test cases. + ++h(2, "example3") Example: Hashtags and emoji on social media + +p + | Social media posts, especially tweets, can be difficult to work with. + | They're very short and often contain various emoji and hashtags. By only + | looking at the plain text, you'll lose a lot of valuable semantic + | information. + +p + | Let's say you've extracted a large sample of social media posts on a + | specific topic, for example posts mentioning a brand name or product. + | As the first step of your data exploration, you want to filter out posts + | containing certain emoji and use them to assign a general sentiment + | score, based on whether the expressed emotion is positive or negative, + | e.g. #[span.o-icon.o-icon--inline 😀] or #[span.o-icon.o-icon--inline 😞]. + | You also want to find, merge and label hashtags like + | #[code #MondayMotivation], to be able to ignore or analyse them later. + ++aside("Note on sentiment analysis") + | Ultimately, sentiment analysis is not always #[em that] easy. 
In + | addition to the emoji, you'll also want to take specific words into + | account and check the #[code subtree] for intensifiers like "very", to + | increase the sentiment score. At some point, you might also want to train + | a sentiment model. However, the approach described in this example is + | very useful for #[strong bootstrapping rules to collect training data]. + | It's also an incredibly fast way to gather first insights into your data + | – with about 1 million tweets, you'd be looking at a processing time of + | #[strong under 1 minute]. + +p + | By default, spaCy's tokenizer will split emoji into separate tokens. This + | means that you can create a pattern for one or more emoji tokens. In this + | case, a sequence of identical emoji should be treated as one instance. + | Valid hashtags usually consist of a #[code #], plus a sequence of + | ASCII characters with no whitespace, making them easy to match as well. + ++code. + from spacy.lang.en import English + from spacy.matcher import Matcher + + nlp = English() # we only want the tokenizer, so no need to load a model + matcher = Matcher(nlp.vocab) + + pos_emoji = [u'😀', u'😃', u'😂', u'🤣', u'😊', u'😍'] # positive emoji + neg_emoji = [u'😞', u'😠', u'😩', u'😢', u'😭', u'😒'] # negative emoji + + # add patterns to match one or more emoji tokens + pos_patterns = [[{'ORTH': emoji, 'OP': '+'}] for emoji in pos_emoji] + neg_patterns = [[{'ORTH': emoji, 'OP': '+'}] for emoji in neg_emoji] + + matcher.add('HAPPY', label_sentiment, *pos_patterns) # add positive pattern + matcher.add('SAD', label_sentiment, *neg_patterns) # add negative pattern + + # add pattern to merge valid hashtag, i.e. '#' plus any ASCII token + matcher.add('HASHTAG', merge_hashtag, [{'ORTH': '#'}, {'IS_ASCII': True}]) + +p + | Because the #[code on_match] callback receives the ID of each match, you + | can use the same function to handle the sentiment assignment for both + | the positive and negative pattern. 
To keep it simple, we'll either add + | or subtract #[code 0.1] points – this way, the score will also reflect + | combinations of emoji, even positive #[em and] negative ones. + +p + | With a library like + | #[+a("https://github.com/bcongdon/python-emojipedia") Emojipedia], + | we can also retrieve a short description for each emoji – for example, + | #[span.o-icon.o-icon--inline 😍]'s official title is "Smiling Face With + | Heart-Eyes". Assigning it to the merged token's norm will make it + | available as #[code token.norm_]. + ++code. + from emojipedia import Emojipedia # installation: pip install emojipedia + + def label_sentiment(matcher, doc, i, matches): + match_id, start, end = matches[i] + if match_id == 'HAPPY': + doc.sentiment += 0.1 # add 0.1 for positive sentiment + elif match_id == 'SAD': + doc.sentiment -= 0.1 # subtract 0.1 for negative sentiment + span = doc[start : end] + emoji = Emojipedia.search(span[0].text) # get data for emoji + span.merge(norm=emoji.title) # merge span and set NORM to emoji title + +p + | To label the hashtags, we first need to add a new custom flag. + | #[code IS_HASHTAG] will be the flag's ID, which you can use to assign it + | to the hashtag's span, and check its value via a token's + | #[+api("token#check_flag") #[code check_flag()]] method. On each + | match, we merge the hashtag and assign the flag. + ++code. + # Add a new custom flag to the vocab, which is always False by default + IS_HASHTAG = nlp.vocab.add_flag(lambda text: False) + + def merge_hashtag(matcher, doc, i, matches): + match_id, start, end = matches[i] + span = doc[start : end] + span.merge() # merge hashtag + span.set_flag(IS_HASHTAG, True) # set IS_HASHTAG to True + +p + | To process a stream of social media posts, we can use + | #[+api("language#pipe") #[code Language.pipe()]], which will return a + | stream of #[code Doc] objects that we can pass to + | #[+api("matcher#pipe") #[code Matcher.pipe()]]. + ++code.
+ docs = nlp.pipe(LOTS_OF_TWEETS) + matches = matcher.pipe(docs) diff --git a/website/docs/usage/saving-loading.jade b/website/docs/usage/saving-loading.jade index c4eb08f04..1ecb7d7ee 100644 --- a/website/docs/usage/saving-loading.jade +++ b/website/docs/usage/saving-loading.jade @@ -1,45 +1,87 @@ include ../../_includes/_mixins ++h(2, "101") Serialization 101 + +include _spacy-101/_serialization + ++infobox("Important note") + | In spaCy v2.0, the API for saving and loading has changed to only use the + | four methods listed above consistently across objects and classes. For an + | overview of the changes, see #[+a("/docs/usage/v2#incompat") this table] + | and the notes on #[+a("/docs/usage/v2#migrating-saving-loading") migrating]. + ++h(3, "example-doc") Example: Saving and loading a document + +p + | For simplicity, let's assume you've + | #[+a("/docs/usage/entity-recognition#setting") added custom entities] to + | a #[code Doc], either manually, or by using a + | #[+a("/docs/usage/rule-based-matching#on_match") match pattern]. You can + | save it locally by calling #[+api("doc#to_disk") #[code Doc.to_disk()]], + | and load it again via #[+api("doc#from_disk") #[code Doc.from_disk()]]. + | This will overwrite the existing object and return it. + ++code. + import spacy + from spacy.tokens import Span + + text = u'Netflix is hiring a new VP of global policy' + + nlp = spacy.load('en') + doc = nlp(text) + assert len(doc.ents) == 0 # Doc has no entities + doc.ents += (Span(doc, 0, 1, label=doc.vocab.strings[u'ORG']),) # add entity + doc.to_disk('/path/to/doc') # save Doc to disk + + new_doc = nlp(text) + assert len(new_doc.ents) == 0 # new Doc has no entities + new_doc = new_doc.from_disk('/path/to/doc') # load from disk and overwrite + assert len(new_doc.ents) == 1 # entity is now recognised!
+ assert [(ent.text, ent.label_) for ent in new_doc.ents] == [(u'Netflix', u'ORG')] + ++h(2, "models") Saving models + p | After training your model, you'll usually want to save its state, and load | it back later. You can do this with the - | #[+api("language#save_to_directory") #[code Language.save_to_directory()]] + | #[+api("language#to_disk") #[code Language.to_disk()]] | method: +code. - nlp.save_to_directory('/home/me/data/en_example_model') + nlp.to_disk('/home/me/data/en_example_model') p | The directory will be created if it doesn't exist, and the whole pipeline | will be written out. To make the model more convenient to deploy, we | recommend wrapping it as a Python package. -+h(2, "generating") Generating a model package ++h(3, "models-generating") Generating a model package +infobox("Important note") | The model packages are #[strong not suitable] for the public | #[+a("https://pypi.python.org") pypi.python.org] directory, which is not | designed for binary data and files over 50 MB. However, if your company - | is running an internal installation of pypi, publishing your models on - | there can be a convenient solution to share them with your team. + | is running an #[strong internal installation] of PyPi, publishing your + | models on there can be a convenient way to share them with your team. p | spaCy comes with a handy CLI command that will create all required files, | and walk you through generating the meta data. You can also create the | meta.json manually and place it in the model data directory, or supply a - | path to it using the #[code --meta] flag. For more info on this, see the - | #[+a("/docs/usage/cli/#package") #[code package] command] documentation. + | path to it using the #[code --meta] flag. For more info on this, see + | the #[+api("cli#package") #[code package]] docs. +aside-code("meta.json", "json"). 
{ "name": "example_model", "lang": "en", "version": "1.0.0", - "spacy_version": ">=1.7.0,<2.0.0", + "spacy_version": ">=2.0.0,<3.0.0", "description": "Example model for spaCy", "author": "You", "email": "you@example.com", - "license": "CC BY-SA 3.0" + "license": "CC BY-SA 3.0", + "pipeline": ["token_vectors", "tagger"] } +code(false, "bash"). @@ -58,52 +100,112 @@ p This command will create a model package directory that should look like this: p | You can also find templates for all files in our - | #[+a(gh("spacy-dev-resouces", "templates/model")) spaCy dev resources]. + | #[+src(gh("spacy-dev-resources", "templates/model")) spaCy dev resources]. | If you're creating the package manually, keep in mind that the directories | need to be named according to the naming conventions of - | #[code [language]_[name]] and #[code [language]_[name]-[version]]. The - | #[code lang] setting in the meta.json is also used to create the - | respective #[code Language] class in spaCy, which will later be returned - | by the model's #[code load()] method. + | #[code lang_name] and #[code lang_name-version]. -+h(2, "building") Building a model package ++h(3, "models-custom") Customising the model setup + +p + | The meta.json includes the model details, like name, requirements and + | license, and lets you customise how the model should be initialised and + | loaded. You can define the language data to be loaded and the + | #[+a("/docs/usage/language-processing-pipeline") processing pipeline] to + | execute. + ++table(["Setting", "Type", "Description"]) + +row + +cell #[code lang] + +cell unicode + +cell ID of the language class to initialise. + + +row + +cell #[code pipeline] + +cell list + +cell + | A list of strings mapping to the IDs of pipeline factories to + | apply in that order. If not set, spaCy's + | #[+a("/docs/usage/language-processing-pipeline") default pipeline] + | will be used.
+ +p + | The #[code load()] method that comes with our model package + | templates will take care of putting all this together and returning a + | #[code Language] object with the loaded pipeline and data. If your model + | requires custom pipeline components, you should + | #[strong ship them with your model] and register their + | #[+a("/docs/usage/language-processing-pipeline#creating-factory") factories] + | via #[+api("spacy#set_factory") #[code set_factory()]]. + ++aside-code("Factory example"). + def my_factory(vocab): + # load some state + def my_component(doc): + # process the doc + return doc + return my_component + ++code. + spacy.set_factory('custom_component', custom_component_factory) + ++infobox("Custom models with pipeline components") + | For more details and an example of how to package a sentiment model + | with a custom pipeline component, see the usage workflow on + | #[+a("/docs/usage/language-processing-pipeline#example2") language processing pipelines]. + ++h(3, "models-building") Building the model package p | To build the package, run the following command from within the - | directory. This will create a #[code .tar.gz] archive in a directory - | #[code /dist]. + | directory. For more information on building Python packages, see the + | docs on Python's + | #[+a("https://setuptools.readthedocs.io/en/latest/") Setuptools]. +code(false, "bash"). python setup.py sdist p - | For more information on building Python packages, see the - | #[+a("https://setuptools.readthedocs.io/en/latest/") Python Setuptools documentation]. - - -+h(2, "loading") Loading a model package - -p - | Model packages can be installed by pointing pip to the model's - | #[code .tar.gz] archive: + | This will create a #[code .tar.gz] archive in a directory #[code /dist]. + | The model can be installed by pointing pip to the path of the archive: +code(false, "bash").
pip install /path/to/en_example_model-1.0.0.tar.gz -p You'll then be able to load the model as follows: +p + | You can then load the model via its name, #[code en_example_model], or + | import it directly as a module and then call its #[code load()] method. -+code. - import en_example_model - nlp = en_example_model.load() ++h(2, "loading") Loading a custom model package p - | To load the model via #[code spacy.load()], you can also - | create a #[+a("/docs/usage/models#usage") shortcut link] that maps the - | package name to a custom model name of your choice: - -+code(false, "bash"). - python -m spacy link en_example_model example + | To load a model from a data directory, you can use + | #[+api("spacy#load") #[code spacy.load()]] with the local path. This will + | look for a meta.json in the directory and use the #[code lang] and + | #[code pipeline] settings to initialise a #[code Language] class with a + | processing pipeline and load in the model data. +code. - import spacy - nlp = spacy.load('example') + nlp = spacy.load('/path/to/model') + +p + | If you want to #[strong load only the binary data], you'll have to create + | a #[code Language] class and call + | #[+api("language#from_disk") #[code from_disk]] instead. + ++code. + from spacy.lang.en import English + nlp = English().from_disk('/path/to/data') + ++infobox("Important note: Loading data in v2.x") + .o-block + | In spaCy 1.x, the distinction between #[code spacy.load()] and the + | #[code Language] class constructor was quite unclear. You could call + | #[code spacy.load()] when no model was present, and it would silently + | return an empty object. Likewise, you could pass a path to + | #[code English], even if the model required a different language. + | spaCy v2.0 solves this with a clear distinction between setting up + | the instance and loading the data.
+ + +code-new nlp = English().from_disk('/path/to/data') + +code-old nlp = spacy.load('en', path='/path/to/data') diff --git a/website/docs/usage/spacy-101.jade b/website/docs/usage/spacy-101.jade index daace114b..6a1f780dc 100644 --- a/website/docs/usage/spacy-101.jade +++ b/website/docs/usage/spacy-101.jade @@ -2,9 +2,256 @@ include ../../_includes/_mixins ++h(2, "features") Features + ++under-construction + ++aside + | If one of spaCy's functionalities #[strong needs a model], it means that + | you need to have one of the available + | #[+a("/docs/usage/models") statistical models] installed. Models are used + | to #[strong predict] linguistic annotations – for example, if a word is + | a verb or a noun. + ++table(["Name", "Description", "Needs model"]) + +row + +cell #[strong Tokenization] + +cell + +cell #[+procon("con")] + + +row + +cell #[strong Part-of-speech Tagging] + +cell + +cell #[+procon("pro")] + + +row + +cell #[strong Dependency Parsing] + +cell + +cell #[+procon("pro")] + + +row + +cell #[strong Sentence Boundary Detection] + +cell + +cell #[+procon("pro")] + + +row + +cell #[strong Named Entity Recognition] (NER) + +cell + +cell #[+procon("pro")] + + +row + +cell #[strong Rule-based Matching] + +cell + +cell #[+procon("con")] + + +row + +cell #[strong Similarity] + +cell + +cell #[+procon("pro")] + + +row + +cell #[strong Training] + +cell + +cell #[+procon("neutral")] + + +row + +cell #[strong Serialization] + +cell + +cell #[+procon("neutral")] + ++h(2, "annotations") Linguistic annotations + +p + | spaCy provides a variety of linguistic annotations to give you insights + | into a text's grammatical structure. This includes the word types, + | i.e. the parts of speech, and how the words are related to each other.
+ | For example, if you're analysing text, it makes a huge difference + | whether a noun is the subject of a sentence, or the object – or whether + | "google" is used as a verb, or refers to the website or company in a + | specific context. + +p + | Once you've downloaded and installed a #[+a("/docs/usage/models") model], + | you can load it via #[+api("spacy#load") #[code spacy.load()]]. This will + | return a #[code Language] object containing all components and data needed + | to process text. We usually call it #[code nlp]. Calling the #[code nlp] + | object on a string of text will return a processed #[code Doc]: + ++code. + import spacy + + nlp = spacy.load('en') + doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion') + +p + | Even though a #[code Doc] is processed – e.g. split into individual words + | and annotated – it still holds #[strong all information of the original text], + | like whitespace characters. This way, you'll never lose any information + | when processing text with spaCy. + ++h(3, "annotations-token") Tokenization + +include _spacy-101/_tokenization + ++infobox + | To learn more about how spaCy's tokenization rules work in detail, + | how to #[strong customise and replace] the default tokenizer and how to + | #[strong add language-specific data], see the usage guides on + | #[+a("/docs/usage/adding-languages") adding languages] and + | #[+a("/docs/usage/customizing-tokenizer") customising the tokenizer]. + ++h(3, "annotations-pos-deps") Part-of-speech tags and dependencies + +tag-model("dependency parse") + +include _spacy-101/_pos-deps + ++infobox + | To learn more about #[strong part-of-speech tagging] and rule-based + | morphology, and how to #[strong navigate and use the parse tree] + | effectively, see the usage guides on + | #[+a("/docs/usage/pos-tagging") part-of-speech tagging] and + | #[+a("/docs/usage/dependency-parse") using the dependency parse].
+ ++h(3, "annotations-ner") Named Entities + +tag-model("named entities") + +include _spacy-101/_named-entities + ++infobox + | To learn more about entity recognition in spaCy, how to + | #[strong add your own entities] to a document and how to + | #[strong train and update] the entity predictions of a model, see the + | usage guides on + | #[+a("/docs/usage/entity-recognition") named entity recognition] and + | #[+a("/docs/usage/training-ner") training the named entity recognizer]. + ++h(2, "vectors-similarity") Word vectors and similarity + +tag-model("vectors") + +include _spacy-101/_similarity + +include _spacy-101/_word-vectors + ++infobox + | To learn more about word vectors, how to #[strong customise them] and + | how to load #[strong your own vectors] into spaCy, see the usage + | guide on + | #[+a("/docs/usage/word-vectors-similarities") using word vectors and semantic similarities]. + ++h(2, "pipelines") Pipelines + +include _spacy-101/_pipelines + ++infobox + | To learn more about #[strong how processing pipelines work] in detail, + | how to enable and disable their components, and how to + | #[strong create your own], see the usage guide on + | #[+a("/docs/usage/language-processing-pipeline") language processing pipelines]. + ++h(2, "vocab-stringstore") Vocab, lexemes and the string store + +include _spacy-101/_vocab-stringstore + ++h(2, "serialization") Serialization + +include _spacy-101/_serialization + ++infobox + | To learn more about #[strong serialization] and how to + | #[strong save and load your own models], see the usage guide on + | #[+a("/docs/usage/saving-loading") saving, loading and data serialization]. 
+ ++h(2, "training") Training + +include _spacy-101/_training + +h(2, "architecture") Architecture ++under-construction + +image include ../../assets/img/docs/architecture.svg .u-text-right +button("/assets/img/docs/architecture.svg", false, "secondary").u-text-tag View large graphic + ++table(["Name", "Description"]) + +row + +cell #[+api("language") #[code Language]] + +cell + | A text-processing pipeline. Usually you'll load this once per + | process as #[code nlp] and pass the instance around your application. + + +row + +cell #[+api("doc") #[code Doc]] + +cell A container for accessing linguistic annotations. + + +row + +cell #[+api("span") #[code Span]] + +cell A slice from a #[code Doc] object. + + +row + +cell #[+api("token") #[code Token]] + +cell + | An individual token — i.e. a word, punctuation symbol, whitespace, + | etc. + + +row + +cell #[+api("lexeme") #[code Lexeme]] + +cell + | An entry in the vocabulary. It's a word type with no context, as + | opposed to a word token. It therefore has no part-of-speech tag, + | dependency parse etc. + + +row + +cell #[+api("vocab") #[code Vocab]] + +cell + | A lookup table for the vocabulary that allows you to access + | #[code Lexeme] objects. + + +row + +cell #[code Morphology] + +cell + + +row + +cell #[+api("stringstore") #[code StringStore]] + +cell Map strings to and from integer IDs. + + +row + +row + +cell #[+api("tokenizer") #[code Tokenizer]] + +cell + | Segment text, and create #[code Doc] objects with the discovered + | segment boundaries. + + +row + +cell #[+api("tagger") #[code Tagger]] + +cell Annotate part-of-speech tags on #[code Doc] objects. + + +row + +cell #[+api("dependencyparser") #[code DependencyParser]] + +cell Annotate syntactic dependencies on #[code Doc] objects. + + +row + +cell #[+api("entityrecognizer") #[code EntityRecognizer]] + +cell + | Annotate named entities, e.g. persons or products, on #[code Doc] + | objects. 
+ + +row + +cell #[+api("matcher") #[code Matcher]] + +cell + | Match sequences of tokens, based on pattern rules, similar to + | regular expressions. + ++h(3, "architecture-other") Other + ++table(["Name", "Description"]) + +row + +cell #[+api("goldparse") #[code GoldParse]] + +cell Collection for training annotations. + + +row + +cell #[+api("goldcorpus") #[code GoldCorpus]] + +cell + | An annotated corpus, using the JSON file format. Manages + | annotations for tagging, dependency parsing and NER. diff --git a/website/docs/usage/training-ner.jade b/website/docs/usage/training-ner.jade index 78eb4905e..4faa47675 100644 --- a/website/docs/usage/training-ner.jade +++ b/website/docs/usage/training-ner.jade @@ -64,44 +64,10 @@ p | predicts the new category with minimal difference from the previous | output. -+h(2, "saving-loading") Saving and loading - -p - | After training our model, you'll usually want to save its state, and load - | it back later. You can do this with the #[code Language.save_to_directory()] - | method: - -+code. - nlp.save_to_directory('/home/me/data/en_technology') - -p - | To make the model more convenient to deploy, we recommend wrapping it as - | a Python package, so that you can install it via pip and load it as a - | module. spaCy comes with a handy #[+a("/docs/usage/cli#package") CLI command] - | to create all required files and directories. - -+code(false, "bash"). - python -m spacy package /home/me/data/en_technology /home/me/my_models - -p - | To build the package and create a #[code .tar.gz] archive, run - | #[code python setup.py sdist] from within its directory. - -+infobox("Saving and loading models") - | For more information and a detailed guide on how to package your model, - | see the documentation on - | #[+a("/docs/usage/saving-loading") saving and loading models]. - -p - | After you've generated and installed the package, you'll be able to - | load the model as follows: - -+code. 
- import en_technology - nlp = en_technology.load() - +h(2, "example") Example: Adding and training an #[code ANIMAL] entity ++under-construction + p | This script shows how to add a new entity type to an existing pre-trained | NER model. To keep the example short and simple, only four sentences are @@ -170,5 +136,33 @@ p p | After training your model, you can - | #[+a("/docs/usage/saving-loading") save it to a directory]. We recommend wrapping - | models as Python packages, for ease of deployment. + | #[+a("/docs/usage/saving-loading") save it to a directory]. We recommend + | wrapping models as Python packages, for ease of deployment. + ++h(2, "saving-loading") Saving and loading + +p + | After training our model, you'll usually want to save its state, and load + | it back later. You can do this with the + | #[+api("language#to_disk") #[code Language.to_disk()]] method: + ++code. + nlp.to_disk('/home/me/data/en_technology') + +p + | To make the model more convenient to deploy, we recommend wrapping it as + | a Python package, so that you can install it via pip and load it as a + | module. spaCy comes with a handy #[+api("cli#package") #[code package]] + | CLI command to create all required files and directories. + ++code(false, "bash"). + python -m spacy package /home/me/data/en_technology /home/me/my_models + +p + | To build the package and create a #[code .tar.gz] archive, run + | #[code python setup.py sdist] from within its directory. + ++infobox("Saving and loading models") + | For more information and a detailed guide on how to package your model, + | see the documentation on + | #[+a("/docs/usage/saving-loading#models") saving and loading models]. diff --git a/website/docs/usage/training.jade b/website/docs/usage/training.jade index 8a5c111bd..6c6c17e17 100644 --- a/website/docs/usage/training.jade +++ b/website/docs/usage/training.jade @@ -6,6 +6,10 @@ p | Once the model is trained, you can then | #[+a("/docs/usage/saving-loading") save and load] it. 
++h(2, "101") Training 101 + +include _spacy-101/_training + +h(2, "train-pos-tagger") Training the part-of-speech tagger +code. @@ -77,59 +81,3 @@ p.o-inline-list p +button(gh("spaCy", "examples/training/train_parser.py"), false, "secondary") Full example - -+h(2, "feature-templates") Customizing the feature extraction - -p - | spaCy currently uses linear models for the tagger, parser and entity - | recognizer, with weights learned using the - | #[+a("https://explosion.ai/blog/part-of-speech-pos-tagger-in-python") Averaged Perceptron algorithm]. - -+aside("Linear Model Feature Scheme") - | For a list of the available feature atoms, see the #[+a("/docs/api/features") Linear Model Feature Scheme]. - -p - | Because it's a linear model, it's important for accuracy to build - | conjunction features out of the atomic predictors. Let's say you have - | two atomic predictors asking, "What is the part-of-speech of the - | previous token?", and "What is the part-of-speech of the previous - | previous token?". These predictors will introduce a number of features, - | e.g. #[code Prev-pos=NN], #[code Prev-pos=VBZ], etc. A conjunction - | template introduces features such as #[code Prev-pos=NN&Prev-pos=VBZ]. - -p - | The feature extraction proceeds in two passes. In the first pass, we - | fill an array with the values of all of the atomic predictors. In the - | second pass, we iterate over the feature templates, and fill a small - | temporary array with the predictors that will be combined into a - | conjunction feature. Finally, we hash this array into a 64-bit integer, - | using the MurmurHash algorithm. You can see this at work in the - | #[+a(gh("thinc", "thinc/linear/features.pyx", "94dbe06fd3c8f24d86ab0f5c7984e52dbfcdc6cb")) #[code thinc.linear.features]] module. - -p - | It's very easy to change the feature templates, to create novel - | combinations of the existing atomic predictors. There's currently no API - | available to add new atomic predictors, though. 
You'll have to create a - | subclass of the model, and write your own #[code set_featuresC] method. - -p - | The feature templates are passed in using the #[code features] keyword - | argument to the constructors of the #[+api("tagger") #[code Tagger]], - | #[+api("dependencyparser") #[code DependencyParser]] and - | #[+api("entityrecognizer") #[code EntityRecognizer]]: - -+code. - from spacy.vocab import Vocab - from spacy.pipeline import Tagger - from spacy.tagger import P2_orth, P1_orth - from spacy.tagger import P2_cluster, P1_cluster, W_orth, N1_orth, N2_orth - - vocab = Vocab(tag_map={'N': {'pos': 'NOUN'}, 'V': {'pos': 'VERB'}}) - tagger = Tagger(vocab, features=[(P2_orth, P2_cluster), (P1_orth, P1_cluster), - (P2_orth,), (P1_orth,), (W_orth,), - (N1_orth,), (N2_orth,)]) - -p - | Custom feature templates can be passed to the #[code DependencyParser] - | and #[code EntityRecognizer] as well, also using the #[code features] - | keyword argument of the constructor. diff --git a/website/docs/usage/troubleshooting.jade b/website/docs/usage/troubleshooting.jade deleted file mode 100644 index 501a250c8..000000000 --- a/website/docs/usage/troubleshooting.jade +++ /dev/null @@ -1,190 +0,0 @@ -//- 💫 DOCS > USAGE > TROUBLESHOOTING - -include ../../_includes/_mixins - -p - | This section collects some of the most common errors you may come - | across when installing, loading and using spaCy, as well as their solutions. - -+aside("Help us improve this guide") - | Did you come across a problem like the ones listed here and want to - | share the solution? You can find the "Suggest edits" button at the - | bottom of this page that points you to the source. We always - | appreciate #[+a(gh("spaCy") + "/pulls") pull requests]! - -+h(2, "install-loading") Installation and loading - -+h(3, "compatible-model") No compatible model found - -+code(false, "text"). - No compatible model found for [lang] (spaCy v#{SPACY_VERSION}). 
- -p - | This usually means that the model you're trying to download does not - | exist, or isn't available for your version of spaCy. - -+infobox("Solutions") - | Check the #[+a(gh("spacy-models", "compatibility.json")) compatibility table] - | to see which models are available for your spaCy version. If you're using - | an old version, consider upgrading to the latest release. Note that while - | spaCy supports tokenization for - | #[+a("/docs/api/language-models/#alpha-support") a variety of languages], - | not all of them come with statistical models. To only use the tokenizer, - | import the language's #[code Language] class instead, for example - | #[code from spacy.fr import French]. - -+h(3, "symlink-privilege") Symbolic link privilege not held - -+code(false, "text"). - OSError: symbolic link privilege not held - -p - | To create #[+a("/docs/usage/models/#usage") shortcut links] that let you - | load models by name, spaCy creates a symbolic link in the - | #[code spacy/data] directory. This means your user needs permission to do - | this. The above error mostly occurs when doing a system-wide installation, - | which will create the symlinks in a system directory. - -+infobox("Solutions") - | Run the #[code download] or #[code link] command as administrator, - | or use a #[code virtualenv] to install spaCy in a user directory, instead - | of doing a system-wide installation. - -+h(3, "no-cache-dir") No such option: --no-cache-dir - -+code(false, "text"). - no such option: --no-cache-dir - -p - | The #[code download] command uses pip to install the models and sets the - | #[code --no-cache-dir] flag to prevent it from requiring too much memory. - | #[+a("https://pip.pypa.io/en/stable/reference/pip_install/#caching") This setting] - | requires pip v6.0 or newer. - -+infobox("Solution") - | Run #[code pip install -U pip] to upgrade to the latest version of pip. - | To see which version you have installed, run #[code pip --version]. 
- -+h(3, "import-error") Import error - -+code(false, "text"). - Import Error: No module named spacy - -p - | This error means that the spaCy module can't be located on your system, or in - | your environment. - -+infobox("Solutions") - | Make sure you have spaCy installed. If you're using a #[code virtualenv], - | make sure it's activated and check that spaCy is installed in that - | environment – otherwise, you're trying to load a system installation. You - | can also run #[code which python] to find out where your Python - | executable is located. - -+h(3, "import-error-models") Import error: models - -+code(false, "text"). - ImportError: No module named 'en_core_web_sm' - -p - | As of spaCy v1.7, all models can be installed as Python packages. This means - | that they'll become importable modules of your application. When creating - | #[+a("/docs/usage/models/#usage") shortcut links], spaCy will also try - | to import the model to load its meta data. If this fails, it's usually a - | sign that the package is not installed in the current environment. - -+infobox("Solutions") - | Run #[code pip list] or #[code pip freeze] to check which model packages - | you have installed, and install the - | #[+a("/docs/usage/models#available") correct models] if necessary. If you're - | importing a model manually at the top of a file, make sure to use the name - | of the package, not the shortcut link you've created. - -+h(3, "vocab-strings") File not found: vocab/strings.json - -+code(false, "text"). - FileNotFoundError: No such file or directory: [...]/vocab/strings.json - -p - | This error may occur when using #[code spacy.load()] to load - | a language model – either because you haven't set up a - | #[+a("/docs/usage/models/#usage") shortcut link] for it, or because it - | doesn't actually exist. - -+infobox("Solutions") - | Set up a #[+a("/docs/usage/models/#usage") shortcut link] for the model - | you want to load. 
This can either be an installed model package, or a - | local directory containing the model data. If you want to use one of the - | #[+a("/docs/api/language-models/#alpha-support") alpha tokenizers] for - | languages that don't yet have a statistical model, you should import its - | #[code Language] class instead, for example - | #[code from spacy.fr import French]. - -+h(3, "command-not-found") Command not found - -+code(false, "text"). - command not found: spacy - -p - | This error may occur when running the #[code spacy] command from the - | command line. spaCy does not currently add an entry to our #[code PATH] - | environment variable, as this can lead to unexpected results, especially - | when using #[code virtualenv]. Instead, commands need to be prefixed with - | #[code python -m]. - -+infobox("Solution") - | Run the command with #[code python -m], for example - | #[code python -m spacy download en]. For more info on this, see the - | #[+a("/docs/usage/cli") CLI documentation]. - -+h(3, "module-load") 'module' object has no attribute 'load' - -+code(false, "text"). - AttributeError: 'module' object has no attribute 'load' - -p - | While this could technically have many causes, including spaCy being - | broken, the most likely one is that your script's file or directory name - | is "shadowing" the module – e.g. your file is called #[code spacy.py], - | or a directory you're importing from is called #[code spacy]. - -+infobox("Solution") - | When using spaCy, never call anything else #[code spacy]. - -+h(2, "usage") Using spaCy - -+h(3, "pos-lemma-number") POS tag or lemma is returned as number - -+code. - doc = nlp(u'This is text.') - print([word.pos for word in doc]) - # [88, 98, 90, 95] - -p - | Like many NLP libraries, spaCy encodes all strings to integers. This - | reduces memory usage and improves efficiency. The integer mapping also - | makes it easy to interoperate with numpy. 
To access the string - | representation instead of the integer ID, add an underscore #[code _] - | after the attribute. - -+infobox("Solutions") - | Use #[code pos_] or #[code lemma_] instead. See the - | #[+api("token#attributes") #[code Token] attributes] for a list of available - | attributes and their string representations. - - -+h(3, "pron-lemma") Pronoun lemma is returned as #[code -PRON-] - -+code. - doc = nlp(u'They are') - print(doc[0].lemma_) - # -PRON- - -p - | This is in fact expected behaviour and not a bug. - | Unlike verbs and common nouns, there's no clear base form of a personal - | pronoun. Should the lemma of "me" be "I", or should we normalize person - | as well, giving "it" — or maybe "he"? spaCy's solution is to introduce a - | novel symbol, #[code -PRON-], which is used as the lemma for - | all personal pronouns. For more info on this, see the - | #[+api("annotation#lemmatization") annotation specs] on lemmatization. diff --git a/website/docs/usage/v2.jade b/website/docs/usage/v2.jade index 8faae9d32..25aae8706 100644 --- a/website/docs/usage/v2.jade +++ b/website/docs/usage/v2.jade @@ -8,6 +8,65 @@ p +h(2, "features") New features ++h(3, "features-pipelines") Improved processing pipelines + ++aside-code("Example"). + # Modify an existing pipeline + nlp = spacy.load('en') + nlp.pipeline.append(my_component) + + # Register a factory to create a component + spacy.set_factory('my_factory', my_factory) + nlp = Language(pipeline=['my_factory', mycomponent]) + +p + | It's now much easier to #[strong customise the pipeline] with your own + | components, functions that receive a #[code Doc] object, modify and + | return it. If your component is stateful, you can define and register a + | factory which receives the shared #[code Vocab] object and returns a + |  component. spaCy's default components can be added to your pipeline by + | using their string IDs. 
This way, you won't have to worry about finding + | and implementing them – simply add #[code "tagger"] to the pipeline, + | and spaCy will know what to do. + ++image + include ../../assets/img/docs/pipeline.svg + ++infobox + | #[strong API:] #[+api("language") #[code Language]] + | #[strong Usage:] #[+a("/docs/usage/language-processing-pipeline") Processing text] + ++h(3, "features-serializer") Saving, loading and serialization + ++aside-code("Example"). + nlp = spacy.load('en') # shortcut link + nlp = spacy.load('en_core_web_sm') # package + nlp = spacy.load('/path/to/en') # unicode path + nlp = spacy.load(Path('/path/to/en')) # pathlib Path + + nlp.to_disk('/path/to/nlp') + nlp = English().from_disk('/path/to/nlp') + +p + | spaCy's serialization API has been made consistent across classes and + | objects. All container classes, i.e. #[code Language], #[code Doc], + | #[code Vocab] and #[code StringStore] now have a #[code to_bytes()], + | #[code from_bytes()], #[code to_disk()] and #[code from_disk()] method + | that supports the Pickle protocol. + +p + | The improved #[code spacy.load] makes loading models easier and more + | transparent. You can load a model by supplying its + | #[+a("/docs/usage/models#usage") shortcut link], the name of an installed + | #[+a("/docs/usage/saving-loading#generating") model package] or a path. + | The #[code Language] class to initialise will be determined based on the + | model's settings. For a blank language, you can import the class directly, + | e.g. #[code from spacy.lang.en import English]. + ++infobox + | #[strong API:] #[+api("spacy#load") #[code spacy.load]], #[+api("binder") #[code Binder]] + | #[strong Usage:] #[+a("/docs/usage/saving-loading") Saving and loading] + +h(3, "features-displacy") displaCy visualizer with Jupyter support +aside-code("Example").
@@ -28,78 +87,32 @@ p | #[strong API:] #[+api("displacy") #[code displacy]] | #[strong Usage:] #[+a("/docs/usage/visualizers") Visualizing spaCy] -+h(3, "features-loading") Loading - -+aside-code("Example"). - nlp = spacy.load('en') # shortcut link - nlp = spacy.load('en_core_web_sm') # package - nlp = spacy.load('/path/to/en') # unicode path - nlp = spacy.load(Path('/path/to/en')) # pathlib Path ++h(3, "features-language") Improved language data and lazy loading p - | The improved #[code spacy.load] makes loading models easier and more - | transparent. You can load a model by supplying its - | #[+a("/docs/usage/models#usage") shortcut link], the name of an installed - | #[+a("/docs/usage/saving-loading#generating") model package], a unicode - | path or a #[code Path]-like object. spaCy will try resolving the load - | argument in this order. The #[code path] keyword argument is now deprecated. + | Language-specific data now lives in its own submodule, #[code spacy.lang]. + | Languages are lazy-loaded, i.e. only loaded when you import a + | #[code Language] class, or load a model that initialises one. This allows + | languages to contain more custom data, e.g. lemmatizer lookup tables, or + | complex regular expressions. The language data has also been tidied up + | and simplified. spaCy now also supports simple lookup-based lemmatization. -p - | The #[code Language] class to initialise will be determined based on the - | model's settings. If no model is found, spaCy will let you know and won't - | just return an empty #[code Language] object anymore. If you want a blank - | language, you can always import the class directly, e.g. - | #[code from spacy.lang.en import English]. - -+infobox - | #[strong API:] #[+api("spacy#load") #[code spacy.load]] - | #[strong Usage:] #[+a("/docs/usage/saving-loading") Saving and loading] - -+h(3, "features-language") Improved language data and processing pipelines - -+aside-code("Example").
- from spacy.language import Language - nlp = Language(pipeline=['token_vectors', 'tags', - 'dependencies']) - -+infobox - | #[strong API:] #[+api("language") #[code Language]] - | #[strong Usage:] #[+a("/docs/usage/adding-languages") Adding languages] - -+h(3, "features-lemmatizer") Simple lookup-based lemmatization - -+aside-code("Example"). - LOOKUP = { - "aba": "abar", - "ababa": "abar", - "ababais": "abar", - "ababan": "abar", - "ababanes": "ababán" - } - -p - | spaCy now supports simple lookup-based lemmatization. The data is stored - | in a dictionary mapping a string to its lemma. To determine a token's - | lemma, spaCy simply looks it up in the table. The lookup lemmatizer can - | be imported from #[code spacy.lemmatizerlookup]. It's initialised with - | the lookup table, and should be returned by the #[code create_lemmatizer] - | classmethod of the language's defaults. ++image + include ../../assets/img/docs/language_data.svg +infobox | #[strong API:] #[+api("language") #[code Language]] + | #[strong Code:] #[+src(gh("spaCy", "spacy/lang")) spacy/lang] | #[strong Usage:] #[+a("/docs/usage/adding-languages") Adding languages] +h(3, "features-matcher") Revised matcher API +aside-code("Example"). 
from spacy.matcher import Matcher - from spacy.attrs import LOWER, IS_PUNCT matcher = Matcher(nlp.vocab) - matcher.add('HelloWorld', on_match=None, - [{LOWER: 'hello'}, {IS_PUNCT: True}, {LOWER: 'world'}], - [{LOWER: 'hello'}, {LOWER: 'world'}]) + matcher.add('HEARTS', None, [{'ORTH': '❤️', 'OP': '+'}]) assert len(matcher) == 1 - assert 'HelloWorld' in matcher + assert 'HEARTS' in matcher p | Patterns can now be added to the matcher by calling @@ -113,12 +126,6 @@ p | #[strong API:] #[+api("matcher") #[code Matcher]] | #[strong Usage:] #[+a("/docs/usage/rule-based-matching") Rule-based matching] -+h(3, "features-serializer") Serialization - -+infobox - | #[strong API:] #[+api("serializer") #[code Serializer]] - | #[strong Usage:] #[+a("/docs/usage/saving-loading") Saving and loading] - +h(3, "features-models") Neural network models for English, German, French and Spanish +infobox @@ -128,33 +135,25 @@ p +h(2, "incompat") Backwards incompatibilities +table(["Old", "New"]) + +row + +cell + | #[code spacy.en] + | #[code spacy.xx] + +cell + | #[code spacy.lang.en] + | #[code spacy.lang.xx] + + +row + +cell #[code spacy.orth] + +cell #[code spacy.lang.xx.lex_attrs] + +row +cell #[code Language.save_to_directory] +cell #[+api("language#to_disk") #[code Language.to_disk]] +row - +cell #[code Tokenizer.load] - +cell - | #[+api("tokenizer#from_disk") #[code Tokenizer.from_disk]] - | #[+api("tokenizer#from_bytes") #[code Tokenizer.from_bytes]] - - +row - +cell #[code Tagger.load] - +cell - | #[+api("tagger#from_disk") #[code Tagger.from_disk]] - | #[+api("tagger#from_bytes") #[code Tagger.from_bytes]] - - +row - +cell #[code DependencyParser.load] - +cell - | #[+api("dependencyparser#from_disk") #[code DependencyParser.from_disk]] - | #[+api("dependencyparser#from_bytes") #[code DependencyParser.from_bytes]] - - +row - +cell #[code EntityRecognizer.load] - +cell - | #[+api("entityrecognizer#from_disk") #[code EntityRecognizer.from_disk]] - | 
#[+api("entityrecognizer#from_bytes") #[code EntityRecognizer.from_bytes]] + +cell #[code Language.create_make_doc] + +cell #[+api("language#attributes") #[code Language.tokenizer]] +row +cell @@ -188,6 +187,28 @@ p | #[+api("stringstore#to_disk") #[code StringStore.to_disk]] | #[+api("stringstore#to_bytes") #[code StringStore.to_bytes]] + +row + +cell #[code Tokenizer.load] + +cell - + + +row + +cell #[code Tagger.load] + +cell + | #[+api("tagger#from_disk") #[code Tagger.from_disk]] + | #[+api("tagger#from_bytes") #[code Tagger.from_bytes]] + + +row + +cell #[code DependencyParser.load] + +cell + | #[+api("dependencyparser#from_disk") #[code DependencyParser.from_disk]] + | #[+api("dependencyparser#from_bytes") #[code DependencyParser.from_bytes]] + + +row + +cell #[code EntityRecognizer.load] + +cell + | #[+api("entityrecognizer#from_disk") #[code EntityRecognizer.from_disk]] + | #[+api("entityrecognizer#from_bytes") #[code EntityRecognizer.from_bytes]] + +row +cell #[code Matcher.load] +cell - @@ -208,12 +229,100 @@ p +row +cell #[code Doc.read_bytes] - +cell + +cell #[+api("binder") #[code Binder]] +row +cell #[code Token.is_ancestor_of] +cell #[+api("token#is_ancestor") #[code Token.is_ancestor]] - - +h(2, "migrating") Migrating from spaCy 1.x + ++list + +item Saving, loading and serialization. + +item Processing pipelines and language data. + +item Adding patterns and callbacks to the matcher. + +item Models trained with spaCy 1.x. + ++infobox("Some tips") + | Before migrating, we strongly recommend writing a few + | #[strong simple tests] specific to how you're using spaCy in your + | application. This makes it easier to check whether your code requires + | changes, and if so, which parts are affected. + | (By the way, feel free to contribute your tests to + | #[+src(gh("spaCy", "spacy/tests")) our test suite] – this will also ensure + | we never accidentally introduce a bug in a workflow that's + | important to you.)
If you've trained your own models, keep in mind that + | your train and runtime inputs must match. This means you'll have to + | #[strong retrain your models] with spaCy v2.0 to make them compatible. + + ++h(3, "migrating-saving-loading") Saving, loading and serialization + +p + | Double-check all calls to #[code spacy.load()] and make sure they don't + | use the #[code path] keyword argument. If you're only loading in binary + | data and not a model package that can construct its own #[code Language] + | class and pipeline, you should now use the + | #[+api("language#from_disk") #[code Language.from_disk()]] method. + ++code-new. + nlp = spacy.load('/model') + nlp = English().from_disk('/model/data') ++code-old nlp = spacy.load('en', path='/model') + +p + | Review all other code that writes state to disk or bytes. + | All containers now share the same consistent API for saving and + | loading. Replace saving with #[code to_disk()] or #[code to_bytes()], and + | loading with #[code from_disk()] or #[code from_bytes()]. + ++code-new. + nlp.to_disk('/model') + nlp.vocab.to_disk('/vocab') + ++code-old. + nlp.save_to_directory('/model') + nlp.vocab.dump('/vocab') + ++h(3, "migrating-languages") Processing pipelines and language data + +p + | If you're importing language data or #[code Language] classes, make sure + | to change your import statements to import from #[code spacy.lang]. If + | you've added your own custom language, it needs to be moved to + | #[code spacy/lang/xx] and adjusted accordingly. + ++code-new from spacy.lang.en import English ++code-old from spacy.en import English + +p + | If you've been using custom pipeline components, check out the new + | guide on #[+a("/docs/usage/language-processing-pipelines") processing pipelines]. + | Appending functions to the pipeline still works – but you might be able + | to make this more convenient by registering "component factories".
+ | Components of the processing pipeline can now be disabled by passing a + | list of their names to the #[code disable] keyword argument on loading + | or processing. + ++code-new. + nlp = spacy.load('en', disable=['tagger', 'ner']) + doc = nlp(u"I don't want parsed", disable=['parser']) ++code-old. + nlp = spacy.load('en', tagger=False, entity=False) + doc = nlp(u"I don't want parsed", parse=False) + ++h(3, "migrating-matcher") Adding patterns and callbacks to the matcher + +p + | If you're using the matcher, you can now add patterns in one step. This + | should be easy to update – simply merge the ID, callback and patterns + | into one call to #[+api("matcher#add") #[code matcher.add]]. + ++code-new. + matcher.add('GoogleNow', merge_phrases, [{ORTH: 'Google'}, {ORTH: 'Now'}]) + ++code-old. + matcher.add_entity('GoogleNow', on_match=merge_phrases) + matcher.add_pattern('GoogleNow', [{ORTH: 'Google'}, {ORTH: 'Now'}]) + ++h(3, "migrating-models") Trained models diff --git a/website/docs/usage/visualizers.jade b/website/docs/usage/visualizers.jade index 93a4b5567..186fc5db3 100644 --- a/website/docs/usage/visualizers.jade +++ b/website/docs/usage/visualizers.jade @@ -180,8 +180,8 @@ p p | If you don't need the web server and just want to generate the markup | – for example, to export it to a file or serve it in a custom - | way – you can use #[+api("displacy#render") #[code displacy.render]] - | instead. It works the same, but returns a string containing the markup. + | way – you can use #[+api("displacy#render") #[code displacy.render]]. + | It works the same way, but returns a string containing the markup. +code("Example"). import spacy @@ -220,10 +220,32 @@ p | a standalone graphic.) So instead of rendering all #[code Doc]s at one, | loop over them and export them separately. + ++h(3, "examples-export-svg") Example: Export SVG graphics of dependency parses + ++code("Example"). 
+ import spacy + from spacy import displacy + from pathlib import Path + + nlp = spacy.load('en') + sentences = ["This is an example.", "This is another one."] + for sent in sentences: + doc = nlp(sent) + svg = displacy.render(doc, style='dep') + file_name = '-'.join([w.text for w in doc if not w.is_punct]) + '.svg' + output_path = Path('/images/' + file_name) + output_path.open('w', encoding='utf-8').write(svg) + +p + | The above code will generate the dependency visualizations and write them to + | two files, #[code This-is-an-example.svg] and #[code This-is-another-one.svg]. + + +h(2, "jupyter") Using displaCy in Jupyter notebooks p - | displaCy is able to detect whether you're within a + | displaCy is able to detect whether you're working in a | #[+a("https://jupyter.org") Jupyter] notebook, and will return markup | that can be rendered in a cell straight away. When you export your | notebook, the visualizations will be included as HTML. @@ -257,28 +279,6 @@ p html = displacy.render(doc, style='dep') return display(HTML(html)) -+h(2, "examples") Usage examples - -+h(3, "examples-export-svg") Export SVG graphics of dependency parses - -+code("Example"). - import spacy - from spacy import displacy - from pathlib import Path - - nlp = spacy.load('en') - sentences = ["This is an example.", "This is another one."] - for sent in sentences: - doc = nlp(sentence) - svg = displacy.render(doc, style='dep') - file_name = '-'.join([w.text for w in doc if not w.is_punct]) + '.svg' - output_path = Path('/images/' + file_name) - output_path.open('w', encoding='utf-8').write(svg) - -p - | The above code will generate the dependency visualizations and them to - | two files, #[code This-is-an-example.svg] and #[code This-is-another-one.svg].
- +h(2, "manual-usage") Rendering data manually p @@ -314,3 +314,62 @@ p 'text': 'But Google is starting from behind.', 'ents': [{'start': 4, 'end': 10, 'label': 'ORG'}], 'title': None + } + ++h(2, "webapp") Using displaCy in a web application + +p + | If you want to use the visualizers as part of a web application, for + | example to create something like our + | #[+a(DEMOS_URL + "/displacy") online demo], it's not recommended to + | simply wrap and serve the displaCy renderer. Instead, you should only + | rely on the server to perform spaCy's processing capabilities, and use + | #[+a(gh("displacy")) displaCy.js] to render the JSON-formatted output. + ++aside("Why not return the HTML from the server?") + | It's certainly possible to just have your server return the markup. + | But outputting raw, unsanitised HTML is risky and makes your app vulnerable to + | #[+a("https://en.wikipedia.org/wiki/Cross-site_scripting") cross-site scripting] + | (XSS). All your user needs to do is find a way to make spaCy return one + | token #[code <script src="malicious-code.js"></script>]. + | Instead of relying on the server to render and sanitize HTML, you + | can do this on the client in JavaScript. displaCy.js creates + | the markup as DOM nodes and will never insert raw HTML. + +p + | The #[code parse_deps] function takes a #[code Doc] object and returns + | a dictionary in a format that can be rendered by displaCy. + ++code("Example"). + import spacy + from spacy import displacy + + nlp = spacy.load('en') + + def displacy_service(text): + doc = nlp(text) + return displacy.parse_deps(doc) + +p + | Using a library like #[+a("https://falconframework.org/") Falcon] or + | #[+a("http://www.hug.rest/") Hug], you can easily turn the above code + | into a simple REST API that receives a text and returns a JSON-formatted + | parse.
In your front-end, include #[+a(gh("displacy")) displacy.js] and + | initialise it with the API URL and the ID or query selector of the + | container to render the visualisation in, e.g. #[code '#displacy'] for + | #[code <div id="displacy">]. + ++code("script.js", "javascript"). + var displacy = new displaCy('http://localhost:8080', { + container: '#displacy' + }) + + function parse(text) { + displacy.parse(text); + } + +p + | When you call #[code parse()], it will make a request to your API, + | receive the JSON-formatted parse and render it in your container. To + | create an interactive experience, you could trigger this function by + | a button and read the text from an #[code <input>] field. diff --git a/website/docs/usage/word-vectors-similarities.jade b/website/docs/usage/word-vectors-similarities.jade index 3cc0a67a8..e5935cfb6 100644 --- a/website/docs/usage/word-vectors-similarities.jade +++ b/website/docs/usage/word-vectors-similarities.jade @@ -6,61 +6,37 @@ p | Dense, real valued vectors representing distributional similarity | information are now a cornerstone of practical NLP. The most common way | to train these vectors is the #[+a("https://en.wikipedia.org/wiki/Word2vec") word2vec] - | family of algorithms. - -+aside("Tip") - | If you need to train a word2vec model, we recommend the implementation in - | the Python library #[+a("https://radimrehurek.com/gensim/") Gensim]. - -p - | spaCy makes using word vectors very easy. The - | #[+api("lexeme") #[code Lexeme]], #[+api("token") #[code Token]], - | #[+api("span") #[code Span]] and #[+api("doc") #[code Doc]] classes all - | have a #[code .vector] property, which is a 1-dimensional numpy array of - | 32-bit floats: - -+code. 
- import numpy - - apples, and_, oranges = nlp(u'apples and oranges') - print(apples.vector.shape) - # (1,) - apples.similarity(oranges) - -p - | By default, #[code Token.vector] returns the vector for its underlying - | lexeme, while #[code Doc.vector] and #[code Span.vector] return an - | average of the vectors of their tokens. You can customize these - | behaviours by modifying the #[code doc.user_hooks], - | #[code doc.user_span_hooks] and #[code doc.user_token_hooks] - | dictionaries. - -+aside-code("Example"). - # TODO - -p - | The default English model installs vectors for one million vocabulary - | entries, using the 300-dimensional vectors trained on the Common Crawl + | family of algorithms. The default + | #[+a("/docs/usage/models#available") English model] installs + | 300-dimensional vectors trained on the Common Crawl | corpus using the #[+a("http://nlp.stanford.edu/projects/glove/") GloVe] | algorithm. The GloVe common crawl vectors have become a de facto | standard for practical NLP. -+aside-code("Example"). - # TODO ++aside("Tip: Training a word2vec model") + | If you need to train a word2vec model, we recommend the implementation in + | the Python library #[+a("https://radimrehurek.com/gensim/") Gensim]. + ++h(2, "101") Similarity and word vectors 101 + +tag-model("vectors") + +include _spacy-101/_similarity +include _spacy-101/_word-vectors + ++h(2, "custom") Customising word vectors + ++under-construction p - | You can load new word vectors from a file-like buffer using the - | #[code vocab.load_vectors()] method. The file should be a - | whitespace-delimited text file, where the word is in the first column, - | and subsequent columns provide the vector data. For faster loading, you - | can use the #[code vocab.vectors_from_bin_loc()] method, which accepts a - | path to a binary file written by #[code vocab.dump_vectors()]. 
+ | By default, #[+api("token#vector") #[code Token.vector]] returns the + | vector for its underlying #[+api("lexeme") #[code Lexeme]], while + | #[+api("doc#vector") #[code Doc.vector]] and + | #[+api("span#vector") #[code Span.vector]] return an average of the + | vectors of their tokens. You can customize these + | behaviours by modifying the #[code doc.user_hooks], + | #[code doc.user_span_hooks] and #[code doc.user_token_hooks] + | dictionaries. -+aside-code("Example"). - # TODO ++h(2, "similarity") Similarity -p - | You can also load vectors from memory, by writing to the #[code lexeme.vector] - | property. If the vectors you are writing are of different dimensionality - | from the ones currently loaded, you should first call - | #[code vocab.resize_vectors(new_size)]. ++under-construction