Merge branch 'develop' of https://github.com/explosion/spaCy into develop

Matthew Honnibal · 2017-05-27 18:32:57 -05:00
commit c1263a844b
40 changed files with 720 additions and 271 deletions

View File

@@ -1,10 +1,7 @@
 # coding: utf8
 from __future__ import unicode_literals

-import importlib
-from .compat import basestring_
-from .cli.info import info
+from .cli.info import info as cli_info
 from .glossary import explain
 from .deprecated import resolve_load_name
 from . import util
@@ -12,11 +9,8 @@ from . import util

 def load(name, **overrides):
     name = resolve_load_name(name, **overrides)
-    model_path = util.resolve_model_path(name)
-    meta = util.parse_package_meta(model_path)
-    if 'lang' not in meta:
-        raise IOError('No language setting found in model meta.')
-    cls = util.get_lang_class(meta['lang'])
-    overrides['meta'] = meta
-    overrides['path'] = model_path
-    return cls(**overrides)
+    return util.load_model(name)
+
+
+def info(model=None, markdown=False):
+    return cli_info(None, model, markdown)
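A minimal usage sketch of the slimmed-down top-level API after this change; it assumes spaCy is installed and that 'en' is an available shortcut link or model package:

    import spacy

    nlp = spacy.load('en')            # now delegates to util.load_model()
    spacy.info()                      # wraps the CLI info command
    spacy.info('en', markdown=True)   # model details, formatted for GitHub issues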

View File

@@ -19,6 +19,8 @@ import numpy
 def _init_for_precomputed(W, ops):
+    if (W**2).sum() != 0.:
+        return
     reshaped = W.reshape((W.shape[1], W.shape[0] * W.shape[2]))
     ops.xavier_uniform_init(reshaped)
     W[:] = reshaped.reshape(W.shape)
@@ -247,6 +249,7 @@ def doc2feats(cols=None):
     model.cols = cols
     return model


 def print_shape(prefix):
     def forward(X, drop=0.):
         return X, lambda dX, **kwargs: dX

View File

@@ -24,8 +24,9 @@ CONVERTERS = {
     n_sents=("Number of sentences per doc", "option", "n", float),
     morphology=("Enable appending morphology to tags", "flag", "m", bool)
 )
-def convert(_, input_file, output_dir, n_sents, morphology):
-    """Convert files into JSON format for use with train command and other
+def convert(cmd, input_file, output_dir, n_sents, morphology):
+    """
+    Convert files into JSON format for use with train command and other
     experiment management functions.
     """
     input_path = Path(input_file)
@@ -39,4 +40,4 @@ def convert(_, input_file, output_dir, n_sents, morphology):
         prints("Can't find converter for %s" % input_path.parts[-1],
                title="Unknown format", exits=1)
     CONVERTERS[file_ext](input_path, output_path,
-                         n_sents=n_sents, morphology=morphology)
+                         n_sents=n_sents, use_morphology=morphology)

View File

@@ -17,8 +17,9 @@ from .. import about
     direct=("force direct download. Needs model name with version and won't "
             "perform compatibility check", "flag", "d", bool)
 )
-def download(model, direct=False):
-    """Download compatible model from default download path using pip. Model
+def download(cmd, model, direct=False):
+    """
+    Download compatible model from default download path using pip. Model
     can be shortcut, model name or, if --direct flag is set, full model name
     with version.
     """
@@ -31,7 +32,7 @@ def download(model, direct=False):
     version = get_version(model_name, compatibility)
     download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model_name, v=version))
     try:
-        link(model_name, model, force=True)
+        link(None, model_name, model, force=True)
     except:
         # Dirty, but since spacy.download and the auto-linking is mostly
         # a convenience wrapper, it's best to show a success message and

View File

@@ -14,14 +14,20 @@ from .. import util
     model=("optional: shortcut link of model", "positional", None, str),
     markdown=("generate Markdown for GitHub issues", "flag", "md", str)
 )
-def info(model=None, markdown=False):
+def info(cmd, model=None, markdown=False):
     """Print info about spaCy installation. If a model shortcut link is
     speficied as an argument, print model information. Flag --markdown
     prints details in Markdown for easy copy-pasting to GitHub issues.
     """
     if model:
-        model_path = util.resolve_model_path(model)
-        meta = util.parse_package_meta(model_path)
+        if util.is_package(model):
+            model_path = util.get_package_path(model)
+        else:
+            model_path = util.get_data_path() / model
+        meta_path = model_path / 'meta.json'
+        if not meta_path.is_file():
+            prints(meta_path, title="Can't find model meta.json", exits=1)
+        meta = read_json(meta_path)
         if model_path.resolve() != model_path:
             meta['link'] = path2str(model_path)
             meta['source'] = path2str(model_path.resolve())

View File

@@ -14,13 +14,14 @@ from .. import util
     link_name=("name of shortuct link to create", "positional", None, str),
     force=("force overwriting of existing link", "flag", "f", bool)
 )
-def link(origin, link_name, force=False):
-    """Create a symlink for models within the spacy/data directory. Accepts
+def link(cmd, origin, link_name, force=False):
+    """
+    Create a symlink for models within the spacy/data directory. Accepts
     either the name of a pip package, or the local path to the model data
     directory. Linking models allows loading them via spacy.load(link_name).
     """
     if util.is_package(origin):
-        model_path = util.get_model_package_path(origin)
+        model_path = util.get_package_path(model)
     else:
         model_path = Path(origin)
     if not model_path.exists():

View File

@@ -18,8 +18,9 @@ from .. import about
     meta=("path to meta.json", "option", "m", str),
     force=("force overwriting of existing folder in output directory", "flag", "f", bool)
 )
-def package(input_dir, output_dir, meta, force):
-    """Generate Python package for model data, including meta and required
+def package(cmd, input_dir, output_dir, meta=None, force=False):
+    """
+    Generate Python package for model data, including meta and required
     installation files. A new directory will be created in the specified
     output directory, and model data will be copied over.
     """
@@ -42,7 +43,7 @@ def package(input_dir, output_dir, meta, force):
         meta = util.read_json(meta_path)
     else:
         meta = generate_meta()
-    validate_meta(meta, ['lang', 'name', 'version'])
+    meta = validate_meta(meta, ['lang', 'name', 'version'])
     model_name = meta['lang'] + '_' + meta['name']
     model_name_v = model_name + '-' + meta['version']
@@ -85,20 +86,32 @@ def generate_meta():
                 ('email', 'Author email', False),
                 ('url', 'Author website', False),
                 ('license', 'License', 'CC BY-NC 3.0')]
    prints("Enter the package settings for your model.", title="Generating meta.json")
    meta = {}
    for setting, desc, default in settings:
        response = util.get_raw_input(desc, default)
        meta[setting] = default if response == '' and default else response
+    meta['pipeline'] = generate_pipeline()
    return meta


+def generate_pipeline():
+    prints("If set to 'True', the default pipeline is used. If set to 'False', "
+           "the pipeline will be disabled. Components should be specified as a "
+           "comma-separated list of component names, e.g. vectorizer, tagger, "
+           "parser, ner. For more information, see the docs on processing pipelines.",
+           title="Enter your model's pipeline components")
+    pipeline = util.get_raw_input("Pipeline components", True)
+    replace = {'True': True, 'False': False}
+    return replace[pipeline] if pipeline in replace else pipeline.split(', ')


 def validate_meta(meta, keys):
     for key in keys:
         if key not in meta or meta[key] == '':
             prints("This setting is required to build your package.",
                    title='No "%s" setting found in meta.json' % key, exits=1)
+    return meta


 def get_template(filepath):
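For illustration, a hedged standalone re-implementation of the parsing rule used by the new pipeline prompt above (the helper name here is hypothetical, not spaCy API):

    def parse_pipeline(response):
        # 'True'/'False' keep or disable the default pipeline; anything else is
        # treated as a comma-separated list of component names.
        replace = {'True': True, 'False': False}
        return replace[response] if response in replace else response.split(', ')

    assert parse_pipeline('True') is True
    assert parse_pipeline('False') is False
    assert parse_pipeline('tagger, parser, ner') == ['tagger', 'parser', 'ner']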

View File

@@ -32,9 +32,11 @@ from .. import displacy
     no_parser=("Don't train parser", "flag", "P", bool),
     no_entities=("Don't train NER", "flag", "N", bool)
 )
-def train(_, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
+def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
           use_gpu=False, no_tagger=False, no_parser=False, no_entities=False):
-    """Train a model. Expects data in spaCy's JSON format."""
+    """
+    Train a model. Expects data in spaCy's JSON format.
+    """
     n_sents = n_sents or None
     output_path = util.ensure_path(output_dir)
     train_path = util.ensure_path(train_data)
@@ -84,11 +86,11 @@ def train(_, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
                     pbar.update(len(docs))
             with nlp.use_params(optimizer.averages):
-                scorer = nlp.evaluate(corpus.dev_docs(nlp, gold_preproc=False))
                 with (output_path / ('model%d.pickle' % i)).open('wb') as file_:
                     dill.dump(nlp, file_, -1)
+                with (output_path / ('model%d.pickle' % i)).open('rb') as file_:
+                    nlp_loaded = dill.load(file_)
+                scorer = nlp_loaded.evaluate(corpus.dev_docs(nlp_loaded, gold_preproc=False))
             print_progress(i, losses, scorer.scores)
     finally:
         print("Saving model...")

View File

@@ -1,8 +1,8 @@
 # coding: utf8
 from __future__ import unicode_literals

-from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, UNITS
-from ..char_classes import ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS, QUOTES
+from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS
+from ..char_classes import ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS, QUOTES, UNITS


 _currency = r"\$|¢|£|€|¥|฿|৳"
@@ -10,16 +10,16 @@ _quotes = QUOTES.replace("'", '')
 _list_punct = LIST_PUNCT + '। ॥'.strip().split()

-_prefixes = ([r'\+'] + _list_punct + LIST_ELLIPSES + LIST_QUOTES)
+_prefixes = ([r'\+'] + _list_punct + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS)

-_suffixes = (_list_punct + LIST_ELLIPSES + LIST_QUOTES +
+_suffixes = (_list_punct + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS +
             [r'(?<=[0-9])\+',
              r'(?<=°[FfCcKk])\.',
              r'(?<=[0-9])(?:{})'.format(_currency),
              r'(?<=[0-9])(?:{})'.format(UNITS),
              r'(?<=[{}(?:{})])\.'.format('|'.join([ALPHA_LOWER, r'%²\-\)\]\+', QUOTES]), _currency)])

-_infixes = (LIST_ELLIPSES +
+_infixes = (LIST_ELLIPSES + LIST_ICONS +
            [r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
             r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
             r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA),

View File

@@ -20,7 +20,6 @@ _upper = [_latin_upper]
 _lower = [_latin_lower]
 _uncased = [_bengali, _hebrew]

 ALPHA = merge_char_classes(_upper + _lower + _uncased)
 ALPHA_LOWER = merge_char_classes(_lower + _uncased)
 ALPHA_UPPER = merge_char_classes(_upper + _uncased)
@@ -33,13 +32,14 @@ _currency = r'\$ £ € ¥ ฿ US\$ C\$ A\$'
 _punct = r'… , : ; \! \? ¿ ¡ \( \) \[ \] \{ \} < > _ # \* &'
 _quotes = r'\' \'\' " ” “ `` ` ´ , „ » «'
 _hyphens = '- — -- ---'
+_other_symbols = r'[\p{So}]'

 UNITS = merge_chars(_units)
 CURRENCY = merge_chars(_currency)
 QUOTES = merge_chars(_quotes)
 PUNCT = merge_chars(_punct)
 HYPHENS = merge_chars(_hyphens)
+ICONS = _other_symbols

 LIST_UNITS = split_chars(_units)
 LIST_CURRENCY = split_chars(_currency)
@@ -47,3 +47,4 @@ LIST_QUOTES = split_chars(_quotes)
 LIST_PUNCT = split_chars(_punct)
 LIST_HYPHENS = split_chars(_hyphens)
 LIST_ELLIPSES = [r'\.\.+', '']
+LIST_ICONS = [_other_symbols]
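As a hedged illustration of what the new ICONS class covers: `\p{So}` is the Unicode "Symbol, other" category (emoji, dingbats and similar icons). The sketch below assumes the third-party `regex` package, which supports Unicode property classes:

    import regex  # third-party 'regex' package, not the stdlib 're'

    ICONS = r'[\p{So}]'
    assert regex.match(ICONS, '🍕')        # emoji are in category So
    assert not regex.match(ICONS, 'a')     # letters are not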

View File

@@ -35,4 +35,4 @@ class English(Language):
     Defaults = EnglishDefaults


-__all__ = ['English', 'EnglishDefaults']
+__all__ = ['English']

View File

@@ -2,15 +2,16 @@
 from __future__ import unicode_literals

 from .char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY
-from .char_classes import ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS, QUOTES
-from .char_classes import CURRENCY, UNITS
+from .char_classes import LIST_ICONS, ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS
+from .char_classes import QUOTES, CURRENCY, UNITS


 _prefixes = (['§', '%', '=', r'\+'] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES +
-             LIST_CURRENCY)
+             LIST_CURRENCY + LIST_ICONS)

-_suffixes = (["'s", "'S", "s", "S"] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES +
+_suffixes = (LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS +
+             ["'s", "'S", "s", "S"] +
              [r'(?<=[0-9])\+',
               r'(?<=°[FfCcKk])\.',
               r'(?<=[0-9])(?:{})'.format(CURRENCY),
@@ -19,7 +20,7 @@ _suffixes = (["'s", "'S", "s", "S"] + LIST_PUNCT + LIST_ELLIPSES + LIST_QU
               r'(?<=[{a}][{a}])\.'.format(a=ALPHA_UPPER)])

-_infixes = (LIST_ELLIPSES +
+_infixes = (LIST_ELLIPSES + LIST_ICONS +
             [r'(?<=[0-9])[+\-\*^](?=[0-9-])',
              r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
              r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),

spacy/lang/xx/__init__.py (new file, 26 lines)
View File

@@ -0,0 +1,26 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from ...language import Language
+from ...attrs import LANG
+from ...util import update_exc
+
+
+class MultiLanguageDefaults(Language.Defaults):
+    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+    lex_attr_getters[LANG] = lambda text: 'xx'
+
+    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
+
+
+class MultiLanguage(Language):
+    """Language class to be used for models that support multiple languages.
+    This module allows models to specify their language ID as 'xx'.
+    """
+    lang = 'xx'
+    Defaults = MultiLanguageDefaults
+
+
+__all__ = ['MultiLanguage']

View File

@@ -337,6 +337,9 @@ cdef class NeuralDependencyParser(NeuralParser):
     name = 'parser'
     TransitionSystem = ArcEager

+    def __reduce__(self):
+        return (NeuralDependencyParser, (self.vocab, self.moves, self.model), None, None)


 cdef class NeuralEntityRecognizer(NeuralParser):
     name = 'entity'
@@ -344,6 +347,10 @@ cdef class NeuralEntityRecognizer(NeuralParser):
     nr_feature = 6

+    def __reduce__(self):
+        return (NeuralEntityRecognizer, (self.vocab, self.moves, self.model), None, None)


 cdef class BeamDependencyParser(BeamParser):
     TransitionSystem = ArcEager
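For context, a minimal sketch of how a `__reduce__` like the ones added above lets `pickle` rebuild an object from a (constructor, args) pair. The `Parser` class here is a hypothetical stand-in, not spaCy's Cython class:

    import pickle

    class Parser(object):
        def __init__(self, vocab, moves, model):
            self.vocab, self.moves, self.model = vocab, moves, model

        def __reduce__(self):
            # Tell pickle: call Parser(vocab, moves, model) to reconstruct.
            return (Parser, (self.vocab, self.moves, self.model), None, None)

    parser = Parser(vocab={}, moves=['SHIFT'], model=None)
    restored = pickle.loads(pickle.dumps(parser))
    assert restored.moves == ['SHIFT']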

View File

@@ -335,17 +335,18 @@ cdef cppclass StateC:
             this._break = this._b_i

     void clone(const StateC* src) nogil:
+        this.length = src.length
         memcpy(this._sent, src._sent, this.length * sizeof(TokenC))
         memcpy(this._stack, src._stack, this.length * sizeof(int))
         memcpy(this._buffer, src._buffer, this.length * sizeof(int))
         memcpy(this._ents, src._ents, this.length * sizeof(Entity))
         memcpy(this.shifted, src.shifted, this.length * sizeof(this.shifted[0]))
-        this.length = src.length
         this._b_i = src._b_i
         this._s_i = src._s_i
         this._e_i = src._e_i
         this._break = src._break
         this.offset = src.offset
+        this._empty_token = src._empty_token

     void fast_forward() nogil:
         # space token attachement policy:

View File

@@ -9,6 +9,7 @@ import ctypes
 from libc.stdint cimport uint32_t
 from libc.string cimport memcpy
 from cymem.cymem cimport Pool
+from collections import OrderedDict

 from .stateclass cimport StateClass
 from ._state cimport StateC, is_space_token
@@ -312,12 +313,13 @@ cdef class ArcEager(TransitionSystem):
     @classmethod
     def get_actions(cls, **kwargs):
         actions = kwargs.get('actions',
-            {
-                SHIFT: [''],
-                REDUCE: [''],
-                RIGHT: [],
-                LEFT: [],
-                BREAK: ['ROOT']})
+            OrderedDict((
+                (SHIFT, ['']),
+                (REDUCE, ['']),
+                (RIGHT, []),
+                (LEFT, []),
+                (BREAK, ['ROOT'])
+            )))
         seen_actions = set()
         for label in kwargs.get('left_labels', []):
             if label.upper() != 'ROOT':
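A hedged sketch of why the switch to OrderedDict matters here: on Python versions where plain dicts don't guarantee insertion order, iterating over the action table could assign transition IDs in a different order from run to run, which breaks round-tripping a serialized transition system. The constants below are illustrative placeholders:

    from collections import OrderedDict

    SHIFT, REDUCE, RIGHT, LEFT, BREAK = range(5)
    actions = OrderedDict((
        (SHIFT, ['']), (REDUCE, ['']), (RIGHT, []), (LEFT, []), (BREAK, ['ROOT']),
    ))
    # Iteration order is now fixed, so action IDs are assigned deterministically.
    assert list(actions.keys()) == [SHIFT, REDUCE, RIGHT, LEFT, BREAK]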

View File

@@ -2,6 +2,7 @@
 from __future__ import unicode_literals

 from thinc.typedefs cimport weight_t
+from collections import OrderedDict

 from .stateclass cimport StateClass
 from ._state cimport StateC
@@ -51,17 +52,29 @@ cdef bint _entity_is_sunk(StateClass st, Transition* golds) nogil:

 cdef class BiluoPushDown(TransitionSystem):
+    def __init__(self, *args, **kwargs):
+        TransitionSystem.__init__(self, *args, **kwargs)
+
+    def __reduce__(self):
+        labels_by_action = OrderedDict()
+        cdef Transition t
+        for trans in self.c[:self.n_moves]:
+            label_str = self.strings[trans.label]
+            labels_by_action.setdefault(trans.move, []).append(label_str)
+        return (BiluoPushDown, (self.strings, labels_by_action),
+                None, None)
+
     @classmethod
     def get_actions(cls, **kwargs):
         actions = kwargs.get('actions',
-            {
-                MISSING: [''],
-                BEGIN: [],
-                IN: [],
-                LAST: [],
-                UNIT: [],
-                OUT: ['']
-            })
+            OrderedDict((
+                (MISSING, ['']),
+                (BEGIN, []),
+                (IN, []),
+                (LAST, []),
+                (UNIT, []),
+                (OUT, [''])
+            )))
         seen_entities = set()
         for entity_type in kwargs.get('entity_types', []):
             if entity_type in seen_entities:
@@ -90,7 +103,7 @@ cdef class BiluoPushDown(TransitionSystem):
     def move_name(self, int move, int label):
         if move == OUT:
             return 'O'
-        elif move == 'MISSING':
+        elif move == MISSING:
             return 'M'
         else:
             return MOVE_NAMES[move] + '-' + self.strings[label]

View File

@@ -527,6 +527,14 @@ cdef class Parser:
             xp.add.at(d_tokvecs,
                 ids, d_state_features * active_feats)

+    @property
+    def move_names(self):
+        names = []
+        for i in range(self.moves.n_moves):
+            name = self.moves.move_name(self.moves.c[i].move, self.moves.c[i].label)
+            names.append(name)
+        return names
+
     def get_batch_model(self, batch_size, tokvecs, stream, dropout):
         lower, upper = self.model
         state2vec = precompute_hiddens(batch_size, tokvecs,

View File

@@ -5,7 +5,7 @@ from __future__ import unicode_literals
 from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
 from cymem.cymem cimport Pool
 from thinc.typedefs cimport weight_t
-from collections import defaultdict
+from collections import defaultdict, OrderedDict

 from ..structs cimport TokenC
 from .stateclass cimport StateClass
@@ -26,7 +26,7 @@ cdef void* _init_state(Pool mem, int length, void* tokens) except NULL:

 cdef class TransitionSystem:
-    def __init__(self, StringStore string_table, dict labels_by_action):
+    def __init__(self, StringStore string_table, labels_by_action):
         self.mem = Pool()
         self.strings = string_table
         self.n_moves = 0
@@ -34,14 +34,14 @@ cdef class TransitionSystem:
         self.c = <Transition*>self.mem.alloc(self._size, sizeof(Transition))

-        for action, label_strs in sorted(labels_by_action.items()):
+        for action, label_strs in labels_by_action.items():
             for label_str in label_strs:
                 self.add_action(int(action), label_str)
         self.root_label = self.strings['ROOT']
         self.init_beam_state = _init_state

     def __reduce__(self):
-        labels_by_action = {}
+        labels_by_action = OrderedDict()
         cdef Transition t
         for trans in self.c[:self.n_moves]:
             label_str = self.strings[trans.label]
@@ -77,6 +77,11 @@ cdef class TransitionSystem:
                     history.append(i)
                     action.do(state.c, action.label)
                     break
+            else:
+                print(gold.words)
+                print(gold.ner)
+                print(history)
+                raise ValueError("Could not find gold move")
         return history

     cdef int initialize_state(self, StateC* state) nogil:
cdef int initialize_state(self, StateC* state) nogil: cdef int initialize_state(self, StateC* state) nogil:

View File

@@ -1,7 +1,4 @@
 # coding: utf-8
-"""Test that tokenizer exceptions and emoticons are handled correctly."""
 from __future__ import unicode_literals

 import pytest
@@ -39,3 +36,12 @@ def test_tokenizer_handles_emoticons(tokenizer):
 def test_tokenizer_excludes_false_pos_emoticons(tokenizer, text, length):
     tokens = tokenizer(text)
     assert len(tokens) == length
+
+
+@pytest.mark.parametrize('text,length', [('can you still dunk?🍕🍔😵LOL', 8),
+                                         ('i💙you', 3), ('🤘🤘yay!', 4)])
+def test_tokenizer_handles_emoji(tokenizer, text, length):
+    exceptions = ["hu"]
+    tokens = tokenizer(text)
+    if tokens[0].lang_ not in exceptions:
+        assert len(tokens) == length

View File

@@ -78,27 +78,86 @@ def ensure_path(path):
         return path


-def resolve_model_path(name):
-    """Resolve a model name or string to a model path.
+def load_model(name):
+    """Load a model from a shortcut link, package or data path.

     name (unicode): Package name, shortcut link or model path.
-    RETURNS (Path): Path to model data directory.
+    RETURNS (Language): `Language` class with the loaded model.
     """
     data_path = get_data_path()
     if not data_path or not data_path.exists():
         raise IOError("Can't find spaCy data path: %s" % path2str(data_path))
     if isinstance(name, basestring_):
-        if (data_path / name).exists():  # in data dir or shortcut link
-            return (data_path / name)
-        if is_package(name):  # installed as a package
-            return get_model_package_path(name)
-        if Path(name).exists():  # path to model
-            return Path(name)
-    elif hasattr(name, 'exists'):  # Path or Path-like object
-        return name
+        if (data_path / name).exists():  # in data dir or shortcut
+            return load_model_from_path(data_path / name)
+        if is_package(name):  # installed as package
+            return load_model_from_pkg(name)
+        if Path(name).exists():  # path to model data directory
+            return load_data_from_path(Path(name))
+    elif hasattr(name, 'exists'):  # Path or Path-like to model data
+        return load_data_from_path(name)
     raise IOError("Can't find model '%s'" % name)


+def load_model_from_init_py(init_file):
+    """Helper function to use in the `load()` method of a model package's
+    __init__.py.
+
+    init_file (unicode): Path to model's __init__.py, i.e. `__file__`.
+    RETURNS (Language): `Language` class with loaded model.
+    """
+    model_path = Path(init_file).parent
+    return load_data_from_path(model_path, package=True)
+
+
+def load_model_from_path(model_path):
+    """Import and load a model package from its file path.
+
+    path (unicode or Path): Path to package directory.
+    RETURNS (Language): `Language` class with loaded model.
+    """
+    model_path = ensure_path(model_path)
+    spec = importlib.util.spec_from_file_location('model', model_path)
+    module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(module)
+    return module.load()
+
+
+def load_model_from_pkg(name):
+    """Import and load a model package.
+
+    name (unicode): Name of model package installed via pip.
+    RETURNS (Language): `Language` class with loaded model.
+    """
+    module = importlib.import_module(name)
+    return module.load()
+
+
+def load_data_from_path(model_path, package=False):
+    """Initialie a `Language` class with a loaded model from a model data path.
+
+    model_path (unicode or Path): Path to model data directory.
+    package (bool): Does the path point to the parent package directory?
+    RETURNS (Language): `Language` class with loaded model.
+    """
+    model_path = ensure_path(model_path)
+    meta_path = model_path / 'meta.json'
+    if not meta_path.is_file():
+        raise IOError("Could not read meta.json from %s" % location)
+    meta = read_json(location)
+    for setting in ['lang', 'name', 'version']:
+        if setting not in meta:
+            raise IOError('No %s setting found in model meta.json' % setting)
+    if package:
+        model_data_path = '%s_%s-%s' % (meta['lang'], meta['name'], meta['version'])
+        model_path = model_path / model_data_path
+    if not model_path.exists():
+        raise ValueError("Can't find model directory: %s" % path2str(model_path))
+    cls = get_lang_class(meta['lang'])
+    nlp = cls(pipeline=meta.get('pipeline', True))
+    return nlp.from_disk(model_path)
+
+
 def is_package(name):
     """Check if string maps to a package installed via pip.
@@ -112,36 +171,16 @@ def is_package(name):
     return False


-def get_model_package_path(package_name):
-    """Get path to a model package installed via pip.
+def get_package_path(name):
+    """Get the path to an installed package.

-    package_name (unicode): Name of installed package.
-    RETURNS (Path): Path to model data directory.
+    name (unicode): Package name.
+    RETURNS (Path): Path to installed package.
     """
     # Here we're importing the module just to find it. This is worryingly
     # indirect, but it's otherwise very difficult to find the package.
-    # Python's installation and import rules are very complicated.
     pkg = importlib.import_module(package_name)
-    package_path = Path(pkg.__file__).parent.parent
-    meta = parse_package_meta(package_path / package_name)
-    model_name = '%s-%s' % (package_name, meta['version'])
-    return package_path / package_name / model_name
-
-
-def parse_package_meta(package_path, require=True):
-    """Check if a meta.json exists in a package and return its contents.
-
-    package_path (Path): Path to model package directory.
-    require (bool): If True, raise error if no meta.json is found.
-    RETURNS (dict or None): Model meta.json data or None.
-    """
-    location = package_path / 'meta.json'
-    if location.is_file():
-        return read_json(location)
-    elif require:
-        raise IOError("Could not read meta.json from %s" % location)
-    else:
-        return None
+    return Path(pkg.__file__).parent


 def is_in_jupyter():
@@ -177,10 +216,13 @@ def get_async(stream, numpy_array):
 def itershuffle(iterable, bufsize=1000):
     """Shuffle an iterator. This works by holding `bufsize` items back
-    and yielding them sometime later. Obviously, this is not unbiased --
+    and yielding them sometime later. Obviously, this is not unbiased
     but should be good enough for batching. Larger bufsize means less bias.
     From https://gist.github.com/andres-erbsen/1307752
+
+    iterable (iterable): Iterator to shuffle.
+    bufsize (int): Items to hold back.
+    YIELDS (iterable): The shuffled iterator.
     """
     iterable = iter(iterable)
     buf = []
@@ -315,17 +357,16 @@ def normalize_slice(length, start, stop, step=None):

 def compounding(start, stop, compound):
-    '''Yield an infinite series of compounding values. Each time the
+    """Yield an infinite series of compounding values. Each time the
     generator is called, a value is produced by multiplying the previous
     value by the compound rate.

-    EXAMPLE
+    EXAMPLE:
       >>> sizes = compounding(1., 10., 1.5)
       >>> assert next(sizes) == 1.
       >>> assert next(sizes) == 1 * 1.5
       >>> assert next(sizes) == 1.5 * 1.5
-    '''
+    """
     def clip(value):
         return max(value, stop) if (start>stop) else min(value, stop)
     curr = float(start)
@@ -335,7 +376,7 @@ def compounding(start, stop, compound):

 def decaying(start, stop, decay):
-    '''Yield an infinite series of linearly decaying values.'''
+    """Yield an infinite series of linearly decaying values."""
     def clip(value):
         return max(value, stop) if (start>stop) else min(value, stop)
     nr_upd = 1.
@@ -344,12 +385,6 @@ def decaying(start, stop, decay):
         nr_upd += 1


-def check_renamed_kwargs(renamed, kwargs):
-    for old, new in renamed.items():
-        if old in kwargs:
-            raise TypeError("Keyword argument %s now renamed to %s" % (old, new))
-
-
 def read_json(location):
     """Open and load JSON from file.

View File

@@ -53,8 +53,6 @@ cdef class Vocab:
             vice versa.
         RETURNS (Vocab): The newly constructed vocab object.
         """
-        util.check_renamed_kwargs({'get_lex_attr': 'lex_attr_getters'}, deprecated_kwargs)
-
         lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {}
         tag_map = tag_map if tag_map is not None else {}
         if lemmatizer in (None, True, False):

View File

@@ -1,9 +1,9 @@
 <svg class="o-svg" xmlns="http://www.w3.org/2000/svg" viewBox="-1 -1 907 737" width="906" height="746">
     <style>
-        .svg__architecture__text-large { fill: #1a1e23; font: 20px "Source Sans Pro" }
-        .svg__architecture__text-medium { fill: #1a1e23; font: 17px "Source Sans Pro" }
-        .svg__architecture__text-small { fill: #1a1e23; font: bold 14px "Source Sans Pro" }
-        .svg__architecture__text-code { fill: #1a1e23; font: 600 12px "Source Code Pro" }
+        .svg__architecture__text-large { fill: #1a1e23; font: 20px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
+        .svg__architecture__text-medium { fill: #1a1e23; font: 17px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
+        .svg__architecture__text-small { fill: #1a1e23; font: bold 14px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
+        .svg__architecture__text-code { fill: #1a1e23; font: 600 12px "Source Code Pro", Monaco, "Courier New", monospace }
     </style>
     <ellipse cx="404" cy="203" fill="#dae8fc" stroke="#09a3d5" stroke-width="2" rx="74.8" ry="49.8"/>
     <text class="svg__architecture__text-large" transform="translate(362.5 206.5)" width="81" height="40">Language</text>


View File

@@ -1,8 +1,8 @@
 <svg class="o-svg" xmlns="http://www.w3.org/2000/svg" width="931" height="456" viewBox="-1 -1 932 480" preserveAspectRatio="xMinYMin meet">
     <style>
-        .svg__langdata__text-large { fill: #1a1e23; font: 20px "Source Sans Pro" }
-        .svg__langdata__text-small { fill: #1a1e23; font: bold 16px "Source Sans Pro" }
-        .svg__langdata__text-tiny { fill: #1a1e23; font: bold 16px "Source Sans Pro" }
+        .svg__langdata__text-large { fill: #1a1e23; font: 20px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
+        .svg__langdata__text-small { fill: #1a1e23; font: bold 16px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
+        .svg__langdata__text-tiny { fill: #1a1e23; font: bold 16px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
     </style>
     <path fill="none" stroke="#b85450" stroke-width="3" stroke-miterlimit="10" d="M610 404h-69.8" stroke-dasharray="1 6" stroke-linecap="round"/>
     <path fill="#b85450" stroke="#b85450" stroke-width="2" stroke-miterlimit="10" d="M534.2 404l8-4-2 4 2 4z"/>


View File

@@ -1,8 +1,8 @@
 <svg class="o-svg" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 923 200" width="923" height="200">
     <style>
-        .svg__pipeline__text { fill: #1a1e23; font: 20px "Source Sans Pro" }
-        .svg__pipeline__text-small { fill: #1a1e23; font: bold 18px "Source Sans Pro" }
-        .svg__pipeline__text-code { fill: #1a1e23; font: 600 16px "Source Code Pro" }
+        .svg__pipeline__text { fill: #1a1e23; font: 20px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
+        .svg__pipeline__text-small { fill: #1a1e23; font: bold 18px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
+        .svg__pipeline__text-code { fill: #1a1e23; font: 600 16px "Source Code Pro", Monaco, "Courier New", monospace }
     </style>
     <rect width="601" height="127" x="159" y="21" fill="none" stroke="#09a3d5" stroke-width="3" rx="19.1" stroke-dasharray="3 6" ry="19.1"/>
     <path fill="#e1d5e7" stroke="#9673a6" stroke-width="2" d="M801 55h120v60H801z"/>


View File

@ -0,0 +1,123 @@
<svg class="o-svg" xmlns="http://www.w3.org/2000/svg" width="600" height="380" viewBox="-20 -10 550 400">
<style>
.svg__tokenization__text { fill: #1a1e23; font: 18px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
.svg__tokenization__text-small { fill: #fff; font: 600 13px "Source Code Pro", Monaco, "Courier New", monospace }
</style>
<path fill="none" stroke="#c00" stroke-width="2" stroke-miterlimit="10" d="M71 39v12H16v11M71 39v12h20v11"/>
<path fill="#f8cecc" stroke="#c00" stroke-width="2" d="M1 1h140v38.2H1z"/>
<text class="svg__tokenization__text" dy="1em" width="43" height="19" transform="translate(48.5 9.5)">“Lets</text>
<path fill="none" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M175 39v23"/>
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M150 1h50v38.2h-50z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(164.5 9.5)" width="19" height="19">go</text>
<path fill="none" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M235 39v23"/>
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M210 1h50v38.2h-50z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(226.5 9.5)" width="15" height="19">to</text>
<path fill="none" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M341 39v23"/>
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M270 1h141v38.2H270z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(320.5 9.5)" width="38" height="19">N.Y.!”</text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M16 100v20"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M1 62h30v38.2H1z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(11.5 70.5)" width="7" height="19"></text>
<path fill="none" stroke="#c00" stroke-width="2" stroke-miterlimit="10" d="M91 100v11H66v9M91 100v11h29v9"/>
<path fill="#f8cecc" stroke="#c00" stroke-width="2" d="M41 62h100v38.2H41z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(72.5 70.5)" width="35" height="19">Lets</text>
<path fill="none" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M175 100v20"/>
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M150 62h50v38.2h-50z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(164.5 70.5)" width="19" height="19">go</text>
<path fill="none" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M235 100v20"/>
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M210 62h50v38.2h-50z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(226.5 70.5)" width="15" height="19">to</text>
<path fill="none" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M341 100v20"/>
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M270 62h141v38.2H270z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(320.5 70.5)" width="38" height="19">N.Y.!”</text>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M1 120h30v38H1z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(11.5 128.5)" width="7" height="19"></text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M66 158v24"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M41 120h50v38H41z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(53.5 128.5)" width="23" height="19">Let</text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M175 158v24"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M150 120h50v38h-50z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(164.5 128.5)" width="19" height="19">go</text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M235 158v24"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M210 120h50v38h-50z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(226.5 128.5)" width="15" height="19">to</text>
<path fill="none" stroke="#c00" stroke-width="2" stroke-miterlimit="10" d="M341 158v13h-20v11M341 158v13h55v11"/>
<path fill="#f8cecc" stroke="#c00" stroke-width="2" d="M270 120h141v38H270z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(320.5 128.5)" width="38" height="19">N.Y.!”</text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M120 158v24"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M100 120h40v38h-40z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(113.5 128.5)" width="11" height="19">s</text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M16 220v23"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M1 181.8h30V220H1z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(11.5 190.5)" width="7" height="19"></text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M66 220v23"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M41 181.8h50V220H41z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(53.5 190.5)" width="23" height="19">Let</text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M175 220v23"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M150 181.8h50V220h-50z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(164.5 190.5)" width="19" height="19">go</text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M235 220v23"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M210 181.8h50V220h-50z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(226.5 190.5)" width="15" height="19">to</text>
<path fill="none" stroke="#c00" stroke-width="2" stroke-miterlimit="10" d="M321 220v11h-20v12M321 220v11h34v12"/>
<path fill="#f8cecc" stroke="#c00" stroke-width="2" d="M270 181.8h101V220H270z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(304.5 190.5)" width="30" height="19">N.Y.!</text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M120 220v23"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M100 181.8h40V220h-40z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(113.5 190.5)" width="11" height="19">s</text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M396 220v23"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M381 181.8h30V220h-30z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(391.5 190.5)" width="7" height="19"></text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M16 281v23"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M1 242.7h30V281H1z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(11.5 251.5)" width="7" height="19"></text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M66 281v23"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M41 242.7h50V281H41z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(53.5 251.5)" width="23" height="19">Let</text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M175 281v20-17 20"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M150 242.7h50V281h-50z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(164.5 251.5)" width="19" height="19">go</text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M235 281v23"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M210 242.7h50V281h-50z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(226.5 251.5)" width="15" height="19">to</text>
<path fill="none" stroke="#c00" stroke-width="2" stroke-miterlimit="10" d="M301 281v23"/>
<path fill="#f8cecc" stroke="#b85450" stroke-width="2" d="M270 242.7h61V281h-61z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(286.5 251.5)" width="26" height="19">N.Y.</text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M120 281v23"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M100 242.7h40V281h-40z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(113.5 251.5)" width="11" height="19">s</text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M396 281v23"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M381 242.7h30V281h-30z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(391.5 251.5)" width="7" height="19"></text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M355 281v23"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M340 242.7h30V281h-30z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(351.5 251.5)" width="5" height="19">!</text>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M1 304h30v38H1z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(11.5 312.5)" width="7" height="19"></text>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M41 304h50v38H41z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(53.5 312.5)" width="23" height="19">Let</text>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M150 304h50v38h-50z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(164.5 312.5)" width="19" height="19">go</text>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M210 304h50v38h-50z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(226.5 312.5)" width="15" height="19">to</text>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M270 304h61v38h-61z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(286.5 312.5)" width="26" height="19">N.Y.</text>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M100 304h40v38h-40z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(113.5 312.5)" width="11" height="19">s</text>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M381 304h30v38h-30z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(391.5 312.5)" width="7" height="19"></text>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M340 304h30v38h-30z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(351.5 312.5)" width="5" height="19">!</text>
<rect width="104" height="19" x="437" y="72" fill="#c00" stroke="#c00" stroke-width="2" rx="2.9" ry="2.9"/>
<text class="svg__tokenization__text-small" dy="0.9em" transform="translate(455.5 74.5)" width="65" height="12">EXCEPTION</text>
<rect width="104" height="19" x="437" y="11" fill="#c00" stroke="#c00" stroke-width="2" rx="2.9" ry="2.9"/>
<text class="svg__tokenization__text-small" dy="0.9em" transform="translate(466.5 13.5)" width="43" height="12">PREFIX</text>
<rect width="104" height="19" x="437" y="130" fill="#c00" stroke="#c00" stroke-width="2" rx="2.9" ry="2.9"/>
<text class="svg__tokenization__text-small" dy="0.9em" transform="translate(466.5 132.5)" width="43" height="12">SUFFIX</text>
<rect width="104" height="19" x="437" y="191" fill="#c00" stroke="#c00" stroke-width="2" rx="2.9" ry="2.9"/>
<text class="svg__tokenization__text-small" dy="0.9em" transform="translate(466.5 193.5)" width="43" height="12">SUFFIX</text>
<rect width="104" height="19" x="437" y="252" fill="#c00" stroke="#c00" stroke-width="2" rx="2.9" ry="2.9"/>
<text class="svg__tokenization__text-small" dy="0.9em" transform="translate(455.5 254.5)" width="65" height="12">EXCEPTION</text>
<rect width="104" height="19" x="437" y="313" fill="#82b366" stroke="#82b366" stroke-width="2" rx="2.9" ry="2.9"/>
<text class="svg__tokenization__text-small" dy="0.9em" transform="translate(473.5 315.5)" width="29" height="12">DONE</text>
</svg>


View File

@@ -1,9 +1,9 @@
 <svg class="o-svg" xmlns="http://www.w3.org/2000/svg" viewBox="-10 -10 582 365" width="572" height="355">
     <style>
-        .svg__vocab__text { fill: #1a1e23; font: 18px "Source Sans Pro" }
-        .svg__vocab__text-large { fill: #fff; font: bold 18px "Source Sans Pro"; text-transform: uppercase }
-        .svg__vocab__text-box { fill: #fff; font: bold 12px "Source Code Pro" }
-        .svg__vocab__text-code { fill: #1a1e23; font: bold 12px "Source Code Pro" }
+        .svg__vocab__text { fill: #1a1e23; font: 18px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
+        .svg__vocab__text-large { fill: #fff; font: bold 18px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif; text-transform: uppercase }
+        .svg__vocab__text-box { fill: #fff; font: bold 12px "Source Code Pro", Monaco, "Courier New", monospace }
+        .svg__vocab__text-code { fill: #1a1e23; font: bold 12px "Source Code Pro", Monaco, "Courier New", monospace }
     </style>
     <rect width="570" height="88" x="1" y="135" fill="#d5e8d4" stroke="#82b366" stroke-width="2" rx="13.2" ry="13.2"/>
     <path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M444 164h100v40H444z"/>


View File

@@ -158,7 +158,8 @@
     "binder": {
         "title": "Binder",
-        "tag": "class"
+        "tag": "class",
+        "source": "spacy/tokens/binder.pyx"
     },

     "annotation": {
View File

@@ -2,7 +2,10 @@
 include ../../_includes/_mixins

-p spaCy currently supports the following languages and capabilities:
+p
+    |  spaCy currently provides models for the following languages and
+    |  capabilities:

 +aside-code("Download language models", "bash").
     python -m spacy download en
@@ -22,12 +25,16 @@ p spaCy currently supports the following languages and capabilities:
     +row
         +cell French #[code fr]
-        each icon in [ "pro", "pro", "con", "pro", "con", "pro", "pro", "con" ]
+        each icon in [ "pro", "con", "con", "pro", "con", "pro", "pro", "con" ]
             +cell.u-text-center #[+procon(icon)]

-+h(2, "available") Available models
+    +row
+        +cell Spanish #[code es]
+        each icon in [ "pro", "pro", "con", "pro", "pro", "pro", "pro", "con" ]
+            +cell.u-text-center #[+procon(icon)]

-include ../usage/_models-list
+p
+    +button("/docs/usage/models", true, "primary") See available models

 +h(2, "alpha-support") Alpha tokenization support
@@ -52,9 +59,35 @@ p
     |  #[+a("https://github.com/mocobeta/janome") Janome].

 +table([ "Language", "Code", "Source" ])
-    each language, code in { es: "Spanish", it: "Italian", pt: "Portuguese", nl: "Dutch", sv: "Swedish", fi: "Finnish", nb: "Norwegian Bokmål", da: "Danish", hu: "Hungarian", pl: "Polish", bn: "Bengali", he: "Hebrew", zh: "Chinese", ja: "Japanese" }
+    each language, code in { it: "Italian", pt: "Portuguese", nl: "Dutch", sv: "Swedish", fi: "Finnish", nb: "Norwegian Bokmål", da: "Danish", hu: "Hungarian", pl: "Polish", bn: "Bengali", he: "Hebrew", zh: "Chinese", ja: "Japanese" }
         +row
             +cell #{language}
             +cell #[code=code]
             +cell
                 +src(gh("spaCy", "spacy/lang/" + code)) lang/#{code}
+
++h(2, "multi-language") Multi-language support
+    +tag-new(2)
+
+p
+    |  As of v2.0, spaCy supports models trained on more than one language. This
+    |  is especially useful for named entity recognition. The language ID used
+    |  for multi-language or language-neutral models is #[code xx]. The
+    |  language class, a generic subclass containing only the base language data,
+    |  can be found in #[+src(gh("spaCy", "spacy/lang/xx")) lang/xx].
+
+p
+    |  To load your model with the neutral, multi-language class, simply set
+    |  #[code "language": "xx"] in your
+    |  #[+a("/docs/usage/saving-loading#models-generating") model package]'s
+    |  meta.json. You can also import the class directly, or call
+    |  #[+api("util#get_lang_class") #[code util.get_lang_class()]] for
+    |  lazy-loading.
+
++code("Standard import").
+    from spacy.lang.xx import MultiLanguage
+    nlp = MultiLanguage()
+
++code("With lazy-loading").
+    from spacy.util import get_lang_class
+    nlp = get_lang_class('xx')

View File

@@ -11,8 +11,13 @@ p
     |  the name of an installed
     |  #[+a("/docs/usage/saving-loading#generating") model package], a unicode
     |  path or a #[code Path]-like object. spaCy will try resolving the load
-    |  argument in this order. The #[code Language] class to initialise will be
-    |  determined based on the model's settings.
+    |  argument in this order. If a model is loaded from a shortcut link or
+    |  package name, spaCy will assume it's a Python package and import it and
+    |  call the model's own #[code load()] method. If a model is loaded from a
+    |  path, spaCy will assume it's a data directory, read the language and
+    |  pipeline settings off the meta.json and initialise the #[code Language]
+    |  class. The data will be loaded in via
+    |  #[+api("language#from_disk") #[code Language.from_disk()]].

 +aside-code("Example").
     nlp = spacy.load('en')              # shortcut link
@@ -20,7 +25,7 @@ p
     nlp = spacy.load('/path/to/en')     # unicode path
     nlp = spacy.load(Path('/path/to/en')) # pathlib Path

-    nlp = spacy.load('en', disable['parser', 'tagger'])
+    nlp = spacy.load('en', disable=['parser', 'tagger'])

 +table(["Name", "Type", "Description"])
     +row
View File

@ -1,12 +1,10 @@
//- 💫 DOCS > API > ANNOTATION SPECS //- 💫 DOCS > API > UTIL
include ../../_includes/_mixins include ../../_includes/_mixins
p p
| spaCy comes with a small collection of utility functions located in | spaCy comes with a small collection of utility functions located in
| #[+src(gh("spaCy", "spacy/util.py")) spacy/util.py]. | #[+src(gh("spaCy", "spacy/util.py")) spacy/util.py].
+infobox("Important note")
| Because utility functions are mostly intended for | Because utility functions are mostly intended for
| #[strong internal use within spaCy], their behaviour may change with | #[strong internal use within spaCy], their behaviour may change with
| future releases. The functions documented on this page should be safe | future releases. The functions documented on this page should be safe
@ -74,15 +72,23 @@ p
+cell #[code Language] +cell #[code Language]
+cell Language class. +cell Language class.
+h(2, "resolve_model_path") util.resolve_model_path +h(2, "load_model") util.load_model
+tag function +tag function
+tag-new(2) +tag-new(2)
p Resolve a model name or string to a model path. p
| Load a model from a shortcut link, package or data path. If called with a
| shortcut link or package name, spaCy will assume the model is a Python
| package and import and call its #[code load()] method. If called with a
| path, spaCy will assume it's a data directory, read the language and
| pipeline settings from the meta.json and initialise a #[code Language]
| class. The model data will then be loaded in via
| #[+api("language#from_disk") #[code Language.from_disk()]].
+aside-code("Example"). +aside-code("Example").
model_path = util.resolve_model_path('en') nlp = util.load_model('en')
model_path = util.resolve_model_path('/path/to/en') nlp = util.load_model('en_core_web_sm')
nlp = util.load_model('/path/to/data')
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+row +row
@ -92,8 +98,33 @@ p Resolve a model name or string to a model path.
+footrow +footrow
+cell returns +cell returns
+cell #[code Path] +cell #[code Language]
+cell Path to model data directory. +cell #[code Language] class with the loaded model.
+h(2, "load_model_from_init_py") util.load_model_from_init_py
+tag function
+tag-new(2)
p
| A helper function to use in the #[code load()] method of a model package's
| #[+src(gh("spacy-dev-resources", "templates/model/en_model_name/__init__.py")) __init__.py].
+aside-code("Example").
from spacy.util import load_model_from_init_py
def load():
return load_model_from_init_py(__file__)
+table(["Name", "Type", "Description"])
+row
+cell #[code init_file]
+cell unicode
+cell Path to model's __init__.py, i.e. #[code __file__].
+footrow
+cell returns
+cell #[code Language]
+cell #[code Language] class with the loaded model.
+h(2, "is_package") util.is_package +h(2, "is_package") util.is_package
+tag function +tag function
@ -117,16 +148,18 @@ p
+cell #[code bool] +cell #[code bool]
+cell #[code True] if installed package, #[code False] if not. +cell #[code True] if installed package, #[code False] if not.
+h(2, "get_model_package_path") util.get_model_package_path +h(2, "get_package_path") util.get_package_path
+tag function +tag function
+tag-new(2)
p p
| Get path to a #[+a("/docs/usage/models") model package] installed via pip. | Get path to an installed package. Mainly used to resolve the location of
| Currently imports the package to find it and parse its meta data. | #[+a("/docs/usage/models") model packages]. Currently imports the package
| to find its path.
+aside-code("Example"). +aside-code("Example").
util.get_model_package_path('en_core_web_sm') util.get_package_path('en_core_web_sm')
# /usr/lib/python3.6/site-packages/en_core_web_sm/en_core_web_sm-1.2.0 # /usr/lib/python3.6/site-packages/en_core_web_sm
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+row +row
@ -137,37 +170,8 @@ p
+footrow +footrow
+cell returns +cell returns
+cell #[code Path] +cell #[code Path]
+cell Path to model data directory.
+h(2, "parse_package_meta") util.parse_package_meta
+tag function
p
| Check if a #[code meta.json] exists in a model package and return its
| contents.
+aside-code("Example").
if util.is_package('en_core_web_sm'):
path = util.get_model_package_path('en_core_web_sm')
meta = util.parse_package_meta(path, require=True)
# {'name': 'core_web_sm', 'lang': 'en', ...}
+table(["Name", "Type", "Description"])
+row
+cell #[code package_path]
+cell #[code Path]
+cell Path to model package directory. +cell Path to model package directory.
+row
+cell #[code require]
+cell #[code bool]
+cell If #[code True], raise error if no #[code meta.json] is found.
+footrow
+cell returns
+cell dict / #[code None]
+cell Model meta data or #[code None].
+h(2, "is_in_jupyter") util.is_in_jupyter +h(2, "is_in_jupyter") util.is_in_jupyter
+tag function +tag function
+tag-new(2) +tag-new(2)

View File

@ -5,7 +5,7 @@ p
| #[strong how similar they are]. Predicting similarity is useful for | #[strong how similar they are]. Predicting similarity is useful for
| building recommendation systems or flagging duplicates. For example, you | building recommendation systems or flagging duplicates. For example, you
| can suggest a user content that's similar to what they're currently | can suggest a user content that's similar to what they're currently
| looking at, or label a support ticket as a duplicate, if it's very | looking at, or label a support ticket as a duplicate if it's very
| similar to an already existing one. | similar to an already existing one.
p p

View File

@ -16,3 +16,47 @@ p
+row +row
for cell in ["Apple", "is", "looking", "at", "buying", "U.K.", "startup", "for", "$", "1", "billion"] for cell in ["Apple", "is", "looking", "at", "buying", "U.K.", "startup", "for", "$", "1", "billion"]
+cell=cell +cell=cell
p
| First, the raw text is split on whitespace characters, similar to
| #[code text.split(' ')]. Then, the tokenizer processes the text from
| left to right. On each substring, it performs two checks:
+list("numbers")
+item
| #[strong Does the substring match a tokenizer exception rule?] For
| example, "don't" does not contain whitespace, but should be split
| into two tokens, "do" and "n't", while "U.K." should always
| remain one token.
+item
| #[strong Can a prefix, suffix or infix be split off?] For example,
| punctuation like commas, periods, hyphens or quotes.
p
| If there's a match, the rule is applied and the tokenizer continues its
| loop, starting with the newly split substrings. This way, spaCy can split
| #[strong complex, nested tokens] like combinations of abbreviations and
| multiple punctuation marks.
+aside
| #[strong Tokenizer exception:] Special-case rule to split a string into
| several tokens or prevent a token from being split when punctuation rules
| are applied.#[br]
| #[strong Prefix:] Character(s) at the beginning, e.g.
| #[code $], #[code (], #[code “], #[code ¿].#[br]
| #[strong Suffix:] Character(s) at the end, e.g.
| #[code km], #[code &#41;], #[code ”], #[code !].#[br]
| #[strong Infix:] Character(s) in between, e.g.
| #[code -], #[code --], #[code /], #[code …].#[br]
+image
include ../../../assets/img/docs/tokenization.svg
.u-text-right
+button("/assets/img/docs/tokenization.svg", false, "secondary").u-text-tag View large graphic
p
| While punctuation rules are usually pretty general, tokenizer exceptions
| strongly depend on the specifics of the individual language. This is
| why each #[+a("/docs/api/language-models") available language] has its
| own subclass, like #[code English] or #[code German], which loads in lists
| of hard-coded data and exception rules.
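p
| A minimal sketch of how such an exception rule can be added at runtime is
| shown below; the #[code gimme] special case is purely illustrative:
+code("Adding a tokenizer exception (sketch)").
from spacy.lang.en import English
from spacy.attrs import ORTH, LEMMA
nlp = English() # blank English pipeline, we only need the tokenizer
# hypothetical special case: always split "gimme" into "gim" + "me"
nlp.tokenizer.add_special_case(u'gimme', [{ORTH: u'gim', LEMMA: u'give'}, {ORTH: u'me'}])
assert [t.text for t in nlp(u'gimme that')] == [u'gim', u'me', u'that']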

View File

@ -89,4 +89,6 @@ p
p p
| Even though both #[code Doc] objects contain the same words, the internal | Even though both #[code Doc] objects contain the same words, the internal
| integer IDs are very different. | integer IDs are very different. The same applies for all other strings,
| like the annotation scheme. To avoid mismatched IDs, spaCy will always
| export the vocab if you save a #[code Doc] or #[code nlp] object.
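p
| A minimal sketch of what this looks like in practice, assuming an English
| model is installed and using a hypothetical file path:
+code("Saving a Doc together with its vocab (sketch)").
import spacy
from spacy.tokens.doc import Doc
from spacy.vocab import Vocab
nlp = spacy.load('en')
doc = nlp(u'I like coffee')
doc.to_disk('/tmp/coffee.bin') # the vocab's strings are exported alongside the doc
new_doc = Doc(Vocab()).from_disk('/tmp/coffee.bin') # IDs resolve against the restored strings
assert new_doc.text == doc.text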

View File

@ -19,19 +19,17 @@ p
p p
| When you load a model, spaCy first consults the model's | When you load a model, spaCy first consults the model's
| #[+a("/docs/usage/saving-loading#models-generating") meta.json] for its | #[+a("/docs/usage/saving-loading#models-generating") meta.json]. The
| #[code setup] details. This typically includes the ID of a language class, | meta typically includes the model details, the ID of a language class,
| and an optional list of pipeline components. spaCy then does the | and an optional list of pipeline components. spaCy then does the
| following: | following:
+aside-code("meta.json (excerpt)", "json"). +aside-code("meta.json (excerpt)", "json").
{ {
"name": "example_model", "name": "example_model",
"lang": "en"
"description": "Example model for spaCy", "description": "Example model for spaCy",
"setup": { "pipeline": ["token_vectors", "tagger"]
"lang": "en",
"pipeline": ["token_vectors", "tagger"]
}
} }
+list("numbers") +list("numbers")
@ -146,7 +144,7 @@ p
+table(["Argument", "Type", "Description"]) +table(["Argument", "Type", "Description"])
+row +row
+cell #[code vocab] +cell #[code vocab]
+cell #[coce Vocab] +cell #[code Vocab]
+cell +cell
| Shared data between components, including strings, morphology, | Shared data between components, including strings, morphology,
| vectors etc. | vectors etc.
@ -287,17 +285,15 @@ p
p p
| In the model package's meta.json, specify the language class and pipeline | In the model package's meta.json, specify the language class and pipeline
| IDs in #[code setup]: | IDs:
+code("meta.json (excerpt)", "json"). +code("meta.json (excerpt)", "json").
{ {
"name": "my_sentiment_model", "name": "sentiment_model",
"lang": "en",
"version": "1.0.0", "version": "1.0.0",
"spacy_version": "&gt;=2.0.0,&lt;3.0.0", "spacy_version": "&gt;=2.0.0,&lt;3.0.0",
"setup": { "pipeline": ["vectorizer", "sentiment"]
"lang": "en",
"pipeline": ["vectorizer", "sentiment"]
}
} }
p p
@ -307,7 +303,7 @@ p
| by your custom #[code "sentiment"] factory. | by your custom #[code "sentiment"] factory.
+code. +code.
nlp = spacy.load('my_sentiment_model') nlp = spacy.load('en_sentiment_model')
doc = nlp(u'I love pizza') doc = nlp(u'I love pizza')
assert doc.sentiment assert doc.sentiment

View File

@ -129,15 +129,18 @@ p
+code. +code.
import spacy import spacy
from spacy.tokens.doc import Doc from spacy.tokens.doc import Doc
from spacy.vocab import Vocab
nlp = spacy.load('en') nlp = spacy.load('en')
moby_dick = open('moby_dick.txt', 'r').read() moby_dick = open('moby_dick.txt', 'r').read()
doc = nlp(moby_dick) doc = nlp(moby_dick)
doc.to_disk('/moby_dick.bin') doc.to_disk('/moby_dick.bin')
new_doc = Doc().from_disk('/moby_dick.bin') new_doc = Doc(Vocab()).from_disk('/moby_dick.bin')
+infobox +infobox
| #[strong API:] #[+api("language") #[code Language]],
| #[+api("doc") #[code Doc]]
| #[strong Usage:] #[+a("/docs/usage/saving-loading") Saving and loading] | #[strong Usage:] #[+a("/docs/usage/saving-loading") Saving and loading]
+h(2, "rule-matcher") Match text with token rules +h(2, "rule-matcher") Match text with token rules
@ -148,9 +151,14 @@ p
nlp = spacy.load('en') nlp = spacy.load('en')
matcher = Matcher(nlp.vocab) matcher = Matcher(nlp.vocab)
# match "Google I/O" or "Google i/o"
pattern = [{'ORTH': 'Google'}, {'UPPER': 'I'}, {'ORTH': '/'}, {'UPPER': 'O'}] def set_sentiment(matcher, doc, i, matches):
matcher.add('GoogleIO', None, pattern) doc.sentiment += 0.1
pattern1 = [{'ORTH': 'Google'}, {'UPPER': 'I'}, {'ORTH': '/'}, {'UPPER': 'O'}]
pattern2 = [[{'ORTH': emoji, 'OP': '+'}] for emoji in ['😀', '😂', '🤣', '😍']]
matcher.add('GoogleIO', None, pattern1) # match "Google I/O" or "Google i/o"
matcher.add('HAPPY', set_sentiment, *pattern2) # match one or more happy emoji
matches = matcher(nlp(LOTS_OF_TEXT)) matches = matcher(nlp(LOTS_OF_TEXT))
+infobox +infobox

View File

@ -11,7 +11,7 @@ p
| You can also associate patterns with entity IDs, to allow some basic | You can also associate patterns with entity IDs, to allow some basic
| entity linking or disambiguation. | entity linking or disambiguation.
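p
| A minimal sketch of this idea, using hypothetical pattern IDs that stand in
| for real-world entities:
+code("Patterns as entity IDs (sketch)").
from spacy.lang.en import English
from spacy.matcher import Matcher
nlp = English()
matcher = Matcher(nlp.vocab)
# hypothetical IDs: one per candidate entity, for basic disambiguation
matcher.add('GOOGLE_COMPANY', None, [{'LOWER': 'google'}, {'LOWER': 'inc'}])
matcher.add('GOOGLE_PRODUCT', None, [{'LOWER': 'google'}, {'LOWER': 'maps'}])
matches = matcher(nlp(u'I used Google Maps today'))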
+aside("What about \"real\" regular expressions?") //-+aside("What about \"real\" regular expressions?")
+h(2, "adding-patterns") Adding patterns +h(2, "adding-patterns") Adding patterns
@ -119,7 +119,7 @@ p
+code. +code.
# Add a new custom flag to the vocab, which is always False by default. # Add a new custom flag to the vocab, which is always False by default.
# BAD_HTML_FLAG will be the flag ID, which we can use to set it to True on the span. # BAD_HTML_FLAG will be the flag ID, which we can use to set it to True on the span.
BAD_HTML_FLAG = doc.vocab.add_flag(lambda text: False) BAD_HTML_FLAG = nlp.vocab.add_flag(lambda text: False)
def merge_and_flag(matcher, doc, i, matches): def merge_and_flag(matcher, doc, i, matches):
match_id, start, end = matches[i] match_id, start, end = matches[i]
@ -221,7 +221,7 @@ p
+cell match 0 or 1 times +cell match 0 or 1 times
+cell optional, max one +cell optional, max one
+h(3, "quantifiers-example1") Quantifiers example: Using linguistic annotations +h(2, "example1") Example: Using linguistic annotations
p p
| Let's say you're analysing user comments and you want to find out what | Let's say you're analysing user comments and you want to find out what
@ -283,7 +283,7 @@ p
# set manual=True to make displaCy render straight from a dictionary # set manual=True to make displaCy render straight from a dictionary
displacy.serve(matched_sents, style='ent', manual=True) displacy.serve(matched_sents, style='ent', manual=True)
+h(3, "quantifiers-example2") Quantifiers example: Phone numbers +h(2, "example2") Example: Phone numbers
p p
| Phone numbers can have many different formats and matching them is often | Phone numbers can have many different formats and matching them is often
@ -320,3 +320,114 @@ p
| It'll produce more predictable results, is much easier to modify and | It'll produce more predictable results, is much easier to modify and
| extend, and doesn't require any training data, only a set of | extend, and doesn't require any training data, only a set of
| test cases. | test cases.
+h(2, "example3") Example: Hashtags and emoji on social media
p
| Social media posts, especially tweets, can be difficult to work with.
| They're very short and often contain various emoji and hashtags. By only
| looking at the plain text, you'll lose a lot of valuable semantic
| information.
p
| Let's say you've extracted a large sample of social media posts on a
| specific topic, for example posts mentioning a brand name or product.
| As the first step of your data exploration, you want to filter out posts
| containing certain emoji and use them to assign a general sentiment
| score, based on whether the expressed emotion is positive or negative,
| e.g. #[span.o-icon.o-icon--inline 😀] or #[span.o-icon.o-icon--inline 😞].
| You also want to find, merge and label hashtags like
| #[code #MondayMotivation], to be able to ignore or analyse them later.
+aside("Note on sentiment analysis")
| Ultimately, sentiment analysis is not always #[em that] easy. In
| addition to the emoji, you'll also want to take specific words into
| account and check the #[code subtree] for intensifiers like "very", to
| increase the sentiment score. At some point, you might also want to train
| a sentiment model. However, the approach described in this example is
| very useful for #[strong bootstrapping rules to collect training data].
| It's also an incredibly fast way to gather first insights into your data:
| with about 1 million tweets, you'd be looking at a processing time of
| #[strong under 1 minute].
p
| By default, spaCy's tokenizer will split emoji into separate tokens. This
| means that you can create a pattern for one or more emoji tokens. In this
| case, a sequence of identical emoji should be treated as one instance.
| Valid hashtags usually consist of a #[code #], plus a sequence of
| ASCII characters with no whitespace, making them easy to match as well.
+code.
from spacy.lang.en import English
from spacy.matcher import Matcher
nlp = English() # we only want the tokenizer, so no need to load a model
matcher = Matcher(nlp.vocab)
pos_emoji = [u'😀', u'😃', u'😂', u'🤣', u'😊', u'😍'] # positive emoji
neg_emoji = [u'😞', u'😠', u'😩', u'😢', u'😭', u'😒'] # negative emoji
# add patterns to match one or more emoji tokens
pos_patterns = [[{'ORTH': emoji, 'OP': '+'}] for emoji in pos_emoji]
neg_patterns = [[{'ORTH': emoji, 'OP': '+'}] for emoji in neg_emoji]
matcher.add('HAPPY', label_sentiment, *pos_patterns) # add positive pattern
matcher.add('SAD', label_sentiment, *neg_patterns) # add negative pattern
# add pattern to merge valid hashtag, i.e. '#' plus any ASCII token
matcher.add('HASHTAG', merge_hashtag, [{'ORTH': '#'}, {'IS_ASCII': True}])
p
| Because the #[code on_match] callback receives the ID of each match, you
| can use the same function to handle the sentiment assignment for both
| the positive and negative pattern. To keep it simple, we'll either add
| or subtract #[code 0.1] points. This way, the score will also reflect
| combinations of emoji, even positive #[em and] negative ones.
p
| With a library like
| #[+a("https://github.com/bcongdon/python-emojipedia") Emojipedia],
| we can also retrieve a short description for each emoji. For example,
| #[span.o-icon.o-icon--inline 😍]'s official title is "Smiling Face With
| Heart-Eyes". Assigning it to the merged token's norm will make it
| available as #[code token.norm_].
+code.
from emojipedia import Emojipedia # installation: pip install emojipedia
def label_sentiment(matcher, doc, i, matches):
match_id, start, end = matches[i]
if doc.vocab.strings[match_id] == 'HAPPY': # look up the match ID's string
doc.sentiment += 0.1 # add 0.1 for positive sentiment
elif doc.vocab.strings[match_id] == 'SAD':
doc.sentiment -= 0.1 # subtract 0.1 for negative sentiment
span = doc[start : end]
emoji = Emojipedia.search(span[0].text) # get data for emoji
span.merge(norm=emoji.title) # merge span and set NORM to emoji title
p
| To label the hashtags, we first need to add a new custom flag.
| #[code IS_HASHTAG] will be the flag's ID, which you can use to assign it
| to the hashtag's span, and check its value via a token's
| #[+api("token#check_flag") #[code code check_flag()]] method. On each
| match, we merge the hashtag and assign the flag.
+code.
# Add a new custom flag to the vocab, which is always False by default
IS_HASHTAG = nlp.vocab.add_flag(lambda text: False)
def merge_hashtag(matcher, doc, i, matches):
match_id, start, end = matches[i]
span = doc[start : end]
span.merge() # merge hashtag
span.set_flag(IS_HASHTAG, True) # set IS_HASHTAG to True
p
| To process a stream of social media posts, we can use
| #[+api("language#pipe") #[code Language.pipe()]], which will return a
| stream of #[code Doc] objects that we can pass to
| #[+api("matcher#pipe") #[code Matcher.pipe()]].
+code.
docs = nlp.pipe(LOTS_OF_TWEETS)
matches = matcher.pipe(docs)

View File

@ -74,16 +74,14 @@ p
+aside-code("meta.json", "json"). +aside-code("meta.json", "json").
{ {
"name": "example_model", "name": "example_model",
"lang": "en",
"version": "1.0.0", "version": "1.0.0",
"spacy_version": "&gt;=2.0.0,&lt;3.0.0", "spacy_version": "&gt;=2.0.0,&lt;3.0.0",
"description": "Example model for spaCy", "description": "Example model for spaCy",
"author": "You", "author": "You",
"email": "you@example.com", "email": "you@example.com",
"license": "CC BY-SA 3.0", "license": "CC BY-SA 3.0",
"setup": { "pipeline": ["token_vectors", "tagger"]
"lang": "en",
"pipeline": ["token_vectors", "tagger"]
}
} }
+code(false, "bash"). +code(false, "bash").
@ -110,9 +108,9 @@ p
+h(3, "models-custom") Customising the model setup +h(3, "models-custom") Customising the model setup
p p
| The meta.json includes a #[code setup] key that lets you customise how | The meta.json includes the model details, like name, requirements and
| the model should be initialised and loaded. You can define the language | license, and lets you customise how the model should be initialised and
| data to be loaded and the | loaded. You can define the language data to be loaded and the
| #[+a("/docs/usage/language-processing-pipeline") processing pipeline] to | #[+a("/docs/usage/language-processing-pipeline") processing pipeline] to
| execute. | execute.
@ -183,9 +181,9 @@ p
p p
| To load a model from a data directory, you can use | To load a model from a data directory, you can use
| #[+api("spacy#load") #[code spacy.load()]] with the local path. This will | #[+api("spacy#load") #[code spacy.load()]] with the local path. This will
| look for a meta.json in the directory and use the #[code setup] details | look for a meta.json in the directory and use the #[code lang] and
| to initialise a #[code Language] class with a processing pipeline and | #[code pipeline] settings to initialise a #[code Language] class with a
| load in the model data. | processing pipeline and load in the model data.
+code. +code.
nlp = spacy.load('/path/to/model') nlp = spacy.load('/path/to/model')

View File

@ -65,7 +65,7 @@ p
| spaCy provides a variety of linguistic annotations to give you insights | spaCy provides a variety of linguistic annotations to give you insights
| into a text's grammatical structure. This includes the word types, | into a text's grammatical structure. This includes the word types,
| i.e. the parts of speech, and how the words are related to each other. | i.e. the parts of speech, and how the words are related to each other.
| For example, if you're analysing text, it makes a #[em huge] difference | For example, if you're analysing text, it makes a huge difference
| whether a noun is the subject of a sentence, or the object or whether | whether a noun is the subject of a sentence, or the object or whether
| "google" is used as a verb, or refers to the website or company in a | "google" is used as a verb, or refers to the website or company in a
| specific context. | specific context.
@ -94,9 +94,10 @@ p
include _spacy-101/_tokenization include _spacy-101/_tokenization
+infobox +infobox
| To learn more about how spaCy's tokenizer and its rules work in detail, | To learn more about how spaCy's tokenization rules work in detail,
| how to #[strong customise] it and how to #[strong add your own tokenizer] | how to #[strong customise and replace] the default tokenizer and how to
| to a processing pipeline, see the usage guide on | #[strong add language-specific data], see the usage guides on
| #[+a("/docs/usage/adding-languages") adding languages] and
| #[+a("/docs/usage/customizing-tokenizer") customising the tokenizer]. | #[+a("/docs/usage/customizing-tokenizer") customising the tokenizer].
+h(3, "annotations-pos-deps") Part-of-speech tags and dependencies +h(3, "annotations-pos-deps") Part-of-speech tags and dependencies
@ -118,9 +119,11 @@ include _spacy-101/_named-entities
+infobox +infobox
| To learn more about entity recognition in spaCy, how to | To learn more about entity recognition in spaCy, how to
| #[strong add your own entities] to a document and how to train and update | #[strong add your own entities] to a document and how to
| the entity predictions of a model, see the usage guide on | #[strong train and update] the entity predictions of a model, see the
| #[+a("/docs/usage/entity-recognition") named entity recognition]. | usage guides on
| #[+a("/docs/usage/entity-recognition") named entity recognition] and
| #[+a("/docs/usage/training-ner") training the named entity recognizer].
+h(2, "vectors-similarity") Word vectors and similarity +h(2, "vectors-similarity") Word vectors and similarity
+tag-model("vectors") +tag-model("vectors")

View File

@ -20,19 +20,18 @@ p
nlp = Language(pipeline=['my_factory', mycomponent]) nlp = Language(pipeline=['my_factory', mycomponent])
p p
| It's now much easier to customise the pipeline with your own components. | It's now much easier to #[strong customise the pipeline] with your own
| Components are functions that receive a #[code Doc] object, modify and | components, functions that receive a #[code Doc] object, modify and
| return it. If your component is stateful, you'll want to create a new one | return it. If your component is stateful, you can define and register a
| for each pipeline. You can do that by defining and registering a factory | factory which receives the shared #[code Vocab] object and returns a
| which receives the shared #[code Vocab] object and returns a component. |  component. spaCy's default components can be added to your pipeline by
| using their string IDs. This way, you won't have to worry about finding
p | and implementing them. Simply add #[code "tagger"] to the pipeline,
| spaCy's default components the vectorizer, tagger, parser and entity
| recognizer, can be added to your pipeline by using their string IDs.
| This way, you won't have to worry about finding and implementing them
| to use the default tagger, simply add #[code "tagger"] to the pipeline,
| and spaCy will know what to do. | and spaCy will know what to do.
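p
| A minimal sketch of such a component, assuming the #[code pipeline] keyword
| argument shown above; the function name is purely illustrative:
+code("Custom pipeline component (sketch)").
from spacy.language import Language
def print_length(doc):
    # a component receives a Doc, may modify it, and must return it
    print('Doc length:', len(doc))
    return doc
# mix a built-in component (referenced by string ID) with the custom function
nlp = Language(pipeline=['tagger', print_length])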
+image
include ../../assets/img/docs/pipeline.svg
+infobox +infobox
| #[strong API:] #[+api("language") #[code Language]] | #[strong API:] #[+api("language") #[code Language]]
| #[strong Usage:] #[+a("/docs/usage/language-processing-pipeline") Processing text] | #[strong Usage:] #[+a("/docs/usage/language-processing-pipeline") Processing text]
@ -96,11 +95,10 @@ p
| #[code Language] class, or load a model that initialises one. This allows | #[code Language] class, or load a model that initialises one. This allows
| languages to contain more custom data, e.g. lemmatizer lookup tables, or | languages to contain more custom data, e.g. lemmatizer lookup tables, or
| complex regular expressions. The language data has also been tidied up | complex regular expressions. The language data has also been tidied up
| and simplified. It's now also possible to overwrite the functions that | and simplified. spaCy now also supports simple lookup-based lemmatization.
| compute lexical attributes like #[code like_num], and supply
| language-specific syntax iterators, e.g. to determine noun chunks. spaCy +image
| now also supports simple lookup-based lemmatization. The data is stored include ../../assets/img/docs/language_data.svg
| in a dictionary mapping a string to its lemma.
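p
| A minimal sketch of such a lookup table; the entries are hypothetical
| examples, not the shipped data:
+code("Lookup lemmatization data (sketch)").
# maps an inflected form to its lemma
LOOKUP = {
    u"feet": u"foot",
    u"going": u"go",
    u"mice": u"mouse"
}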
+infobox +infobox
| #[strong API:] #[+api("language") #[code Language]] | #[strong API:] #[+api("language") #[code Language]]
@ -111,13 +109,10 @@ p
+aside-code("Example"). +aside-code("Example").
from spacy.matcher import Matcher from spacy.matcher import Matcher
from spacy.attrs import LOWER, IS_PUNCT
matcher = Matcher(nlp.vocab) matcher = Matcher(nlp.vocab)
matcher.add('HelloWorld', None, matcher.add('HEARTS', None, [{'ORTH': '❤️', 'OP': '+'}])
[{LOWER: 'hello'}, {IS_PUNCT: True}, {LOWER: 'world'}],
[{LOWER: 'hello'}, {LOWER: 'world'}])
assert len(matcher) == 1 assert len(matcher) == 1
assert 'HelloWorld' in matcher assert 'HEARTS' in matcher
p p
| Patterns can now be added to the matcher by calling | Patterns can now be added to the matcher by calling
@ -157,28 +152,8 @@ p
+cell #[+api("language#to_disk") #[code Language.to_disk]] +cell #[+api("language#to_disk") #[code Language.to_disk]]
+row +row
+cell #[code Tokenizer.load] +cell #[code Language.create_make_doc]
+cell +cell #[+api("language#attributes") #[code Language.tokenizer]]
| #[+api("tokenizer#from_disk") #[code Tokenizer.from_disk]]
| #[+api("tokenizer#from_bytes") #[code Tokenizer.from_bytes]]
+row
+cell #[code Tagger.load]
+cell
| #[+api("tagger#from_disk") #[code Tagger.from_disk]]
| #[+api("tagger#from_bytes") #[code Tagger.from_bytes]]
+row
+cell #[code DependencyParser.load]
+cell
| #[+api("dependencyparser#from_disk") #[code DependencyParser.from_disk]]
| #[+api("dependencyparser#from_bytes") #[code DependencyParser.from_bytes]]
+row
+cell #[code EntityRecognizer.load]
+cell
| #[+api("entityrecognizer#from_disk") #[code EntityRecognizer.from_disk]]
| #[+api("entityrecognizer#from_bytes") #[code EntityRecognizer.from_bytes]]
+row +row
+cell +cell
@ -212,6 +187,28 @@ p
| #[+api("stringstore#to_disk") #[code StringStore.to_disk]] | #[+api("stringstore#to_disk") #[code StringStore.to_disk]]
| #[+api("stringstore#to_bytes") #[code StringStore.to_bytes]] | #[+api("stringstore#to_bytes") #[code StringStore.to_bytes]]
+row
+cell #[code Tokenizer.load]
+cell -
+row
+cell #[code Tagger.load]
+cell
| #[+api("tagger#from_disk") #[code Tagger.from_disk]]
| #[+api("tagger#from_bytes") #[code Tagger.from_bytes]]
+row
+cell #[code DependencyParser.load]
+cell
| #[+api("dependencyparser#from_disk") #[code DependencyParser.from_disk]]
| #[+api("dependencyparser#from_bytes") #[code DependencyParser.from_bytes]]
+row
+cell #[code EntityRecognizer.load]
+cell
| #[+api("entityrecognizer#from_disk") #[code EntityRecognizer.from_disk]]
| #[+api("entityrecognizer#from_bytes") #[code EntityRecognizer.from_bytes]]
+row +row
+cell #[code Matcher.load] +cell #[code Matcher.load]
+cell - +cell -
@ -232,7 +229,7 @@ p
+row +row
+cell #[code Doc.read_bytes] +cell #[code Doc.read_bytes]
+cell +cell #[+api("binder") #[code Binder]]
+row +row
+cell #[code Token.is_ancestor_of] +cell #[code Token.is_ancestor_of]