Merge branch 'develop' of https://github.com/explosion/spaCy into develop
@@ -1,10 +1,7 @@
 # coding: utf8
 from __future__ import unicode_literals

-import importlib
-
-from .compat import basestring_
-from .cli.info import info
+from .cli.info import info as cli_info
 from .glossary import explain
 from .deprecated import resolve_load_name
 from . import util

@@ -12,11 +9,8 @@ from . import util

 def load(name, **overrides):
     name = resolve_load_name(name, **overrides)
-    model_path = util.resolve_model_path(name)
-    meta = util.parse_package_meta(model_path)
-    if 'lang' not in meta:
-        raise IOError('No language setting found in model meta.')
-    cls = util.get_lang_class(meta['lang'])
-    overrides['meta'] = meta
-    overrides['path'] = model_path
-    return cls(**overrides)
+    return util.load_model(name)
+
+
+def info(model=None, markdown=False):
+    return cli_info(None, model, markdown)
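With this change, spacy.load() only resolves deprecated arguments and hands off to util.load_model(), which returns an initialised Language object rather than a path. A minimal usage sketch, assuming an 'en' shortcut link or package is installed (the same call the docs below use):

    import spacy

    nlp = spacy.load('en')               # shortcut link, package name or path
    doc = nlp(u'This is a sentence.')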
@@ -19,6 +19,8 @@ import numpy

 def _init_for_precomputed(W, ops):
+    if (W**2).sum() != 0.:
+        return
     reshaped = W.reshape((W.shape[1], W.shape[0] * W.shape[2]))
     ops.xavier_uniform_init(reshaped)
     W[:] = reshaped.reshape(W.shape)

@@ -247,6 +249,7 @@ def doc2feats(cols=None):
     model.cols = cols
     return model


 def print_shape(prefix):
     def forward(X, drop=0.):
         return X, lambda dX, **kwargs: dX
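The new guard makes _init_for_precomputed idempotent: if (W**2).sum() is non-zero, the weights have already been initialised or trained and are left alone. A rough numpy equivalent of the reshape round-trip, with a hypothetical (nF, nO, nI) shape and a uniform draw standing in for ops.xavier_uniform_init:

    import numpy

    W = numpy.zeros((10, 64, 128))       # hypothetical (nF, nO, nI) weights
    if (W ** 2).sum() == 0.:             # only initialise untouched weights
        reshaped = W.reshape((W.shape[1], W.shape[0] * W.shape[2]))
        reshaped[:] = numpy.random.uniform(-0.1, 0.1, reshaped.shape)
        W[:] = reshaped.reshape(W.shape)  # write back in the original shape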
@@ -24,8 +24,9 @@ CONVERTERS = {
     n_sents=("Number of sentences per doc", "option", "n", float),
     morphology=("Enable appending morphology to tags", "flag", "m", bool)
 )
-def convert(_, input_file, output_dir, n_sents, morphology):
-    """Convert files into JSON format for use with train command and other
+def convert(cmd, input_file, output_dir, n_sents, morphology):
+    """
+    Convert files into JSON format for use with train command and other
     experiment management functions.
     """
     input_path = Path(input_file)

@@ -39,4 +40,4 @@ def convert(cmd, input_file, output_dir, n_sents, morphology):
         prints("Can't find converter for %s" % input_path.parts[-1],
                title="Unknown format", exits=1)
     CONVERTERS[file_ext](input_path, output_path,
-                         n_sents=n_sents, morphology=morphology)
+                         n_sents=n_sents, use_morphology=morphology)
@@ -17,8 +17,9 @@ from .. import about
     direct=("force direct download. Needs model name with version and won't "
             "perform compatibility check", "flag", "d", bool)
 )
-def download(model, direct=False):
-    """Download compatible model from default download path using pip. Model
+def download(cmd, model, direct=False):
+    """
+    Download compatible model from default download path using pip. Model
     can be shortcut, model name or, if --direct flag is set, full model name
     with version.
     """

@@ -31,7 +32,7 @@ def download(cmd, model, direct=False):
     version = get_version(model_name, compatibility)
     download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model_name, v=version))
     try:
-        link(model_name, model, force=True)
+        link(None, model_name, model, force=True)
     except:
         # Dirty, but since spacy.download and the auto-linking is mostly
        # a convenience wrapper, it's best to show a success message and
@@ -14,14 +14,20 @@ from .. import util
     model=("optional: shortcut link of model", "positional", None, str),
     markdown=("generate Markdown for GitHub issues", "flag", "md", str)
 )
-def info(model=None, markdown=False):
+def info(cmd, model=None, markdown=False):
     """Print info about spaCy installation. If a model shortcut link is
     specified as an argument, print model information. Flag --markdown
     prints details in Markdown for easy copy-pasting to GitHub issues.
     """
     if model:
-        model_path = util.resolve_model_path(model)
-        meta = util.parse_package_meta(model_path)
+        if util.is_package(model):
+            model_path = util.get_package_path(model)
+        else:
+            model_path = util.get_data_path() / model
+        meta_path = model_path / 'meta.json'
+        if not meta_path.is_file():
+            prints(meta_path, title="Can't find model meta.json", exits=1)
+        meta = read_json(meta_path)
         if model_path.resolve() != model_path:
             meta['link'] = path2str(model_path)
             meta['source'] = path2str(model_path.resolve())
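The resolve() comparison above is how info distinguishes a shortcut link from real model data: for a symlink in spacy/data, the resolved path differs from the path itself, so both ends are recorded. A small standalone sketch of that check, with a hypothetical install location:

    from pathlib import Path

    meta = {}
    model_path = Path('/usr/lib/python3.6/site-packages/spacy/data/en')  # hypothetical link
    if model_path.resolve() != model_path:           # path is a symlink
        meta['link'] = str(model_path)               # the link itself
        meta['source'] = str(model_path.resolve())   # the package it points to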
@@ -14,13 +14,14 @@ from .. import util
     link_name=("name of shortcut link to create", "positional", None, str),
     force=("force overwriting of existing link", "flag", "f", bool)
 )
-def link(origin, link_name, force=False):
-    """Create a symlink for models within the spacy/data directory. Accepts
+def link(cmd, origin, link_name, force=False):
+    """
+    Create a symlink for models within the spacy/data directory. Accepts
     either the name of a pip package, or the local path to the model data
     directory. Linking models allows loading them via spacy.load(link_name).
     """
     if util.is_package(origin):
-        model_path = util.get_model_package_path(origin)
+        model_path = util.get_package_path(model)
     else:
         model_path = Path(origin)
     if not model_path.exists():
@@ -18,8 +18,9 @@ from .. import about
     meta=("path to meta.json", "option", "m", str),
     force=("force overwriting of existing folder in output directory", "flag", "f", bool)
 )
-def package(input_dir, output_dir, meta, force):
-    """Generate Python package for model data, including meta and required
+def package(cmd, input_dir, output_dir, meta=None, force=False):
+    """
+    Generate Python package for model data, including meta and required
     installation files. A new directory will be created in the specified
     output directory, and model data will be copied over.
     """

@@ -42,7 +43,7 @@ def package(cmd, input_dir, output_dir, meta=None, force=False):
         meta = util.read_json(meta_path)
     else:
         meta = generate_meta()
-    validate_meta(meta, ['lang', 'name', 'version'])
+    meta = validate_meta(meta, ['lang', 'name', 'version'])

     model_name = meta['lang'] + '_' + meta['name']
     model_name_v = model_name + '-' + meta['version']

@@ -85,20 +86,32 @@ def generate_meta():
         ('email', 'Author email', False),
         ('url', 'Author website', False),
         ('license', 'License', 'CC BY-NC 3.0')]

     prints("Enter the package settings for your model.", title="Generating meta.json")
     meta = {}
     for setting, desc, default in settings:
         response = util.get_raw_input(desc, default)
         meta[setting] = default if response == '' and default else response
+    meta['pipeline'] = generate_pipeline()
     return meta


+def generate_pipeline():
+    prints("If set to 'True', the default pipeline is used. If set to 'False', "
+           "the pipeline will be disabled. Components should be specified as a "
+           "comma-separated list of component names, e.g. vectorizer, tagger, "
+           "parser, ner. For more information, see the docs on processing pipelines.",
+           title="Enter your model's pipeline components")
+    pipeline = util.get_raw_input("Pipeline components", True)
+    replace = {'True': True, 'False': False}
+    return replace[pipeline] if pipeline in replace else pipeline.split(', ')
+
+
 def validate_meta(meta, keys):
     for key in keys:
         if key not in meta or meta[key] == '':
             prints("This setting is required to build your package.",
                    title='No "%s" setting found in meta.json' % key, exits=1)
+    return meta


 def get_template(filepath):
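generate_pipeline() accepts either a boolean-like answer or a comma-separated component list; anything that isn't the literal string 'True' or 'False' is split on ', '. The mapping, illustrated on the three kinds of input:

    replace = {'True': True, 'False': False}
    for response in ('True', 'False', 'vectorizer, tagger, parser'):
        value = replace[response] if response in replace else response.split(', ')
        print(value)
    # True
    # False
    # ['vectorizer', 'tagger', 'parser']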
@@ -32,9 +32,11 @@ from .. import displacy
     no_parser=("Don't train parser", "flag", "P", bool),
     no_entities=("Don't train NER", "flag", "N", bool)
 )
-def train(_, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
+def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
           use_gpu=False, no_tagger=False, no_parser=False, no_entities=False):
-    """Train a model. Expects data in spaCy's JSON format."""
+    """
+    Train a model. Expects data in spaCy's JSON format.
+    """
     n_sents = n_sents or None
     output_path = util.ensure_path(output_dir)
     train_path = util.ensure_path(train_data)

@@ -84,11 +86,11 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
                 pbar.update(len(docs))

             with nlp.use_params(optimizer.averages):
-                scorer = nlp.evaluate(corpus.dev_docs(nlp, gold_preproc=False))
                 with (output_path / ('model%d.pickle' % i)).open('wb') as file_:
                     dill.dump(nlp, file_, -1)
-
+            with (output_path / ('model%d.pickle' % i)).open('rb') as file_:
+                nlp_loaded = dill.load(file_)
+            scorer = nlp_loaded.evaluate(corpus.dev_docs(nlp_loaded, gold_preproc=False))
             print_progress(i, losses, scorer.scores)
     finally:
         print("Saving model...")
@@ -1,8 +1,8 @@
 # coding: utf8
 from __future__ import unicode_literals

-from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, UNITS
-from ..char_classes import ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS, QUOTES
+from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS
+from ..char_classes import ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS, QUOTES, UNITS


 _currency = r"\$|¢|£|€|¥|฿|৳"

@@ -10,16 +10,16 @@ _quotes = QUOTES.replace("'", '')
 _list_punct = LIST_PUNCT + '। ॥'.strip().split()


-_prefixes = ([r'\+'] + _list_punct + LIST_ELLIPSES + LIST_QUOTES)
+_prefixes = ([r'\+'] + _list_punct + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS)

-_suffixes = (_list_punct + LIST_ELLIPSES + LIST_QUOTES +
+_suffixes = (_list_punct + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS +
             [r'(?<=[0-9])\+',
              r'(?<=°[FfCcKk])\.',
              r'(?<=[0-9])(?:{})'.format(_currency),
              r'(?<=[0-9])(?:{})'.format(UNITS),
              r'(?<=[{}(?:{})])\.'.format('|'.join([ALPHA_LOWER, r'%²\-\)\]\+', QUOTES]), _currency)])

-_infixes = (LIST_ELLIPSES +
+_infixes = (LIST_ELLIPSES + LIST_ICONS +
            [r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
             r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
             r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA),
@@ -20,7 +20,6 @@ _upper = [_latin_upper]
 _lower = [_latin_lower]
 _uncased = [_bengali, _hebrew]

-
 ALPHA = merge_char_classes(_upper + _lower + _uncased)
 ALPHA_LOWER = merge_char_classes(_lower + _uncased)
 ALPHA_UPPER = merge_char_classes(_upper + _uncased)

@@ -33,13 +32,14 @@ _currency = r'\$ £ € ¥ ฿ US\$ C\$ A\$'
 _punct = r'… , : ; \! \? ¿ ¡ \( \) \[ \] \{ \} < > _ # \* &'
 _quotes = r'\' \'\' " ” “ `` ` ‘ ´ ‚ , „ » «'
 _hyphens = '- – — -- ---'
+_other_symbols = r'[\p{So}]'

 UNITS = merge_chars(_units)
 CURRENCY = merge_chars(_currency)
 QUOTES = merge_chars(_quotes)
 PUNCT = merge_chars(_punct)
 HYPHENS = merge_chars(_hyphens)
+ICONS = _other_symbols

 LIST_UNITS = split_chars(_units)
 LIST_CURRENCY = split_chars(_currency)

@@ -47,3 +47,4 @@ LIST_QUOTES = split_chars(_quotes)
 LIST_PUNCT = split_chars(_punct)
 LIST_HYPHENS = split_chars(_hyphens)
 LIST_ELLIPSES = [r'\.\.+', '…']
+LIST_ICONS = [_other_symbols]
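The new class reuses the regex Unicode category \p{So} (Symbol, other), which covers emoji and pictographs. ICONS keeps it as a single ready-made pattern, while LIST_ICONS wraps it in a list so it can be concatenated with the other LIST_* groups. The merge/split helpers presumably behave along these lines (a sketch, not the actual implementations):

    _hyphens = '- – — -- ---'
    LIST_HYPHENS = _hyphens.split(' ')      # split_chars: one pattern per symbol
    HYPHENS = _hyphens.replace(' ', '|')    # merge_chars: one alternation pattern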
@@ -35,4 +35,4 @@ class English(Language):
     Defaults = EnglishDefaults


-__all__ = ['English', 'EnglishDefaults']
+__all__ = ['English']
@@ -2,15 +2,16 @@
 from __future__ import unicode_literals

 from .char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY
-from .char_classes import ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS, QUOTES
-from .char_classes import CURRENCY, UNITS
+from .char_classes import LIST_ICONS, ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS
+from .char_classes import QUOTES, CURRENCY, UNITS


 _prefixes = (['§', '%', '=', r'\+'] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES +
-             LIST_CURRENCY)
+             LIST_CURRENCY + LIST_ICONS)


-_suffixes = (["'s", "'S", "’s", "’S"] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES +
+_suffixes = (LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS +
+             ["'s", "'S", "’s", "’S"] +
             [r'(?<=[0-9])\+',
              r'(?<=°[FfCcKk])\.',
              r'(?<=[0-9])(?:{})'.format(CURRENCY),

@@ -19,7 +20,7 @@ _suffixes = (["'s", "'S", "’s", "’S"] + LIST_PUNCT + LIST_ELLIPSES + LIST_QU
              r'(?<=[{a}][{a}])\.'.format(a=ALPHA_UPPER)])


-_infixes = (LIST_ELLIPSES +
+_infixes = (LIST_ELLIPSES + LIST_ICONS +
            [r'(?<=[0-9])[+\-\*^](?=[0-9-])',
             r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
             r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
spacy/lang/xx/__init__.py (new file, 26 lines)

@@ -0,0 +1,26 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from ...language import Language
+from ...attrs import LANG
+from ...util import update_exc
+
+
+class MultiLanguageDefaults(Language.Defaults):
+    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+    lex_attr_getters[LANG] = lambda text: 'xx'
+
+    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
+
+
+class MultiLanguage(Language):
+    """Language class to be used for models that support multiple languages.
+    This module allows models to specify their language ID as 'xx'.
+    """
+    lang = 'xx'
+    Defaults = MultiLanguageDefaults
+
+
+__all__ = ['MultiLanguage']
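The new multi-language class can be used directly or resolved lazily from its language ID, mirroring the usage examples added to the docs further down:

    from spacy.lang.xx import MultiLanguage
    nlp = MultiLanguage()

    # or via the language ID, for lazy-loading:
    from spacy.util import get_lang_class
    nlp = get_lang_class('xx')()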
@@ -337,6 +337,9 @@ cdef class NeuralDependencyParser(NeuralParser):
     name = 'parser'
     TransitionSystem = ArcEager

+    def __reduce__(self):
+        return (NeuralDependencyParser, (self.vocab, self.moves, self.model), None, None)
+

 cdef class NeuralEntityRecognizer(NeuralParser):
     name = 'entity'

@@ -344,6 +347,10 @@ cdef class NeuralEntityRecognizer(NeuralParser):

     nr_feature = 6

+    def __reduce__(self):
+        return (NeuralEntityRecognizer, (self.vocab, self.moves, self.model), None, None)
+
+
 cdef class BeamDependencyParser(BeamParser):
     TransitionSystem = ArcEager
@@ -335,17 +335,18 @@ cdef cppclass StateC:
             this._break = this._b_i

     void clone(const StateC* src) nogil:
+        this.length = src.length
         memcpy(this._sent, src._sent, this.length * sizeof(TokenC))
         memcpy(this._stack, src._stack, this.length * sizeof(int))
         memcpy(this._buffer, src._buffer, this.length * sizeof(int))
         memcpy(this._ents, src._ents, this.length * sizeof(Entity))
         memcpy(this.shifted, src.shifted, this.length * sizeof(this.shifted[0]))
         this.length = src.length
         this._b_i = src._b_i
         this._s_i = src._s_i
         this._e_i = src._e_i
         this._break = src._break
         this.offset = src.offset
         this._empty_token = src._empty_token

     void fast_forward() nogil:
         # space token attachment policy:
@@ -9,6 +9,7 @@ import ctypes
 from libc.stdint cimport uint32_t
 from libc.string cimport memcpy
 from cymem.cymem cimport Pool
+from collections import OrderedDict

 from .stateclass cimport StateClass
 from ._state cimport StateC, is_space_token

@@ -312,12 +313,13 @@ cdef class ArcEager(TransitionSystem):
     @classmethod
     def get_actions(cls, **kwargs):
         actions = kwargs.get('actions',
-            {
-                SHIFT: [''],
-                REDUCE: [''],
-                RIGHT: [],
-                LEFT: [],
-                BREAK: ['ROOT']})
+            OrderedDict((
+                (SHIFT, ['']),
+                (REDUCE, ['']),
+                (RIGHT, []),
+                (LEFT, []),
+                (BREAK, ['ROOT'])
+            )))
         seen_actions = set()
         for label in kwargs.get('left_labels', []):
             if label.upper() != 'ROOT':
@@ -2,6 +2,7 @@
 from __future__ import unicode_literals

 from thinc.typedefs cimport weight_t
+from collections import OrderedDict

 from .stateclass cimport StateClass
 from ._state cimport StateC

@@ -51,17 +52,29 @@ cdef bint _entity_is_sunk(StateClass st, Transition* golds) nogil:


 cdef class BiluoPushDown(TransitionSystem):
+    def __init__(self, *args, **kwargs):
+        TransitionSystem.__init__(self, *args, **kwargs)
+
+    def __reduce__(self):
+        labels_by_action = OrderedDict()
+        cdef Transition t
+        for trans in self.c[:self.n_moves]:
+            label_str = self.strings[trans.label]
+            labels_by_action.setdefault(trans.move, []).append(label_str)
+        return (BiluoPushDown, (self.strings, labels_by_action),
+                None, None)
+
     @classmethod
     def get_actions(cls, **kwargs):
         actions = kwargs.get('actions',
-            {
-                MISSING: [''],
-                BEGIN: [],
-                IN: [],
-                LAST: [],
-                UNIT: [],
-                OUT: ['']
-            })
+            OrderedDict((
+                (MISSING, ['']),
+                (BEGIN, []),
+                (IN, []),
+                (LAST, []),
+                (UNIT, []),
+                (OUT, [''])
+            )))
         seen_entities = set()
         for entity_type in kwargs.get('entity_types', []):
             if entity_type in seen_entities:

@@ -90,7 +103,7 @@ cdef class BiluoPushDown(TransitionSystem):
     def move_name(self, int move, int label):
         if move == OUT:
             return 'O'
-        elif move == 'MISSING':
+        elif move == MISSING:
             return 'M'
         else:
             return MOVE_NAMES[move] + '-' + self.strings[label]
@@ -527,6 +527,14 @@ cdef class Parser:
         xp.add.at(d_tokvecs,
             ids, d_state_features * active_feats)

+    @property
+    def move_names(self):
+        names = []
+        for i in range(self.moves.n_moves):
+            name = self.moves.move_name(self.moves.c[i].move, self.moves.c[i].label)
+            names.append(name)
+        return names
+
     def get_batch_model(self, batch_size, tokvecs, stream, dropout):
         lower, upper = self.model
         state2vec = precompute_hiddens(batch_size, tokvecs,
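The new move_names property lists the transition inventory in the order the moves were added, each formatted by move_name() as the move letter plus an optional label (see BiluoPushDown.move_name above). A hypothetical inspection:

    names = nlp.parser.move_names    # hypothetical trained parser instance
    # e.g. ['S', 'D', 'L-nsubj', 'R-dobj', 'B-ROOT', ...]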
@@ -5,7 +5,7 @@ from __future__ import unicode_literals
 from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
 from cymem.cymem cimport Pool
 from thinc.typedefs cimport weight_t
-from collections import defaultdict
+from collections import defaultdict, OrderedDict

 from ..structs cimport TokenC
 from .stateclass cimport StateClass

@@ -26,7 +26,7 @@ cdef void* _init_state(Pool mem, int length, void* tokens) except NULL:


 cdef class TransitionSystem:
-    def __init__(self, StringStore string_table, dict labels_by_action):
+    def __init__(self, StringStore string_table, labels_by_action):
         self.mem = Pool()
         self.strings = string_table
         self.n_moves = 0

@@ -34,14 +34,14 @@ cdef class TransitionSystem:

         self.c = <Transition*>self.mem.alloc(self._size, sizeof(Transition))

-        for action, label_strs in sorted(labels_by_action.items()):
+        for action, label_strs in labels_by_action.items():
             for label_str in label_strs:
                 self.add_action(int(action), label_str)
         self.root_label = self.strings['ROOT']
         self.init_beam_state = _init_state

     def __reduce__(self):
-        labels_by_action = {}
+        labels_by_action = OrderedDict()
         cdef Transition t
         for trans in self.c[:self.n_moves]:
             label_str = self.strings[trans.label]

@@ -77,6 +77,11 @@ cdef class TransitionSystem:
                 history.append(i)
                 action.do(state.c, action.label)
                 break
+        else:
+            print(gold.words)
+            print(gold.ner)
+            print(history)
+            raise ValueError("Could not find gold move")
         return history

     cdef int initialize_state(self, StateC* state) nogil:
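Swapping dict for OrderedDict here matters for serialisation: __reduce__ rebuilds the transition system from labels_by_action, and each add_action call assigns the next integer move ID, so iteration order determines the IDs a restored model uses. On Python 2.7 a plain dict iterates in arbitrary order; OrderedDict makes the round-trip deterministic without the old sorted() workaround. A sketch with hypothetical move constants:

    from collections import OrderedDict

    SHIFT, REDUCE, RIGHT, LEFT, BREAK = range(5)   # hypothetical constants
    actions = OrderedDict((
        (SHIFT, ['']), (REDUCE, ['']), (RIGHT, []), (LEFT, []), (BREAK, ['ROOT']),
    ))
    for action, label_strs in actions.items():     # stable insertion order
        for label_str in label_strs:
            print(action, label_str)               # IDs assigned in this order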
@@ -1,7 +1,4 @@
 # coding: utf-8
-"""Test that tokenizer exceptions and emoticons are handled correctly."""
-
-
 from __future__ import unicode_literals

 import pytest

@@ -39,3 +36,12 @@ def test_tokenizer_handles_emoticons(tokenizer):
 def test_tokenizer_excludes_false_pos_emoticons(tokenizer, text, length):
     tokens = tokenizer(text)
     assert len(tokens) == length
+
+
+@pytest.mark.parametrize('text,length', [('can you still dunk?🍕🍔😵LOL', 8),
+                                         ('i💙you', 3), ('🤘🤘yay!', 4)])
+def test_tokenizer_handles_emoji(tokenizer, text, length):
+    exceptions = ["hu"]
+    tokens = tokenizer(text)
+    if tokens[0].lang_ not in exceptions:
+        assert len(tokens) == length
spacy/util.py (133 lines changed)

@@ -78,27 +78,86 @@ def ensure_path(path):
     return path


-def resolve_model_path(name):
-    """Resolve a model name or string to a model path.
+def load_model(name):
+    """Load a model from a shortcut link, package or data path.

     name (unicode): Package name, shortcut link or model path.
-    RETURNS (Path): Path to model data directory.
+    RETURNS (Language): `Language` class with the loaded model.
     """
     data_path = get_data_path()
     if not data_path or not data_path.exists():
         raise IOError("Can't find spaCy data path: %s" % path2str(data_path))
     if isinstance(name, basestring_):
-        if (data_path / name).exists():  # in data dir or shortcut link
-            return (data_path / name)
-        if is_package(name):  # installed as a package
-            return get_model_package_path(name)
-        if Path(name).exists():  # path to model
-            return Path(name)
-    elif hasattr(name, 'exists'):  # Path or Path-like object
-        return name
+        if (data_path / name).exists():  # in data dir or shortcut
+            return load_model_from_path(data_path / name)
+        if is_package(name):  # installed as package
+            return load_model_from_pkg(name)
+        if Path(name).exists():  # path to model data directory
+            return load_data_from_path(Path(name))
+    elif hasattr(name, 'exists'):  # Path or Path-like to model data
+        return load_data_from_path(name)
     raise IOError("Can't find model '%s'" % name)


+def load_model_from_init_py(init_file):
+    """Helper function to use in the `load()` method of a model package's
+    __init__.py.
+
+    init_file (unicode): Path to model's __init__.py, i.e. `__file__`.
+    RETURNS (Language): `Language` class with loaded model.
+    """
+    model_path = Path(init_file).parent
+    return load_data_from_path(model_path, package=True)
+
+
+def load_model_from_path(model_path):
+    """Import and load a model package from its file path.
+
+    path (unicode or Path): Path to package directory.
+    RETURNS (Language): `Language` class with loaded model.
+    """
+    model_path = ensure_path(model_path)
+    spec = importlib.util.spec_from_file_location('model', model_path)
+    module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(module)
+    return module.load()
+
+
+def load_model_from_pkg(name):
+    """Import and load a model package.
+
+    name (unicode): Name of model package installed via pip.
+    RETURNS (Language): `Language` class with loaded model.
+    """
+    module = importlib.import_module(name)
+    return module.load()
+
+
+def load_data_from_path(model_path, package=False):
+    """Initialise a `Language` class with a loaded model from a model data path.
+
+    model_path (unicode or Path): Path to model data directory.
+    package (bool): Does the path point to the parent package directory?
+    RETURNS (Language): `Language` class with loaded model.
+    """
+    model_path = ensure_path(model_path)
+    meta_path = model_path / 'meta.json'
+    if not meta_path.is_file():
+        raise IOError("Could not read meta.json from %s" % meta_path)
+    meta = read_json(meta_path)
+    for setting in ['lang', 'name', 'version']:
+        if setting not in meta:
+            raise IOError('No %s setting found in model meta.json' % setting)
+    if package:
+        model_data_path = '%s_%s-%s' % (meta['lang'], meta['name'], meta['version'])
+        model_path = model_path / model_data_path
+    if not model_path.exists():
+        raise ValueError("Can't find model directory: %s" % path2str(model_path))
+    cls = get_lang_class(meta['lang'])
+    nlp = cls(pipeline=meta.get('pipeline', True))
+    return nlp.from_disk(model_path)
+
+
 def is_package(name):
     """Check if string maps to a package installed via pip.
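load_model_from_init_py() is meant to be called from a model package's own __init__.py, exactly as the updated util docs further down show:

    # __init__.py of a model package (from the example in the docs below)
    from spacy.util import load_model_from_init_py

    def load():
        return load_model_from_init_py(__file__)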
@@ -112,36 +171,16 @@ def is_package(name):
     return False


-def get_model_package_path(package_name):
-    """Get path to a model package installed via pip.
+def get_package_path(name):
+    """Get the path to an installed package.

-    package_name (unicode): Name of installed package.
-    RETURNS (Path): Path to model data directory.
+    name (unicode): Package name.
+    RETURNS (Path): Path to installed package.
     """
     # Here we're importing the module just to find it. This is worryingly
     # indirect, but it's otherwise very difficult to find the package.
     # Python's installation and import rules are very complicated.
     pkg = importlib.import_module(package_name)
-    package_path = Path(pkg.__file__).parent.parent
-    meta = parse_package_meta(package_path / package_name)
-    model_name = '%s-%s' % (package_name, meta['version'])
-    return package_path / package_name / model_name
-
-
-def parse_package_meta(package_path, require=True):
-    """Check if a meta.json exists in a package and return its contents.
-
-    package_path (Path): Path to model package directory.
-    require (bool): If True, raise error if no meta.json is found.
-    RETURNS (dict or None): Model meta.json data or None.
-    """
-    location = package_path / 'meta.json'
-    if location.is_file():
-        return read_json(location)
-    elif require:
-        raise IOError("Could not read meta.json from %s" % location)
-    else:
-        return None
+    return Path(pkg.__file__).parent


 def is_in_jupyter():
@@ -177,10 +216,13 @@ def get_async(stream, numpy_array):

 def itershuffle(iterable, bufsize=1000):
     """Shuffle an iterator. This works by holding `bufsize` items back
-    and yielding them sometime later. Obviously, this is not unbiased --
+    and yielding them sometime later. Obviously, this is not unbiased –
     but should be good enough for batching. Larger bufsize means less bias.

     From https://gist.github.com/andres-erbsen/1307752
+
+    iterable (iterable): Iterator to shuffle.
+    bufsize (int): Items to hold back.
+    YIELDS (iterable): The shuffled iterator.
     """
     iterable = iter(iterable)
     buf = []
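A buffered shuffle along the lines the docstring describes — hold up to bufsize items back, release them in random order — might look like this (a sketch, not the actual implementation):

    import random

    def itershuffle(iterable, bufsize=1000):
        iterator = iter(iterable)
        buf = []
        for item in iterator:
            buf.append(item)
            if len(buf) >= bufsize:
                random.shuffle(buf)
                for _ in range(bufsize // 2):   # release half, hold half back
                    yield buf.pop()
        random.shuffle(buf)
        while buf:                              # drain the buffer at the end
            yield buf.pop()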
@@ -315,17 +357,16 @@ def normalize_slice(length, start, stop, step=None):


 def compounding(start, stop, compound):
-    '''Yield an infinite series of compounding values. Each time the
+    """Yield an infinite series of compounding values. Each time the
     generator is called, a value is produced by multiplying the previous
     value by the compound rate.

-    EXAMPLE
-
+    EXAMPLE:
     >>> sizes = compounding(1., 10., 1.5)
     >>> assert next(sizes) == 1.
     >>> assert next(sizes) == 1 * 1.5
     >>> assert next(sizes) == 1.5 * 1.5
-    '''
+    """
     def clip(value):
         return max(value, stop) if (start>stop) else min(value, stop)
     curr = float(start)
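The hunk cuts off after clip(); given the docstring's doctest, the generator loop presumably continues by yielding the clipped value and multiplying by the compound rate:

    def compounding(start, stop, compound):
        def clip(value):
            return max(value, stop) if (start > stop) else min(value, stop)
        curr = float(start)
        while True:
            yield clip(curr)
            curr *= compound

    sizes = compounding(1., 10., 1.5)
    assert next(sizes) == 1.
    assert next(sizes) == 1 * 1.5
    assert next(sizes) == 1.5 * 1.5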
@@ -335,7 +376,7 @@ def compounding(start, stop, compound):


 def decaying(start, stop, decay):
-    '''Yield an infinite series of linearly decaying values.'''
+    """Yield an infinite series of linearly decaying values."""
     def clip(value):
         return max(value, stop) if (start>stop) else min(value, stop)
     nr_upd = 1.
@@ -344,12 +385,6 @@ def decaying(start, stop, decay):
         nr_upd += 1


-def check_renamed_kwargs(renamed, kwargs):
-    for old, new in renamed.items():
-        if old in kwargs:
-            raise TypeError("Keyword argument %s now renamed to %s" % (old, new))
-
-
 def read_json(location):
     """Open and load JSON from file.
@@ -53,8 +53,6 @@ cdef class Vocab:
         vice versa.
         RETURNS (Vocab): The newly constructed vocab object.
         """
-        util.check_renamed_kwargs({'get_lex_attr': 'lex_attr_getters'}, deprecated_kwargs)
-
         lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {}
         tag_map = tag_map if tag_map is not None else {}
         if lemmatizer in (None, True, False):
@@ -1,9 +1,9 @@
 <svg class="o-svg" xmlns="http://www.w3.org/2000/svg" viewBox="-1 -1 907 737" width="906" height="746">
     <style>
-        .svg__architecture__text-large { fill: #1a1e23; font: 20px "Source Sans Pro" }
-        .svg__architecture__text-medium { fill: #1a1e23; font: 17px "Source Sans Pro" }
-        .svg__architecture__text-small { fill: #1a1e23; font: bold 14px "Source Sans Pro" }
-        .svg__architecture__text-code { fill: #1a1e23; font: 600 12px "Source Code Pro" }
+        .svg__architecture__text-large { fill: #1a1e23; font: 20px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
+        .svg__architecture__text-medium { fill: #1a1e23; font: 17px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
+        .svg__architecture__text-small { fill: #1a1e23; font: bold 14px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
+        .svg__architecture__text-code { fill: #1a1e23; font: 600 12px "Source Code Pro", Monaco, "Courier New", monospace }
     </style>
     <ellipse cx="404" cy="203" fill="#dae8fc" stroke="#09a3d5" stroke-width="2" rx="74.8" ry="49.8"/>
     <text class="svg__architecture__text-large" transform="translate(362.5 206.5)" width="81" height="40">Language</text>
@@ -1,8 +1,8 @@
 <svg class="o-svg" xmlns="http://www.w3.org/2000/svg" width="931" height="456" viewBox="-1 -1 932 480" preserveAspectRatio="xMinYMin meet">
     <style>
-        .svg__langdata__text-large { fill: #1a1e23; font: 20px "Source Sans Pro" }
-        .svg__langdata__text-small { fill: #1a1e23; font: bold 16px "Source Sans Pro" }
-        .svg__langdata__text-tiny { fill: #1a1e23; font: bold 16px "Source Sans Pro" }
+        .svg__langdata__text-large { fill: #1a1e23; font: 20px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
+        .svg__langdata__text-small { fill: #1a1e23; font: bold 16px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
+        .svg__langdata__text-tiny { fill: #1a1e23; font: bold 16px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
     </style>
     <path fill="none" stroke="#b85450" stroke-width="3" stroke-miterlimit="10" d="M610 404h-69.8" stroke-dasharray="1 6" stroke-linecap="round"/>
     <path fill="#b85450" stroke="#b85450" stroke-width="2" stroke-miterlimit="10" d="M534.2 404l8-4-2 4 2 4z"/>
@@ -1,8 +1,8 @@
 <svg class="o-svg" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 923 200" width="923" height="200">
     <style>
-        .svg__pipeline__text { fill: #1a1e23; font: 20px "Source Sans Pro" }
-        .svg__pipeline__text-small { fill: #1a1e23; font: bold 18px "Source Sans Pro" }
-        .svg__pipeline__text-code { fill: #1a1e23; font: 600 16px "Source Code Pro" }
+        .svg__pipeline__text { fill: #1a1e23; font: 20px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
+        .svg__pipeline__text-small { fill: #1a1e23; font: bold 18px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
+        .svg__pipeline__text-code { fill: #1a1e23; font: 600 16px "Source Code Pro", Monaco, "Courier New", monospace }
     </style>
     <rect width="601" height="127" x="159" y="21" fill="none" stroke="#09a3d5" stroke-width="3" rx="19.1" stroke-dasharray="3 6" ry="19.1"/>
     <path fill="#e1d5e7" stroke="#9673a6" stroke-width="2" d="M801 55h120v60H801z"/>
website/assets/img/docs/tokenization.svg (new file, 123 lines)

@@ -0,0 +1,123 @@
[New SVG graphic: a step-by-step diagram of tokenizing “Let’s go to N.Y.!” — the string is first split on whitespace, then each substring is repeatedly checked against prefix, suffix and tokenizer-exception rules (labels: PREFIX, SUFFIX, EXCEPTION, DONE) until every token is complete. Font styles fall back from "Source Sans Pro"/"Source Code Pro" to generic stacks, matching the other updated SVGs.]
@@ -1,9 +1,9 @@
 <svg class="o-svg" xmlns="http://www.w3.org/2000/svg" viewBox="-10 -10 582 365" width="572" height="355">
     <style>
-        .svg__vocab__text { fill: #1a1e23; font: 18px "Source Sans Pro" }
-        .svg__vocab__text-large { fill: #fff; font: bold 18px "Source Sans Pro"; text-transform: uppercase }
-        .svg__vocab__text-box { fill: #fff; font: bold 12px "Source Code Pro" }
-        .svg__vocab__text-code { fill: #1a1e23; font: bold 12px "Source Code Pro" }
+        .svg__vocab__text { fill: #1a1e23; font: 18px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
+        .svg__vocab__text-large { fill: #fff; font: bold 18px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif; text-transform: uppercase }
+        .svg__vocab__text-box { fill: #fff; font: bold 12px "Source Code Pro", Monaco, "Courier New", monospace }
+        .svg__vocab__text-code { fill: #1a1e23; font: bold 12px "Source Code Pro", Monaco, "Courier New", monospace }
     </style>
     <rect width="570" height="88" x="1" y="135" fill="#d5e8d4" stroke="#82b366" stroke-width="2" rx="13.2" ry="13.2"/>
     <path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M444 164h100v40H444z"/>
@@ -158,7 +158,8 @@

     "binder": {
         "title": "Binder",
-        "tag": "class"
+        "tag": "class",
+        "source": "spacy/tokens/binder.pyx"
     },

     "annotation": {
@@ -2,7 +2,10 @@

 include ../../_includes/_mixins

-p spaCy currently supports the following languages and capabilities:
+p
+    | spaCy currently provides models for the following languages and
+    | capabilities:

 +aside-code("Download language models", "bash").
     python -m spacy download en

@@ -22,12 +25,16 @@ p spaCy currently supports the following languages and capabilities:
     +row
         +cell French #[code fr]
-        each icon in [ "pro", "pro", "con", "pro", "con", "pro", "pro", "con" ]
+        each icon in [ "pro", "con", "con", "pro", "con", "pro", "pro", "con" ]
             +cell.u-text-center #[+procon(icon)]

-    +row
-        +cell Spanish #[code es]
-        each icon in [ "pro", "pro", "con", "pro", "pro", "pro", "pro", "con" ]
-            +cell.u-text-center #[+procon(icon)]
++h(2, "available") Available models
+
+include ../usage/_models-list
+
+p
+    +button("/docs/usage/models", true, "primary") See available models

 +h(2, "alpha-support") Alpha tokenization support

@@ -52,9 +59,35 @@ p
     | #[+a("https://github.com/mocobeta/janome") Janome].

 +table([ "Language", "Code", "Source" ])
-    each language, code in { es: "Spanish", it: "Italian", pt: "Portuguese", nl: "Dutch", sv: "Swedish", fi: "Finnish", nb: "Norwegian Bokmål", da: "Danish", hu: "Hungarian", pl: "Polish", bn: "Bengali", he: "Hebrew", zh: "Chinese", ja: "Japanese" }
+    each language, code in { it: "Italian", pt: "Portuguese", nl: "Dutch", sv: "Swedish", fi: "Finnish", nb: "Norwegian Bokmål", da: "Danish", hu: "Hungarian", pl: "Polish", bn: "Bengali", he: "Hebrew", zh: "Chinese", ja: "Japanese" }
         +row
             +cell #{language}
             +cell #[code=code]
             +cell
                 +src(gh("spaCy", "spacy/lang/" + code)) lang/#{code}
+
++h(2, "multi-language") Multi-language support
+    +tag-new(2)
+
+p
+    | As of v2.0, spaCy supports models trained on more than one language. This
+    | is especially useful for named entity recognition. The language ID used
+    | for multi-language or language-neutral models is #[code xx]. The
+    | language class, a generic subclass containing only the base language data,
+    | can be found in #[+src(gh("spaCy", "spacy/lang/xx")) lang/xx].
+
+p
+    | To load your model with the neutral, multi-language class, simply set
+    | #[code "language": "xx"] in your
+    | #[+a("/docs/usage/saving-loading#models-generating") model package]'s
+    | meta.json. You can also import the class directly, or call
+    | #[+api("util#get_lang_class") #[code util.get_lang_class()]] for
+    | lazy-loading.
+
++code("Standard import").
+    from spacy.lang.xx import MultiLanguage
+    nlp = MultiLanguage()
+
++code("With lazy-loading").
+    from spacy.util import get_lang_class
+    nlp = get_lang_class('xx')
@@ -11,8 +11,13 @@ p
     | the name of an installed
     | #[+a("/docs/usage/saving-loading#generating") model package], a unicode
     | path or a #[code Path]-like object. spaCy will try resolving the load
-    | argument in this order. The #[code Language] class to initialise will be
-    | determined based on the model's settings.
+    | argument in this order. If a model is loaded from a shortcut link or
+    | package name, spaCy will assume it's a Python package and import it and
+    | call the model's own #[code load()] method. If a model is loaded from a
+    | path, spaCy will assume it's a data directory, read the language and
+    | pipeline settings off the meta.json and initialise the #[code Language]
+    | class. The data will be loaded in via
+    | #[+api("language#from_disk") #[code Language.from_disk()]].

 +aside-code("Example").
     nlp = spacy.load('en') # shortcut link

@@ -20,7 +25,7 @@ p
     nlp = spacy.load('/path/to/en') # unicode path
     nlp = spacy.load(Path('/path/to/en')) # pathlib Path

-    nlp = spacy.load('en', disable['parser', 'tagger'])
+    nlp = spacy.load('en', disable=['parser', 'tagger'])

 +table(["Name", "Type", "Description"])
     +row
@@ -1,12 +1,10 @@
-//- 💫 DOCS > API > ANNOTATION SPECS
+//- 💫 DOCS > API > UTIL

 include ../../_includes/_mixins

 p
     | spaCy comes with a small collection of utility functions located in
     | #[+src(gh("spaCy", "spacy/util.py")) spacy/util.py].

 +infobox("Important note")
     | Because utility functions are mostly intended for
     | #[strong internal use within spaCy], their behaviour may change with
     | future releases. The functions documented on this page should be safe

@@ -74,15 +72,23 @@ p
         +cell #[code Language]
         +cell Language class.

-+h(2, "resolve_model_path") util.resolve_model_path
++h(2, "load_model") util.load_model
     +tag function
+    +tag-new(2)

-p Resolve a model name or string to a model path.
+p
+    | Load a model from a shortcut link, package or data path. If called with a
+    | shortcut link or package name, spaCy will assume the model is a Python
+    | package and import and call its #[code load()] method. If called with a
+    | path, spaCy will assume it's a data directory, read the language and
+    | pipeline settings from the meta.json and initialise a #[code Language]
+    | class. The model data will then be loaded in via
+    | #[+api("language#from_disk") #[code Language.from_disk()]].

 +aside-code("Example").
-    model_path = util.resolve_model_path('en')
-    model_path = util.resolve_model_path('/path/to/en')
+    nlp = util.load_model('en')
+    nlp = util.load_model('en_core_web_sm')
+    nlp = util.load_model('/path/to/data')

 +table(["Name", "Type", "Description"])
     +row

@@ -92,8 +98,33 @@ p Resolve a model name or string to a model path.

     +footrow
         +cell returns
-        +cell #[code Path]
-        +cell Path to model data directory.
+        +cell #[code Language]
+        +cell #[code Language] class with the loaded model.
+
++h(2, "load_model_from_init_py") util.load_model_from_init_py
+    +tag function
+    +tag-new(2)
+
+p
+    | A helper function to use in the #[code load()] method of a model package's
+    | #[+src(gh("spacy-dev-resources", "templates/model/en_model_name/__init__.py")) __init__.py].
+
++aside-code("Example").
+    from spacy.util import load_model_from_init_py
+
+    def load():
+        return load_model_from_init_py(__file__)
+
++table(["Name", "Type", "Description"])
+    +row
+        +cell #[code init_file]
+        +cell unicode
+        +cell Path to model's __init__.py, i.e. #[code __file__].
+
+    +footrow
+        +cell returns
+        +cell #[code Language]
+        +cell #[code Language] class with the loaded model.

 +h(2, "is_package") util.is_package
     +tag function

@@ -117,16 +148,18 @@ p
         +cell #[code bool]
         +cell #[code True] if installed package, #[code False] if not.

-+h(2, "get_model_package_path") util.get_model_package_path
++h(2, "get_package_path") util.get_package_path
     +tag function
+    +tag-new(2)

 p
-    | Get path to a #[+a("/docs/usage/models") model package] installed via pip.
-    | Currently imports the package to find it and parse its meta data.
+    | Get path to an installed package. Mainly used to resolve the location of
+    | #[+a("/docs/usage/models") model packages]. Currently imports the package
+    | to find its path.

 +aside-code("Example").
-    util.get_model_package_path('en_core_web_sm')
-    # /usr/lib/python3.6/site-packages/en_core_web_sm/en_core_web_sm-1.2.0
+    util.get_package_path('en_core_web_sm')
+    # /usr/lib/python3.6/site-packages/en_core_web_sm

 +table(["Name", "Type", "Description"])
     +row

@@ -137,37 +170,8 @@ p
     +footrow
         +cell returns
         +cell #[code Path]
         +cell Path to model data directory.

-+h(2, "parse_package_meta") util.parse_package_meta
-    +tag function
-
-p
-    | Check if a #[code meta.json] exists in a model package and return its
-    | contents.
-
-+aside-code("Example").
-    if util.is_package('en_core_web_sm'):
-        path = util.get_model_package_path('en_core_web_sm')
-        meta = util.parse_package_meta(path, require=True)
-        # {'name': 'core_web_sm', 'lang': 'en', ...}
-
-+table(["Name", "Type", "Description"])
-    +row
-        +cell #[code package_path]
-        +cell #[code Path]
-        +cell Path to model package directory.
-
-    +row
-        +cell #[code require]
-        +cell #[code bool]
-        +cell If #[code True], raise error if no #[code meta.json] is found.
-
-    +footrow
-        +cell returns
-        +cell dict / #[code None]
-        +cell Model meta data or #[code None].

 +h(2, "is_in_jupyter") util.is_in_jupyter
     +tag function
     +tag-new(2)
@ -5,7 +5,7 @@ p
|
|||
| #[strong how similar they are]. Predicting similarity is useful for
|
||||
| building recommendation systems or flagging duplicates. For example, you
|
||||
| can suggest a user content that's similar to what they're currently
|
||||
| looking at, or label a support ticket as a duplicate, if it's very
|
||||
| looking at, or label a support ticket as a duplicate if it's very
|
||||
| similar to an already existing one.
|
||||
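p
    | For example, a minimal sketch of comparing two texts, assuming a model
    | with word vectors is loaded as #[code nlp]:

+code.
    doc1 = nlp(u'I like fast food')
    doc2 = nlp(u'I like pizza')
    print(doc1.similarity(doc2))  # similarity score, higher means more similar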
|
||||
p
|
||||
|
|
|
@ -16,3 +16,47 @@ p
|
|||
+row
|
||||
for cell in ["Apple", "is", "looking", "at", "buying", "U.K.", "startup", "for", "$", "1", "billion"]
|
||||
+cell=cell
|
||||
|
||||
p
|
||||
| First, the raw text is split on whitespace characters, similar to
|
||||
| #[code text.split(' ')]. Then, the tokenizer processes the text from
|
||||
| left to right. On each substring, it performs two checks:
|
||||
|
||||
+list("numbers")
|
||||
+item
|
||||
| #[strong Does the substring match a tokenizer exception rule?] For
|
||||
| example, "don't" does not contain whitespace, but should be split
|
||||
| into two tokens, "do" and "n't", while "U.K." should always
|
||||
| remain one token.
|
||||
+item
|
||||
| #[strong Can a prefix, suffix or infix be split off?] For example,
|
||||
| punctuation like commas, periods, hyphens or quotes.
|
||||
|
||||
p
|
||||
| If there's a match, the rule is applied and the tokenizer continues its
|
||||
| loop, starting with the newly split substrings. This way, spaCy can split
|
||||
| #[strong complex, nested tokens] like combinations of abbreviations and
|
||||
| multiple punctuation marks.
|
||||
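p
    | A minimal sketch of the result, assuming an English model is loaded as
    | #[code nlp]:

+code.
    doc = nlp(u"Let's go to N.Y.!")
    print([token.text for token in doc])
    # ['Let', "'s", 'go', 'to', 'N.Y.', '!']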
|
||||
+aside
|
||||
| #[strong Tokenizer exception:] Special-case rule to split a string into
|
||||
| several tokens or prevent a token from being split when punctuation rules
|
||||
| are applied.#[br]
|
||||
| #[strong Prefix:] Character(s) at the beginning, e.g.
|
||||
| #[code $], #[code (], #[code “], #[code ¿].#[br]
|
||||
| #[strong Suffix:] Character(s) at the end, e.g.
|
||||
| #[code km], #[code )], #[code ”], #[code !].#[br]
|
||||
| #[strong Infix:] Character(s) in between, e.g.
|
||||
| #[code -], #[code --], #[code /], #[code …].#[br]
|
||||
|
||||
+image
|
||||
include ../../../assets/img/docs/tokenization.svg
|
||||
.u-text-right
|
||||
+button("/assets/img/docs/tokenization.svg", false, "secondary").u-text-tag View large graphic
|
||||
|
||||
p
|
||||
| While punctuation rules are usually pretty general, tokenizer exceptions
|
||||
| strongly depend on the specifics of the individual language. This is
|
||||
| why each #[+a("/docs/api/language-models") available language] has its
|
||||
| own subclass, like #[code English] or #[code German], which loads in lists
|
||||
| of hard-coded data and exception rules.
|
||||
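p
    | For example, a minimal sketch of creating the bare language classes
    | directly, using the v2-style import paths from #[code spacy.lang]:

+code.
    from spacy.lang.en import English
    from spacy.lang.de import German

    nlp_en = English()  # includes English exception rules
    nlp_de = German()   # includes German exception rules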
|
|
|
@ -89,4 +89,6 @@ p
|
|||
|
||||
p
|
||||
| Even though both #[code Doc] objects contain the same words, the internal
|
||||
| integer IDs are very different.
|
||||
| integer IDs are very different. The same applies to all other strings,
|
||||
| like the annotation scheme. To avoid mismatched IDs, spaCy will always
|
||||
| export the vocab if you save a #[code Doc] or #[code nlp] object.
|
||||
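p
    | A minimal sketch of the string-to-ID mapping, assuming a loaded
    | #[code nlp] object:

+code.
    doc = nlp(u'I like apples')
    apple_hash = nlp.vocab.strings[u'apples']  # string to integer ID
    assert nlp.vocab.strings[apple_hash] == u'apples'  # and back to the string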
|
|
|
@ -19,19 +19,17 @@ p
|
|||
|
||||
p
|
||||
| When you load a model, spaCy first consults the model's
|
||||
| #[+a("/docs/usage/saving-loading#models-generating") meta.json] for its
|
||||
| #[code setup] details. This typically includes the ID of a language class,
|
||||
| #[+a("/docs/usage/saving-loading#models-generating") meta.json]. The
|
||||
| meta typically includes the model details, the ID of a language class,
|
||||
| and an optional list of pipeline components. spaCy then does the
|
||||
| following:
|
||||
|
||||
+aside-code("meta.json (excerpt)", "json").
|
||||
{
|
||||
"name": "example_model",
|
||||
"lang": "en"
|
||||
"description": "Example model for spaCy",
|
||||
"setup": {
|
||||
"lang": "en",
|
||||
"pipeline": ["token_vectors", "tagger"]
|
||||
}
|
||||
"pipeline": ["token_vectors", "tagger"]
|
||||
}
|
||||
|
||||
+list("numbers")
|
||||
|
@ -146,7 +144,7 @@ p
|
|||
+table(["Argument", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code vocab]
|
||||
+cell #[coce Vocab]
|
||||
+cell #[code Vocab]
|
||||
+cell
|
||||
| Shared data between components, including strings, morphology,
|
||||
| vectors etc.
|
||||
|
@ -287,17 +285,15 @@ p
|
|||
|
||||
p
|
||||
| In the model package's meta.json, specify the language class and pipeline
|
||||
| IDs in #[code setup]:
|
||||
| IDs:
|
||||
|
||||
+code("meta.json (excerpt)", "json").
|
||||
{
|
||||
"name": "my_sentiment_model",
|
||||
"name": "sentiment_model",
|
||||
"lang": "en",
|
||||
"version": "1.0.0",
|
||||
"spacy_version": ">=2.0.0,<3.0.0",
|
||||
"setup": {
|
||||
"lang": "en",
|
||||
"pipeline": ["vectorizer", "sentiment"]
|
||||
}
|
||||
"pipeline": ["vectorizer", "sentiment"]
|
||||
}
|
||||
|
||||
p
|
||||
|
@ -307,7 +303,7 @@ p
|
|||
| by your custom #[code "sentiment"] factory.
|
||||
|
||||
+code.
|
||||
nlp = spacy.load('my_sentiment_model')
|
||||
nlp = spacy.load('en_sentiment_model')
|
||||
doc = nlp(u'I love pizza')
|
||||
assert doc.sentiment
|
||||
|
||||
|
|
|
@ -129,15 +129,18 @@ p
|
|||
+code.
|
||||
import spacy
|
||||
from spacy.tokens.doc import Doc
|
||||
from spacy.vocab import Vocab
|
||||
|
||||
nlp = spacy.load('en')
|
||||
moby_dick = open('moby_dick.txt', 'r').read()  # nlp expects a string, not a file object
|
||||
doc = nlp(moby_dick)
|
||||
doc.to_disk('/moby_dick.bin')
|
||||
|
||||
new_doc = Doc().from_disk('/moby_dick.bin')
|
||||
new_doc = Doc(Vocab()).from_disk('/moby_dick.bin')
|
||||
|
||||
+infobox
|
||||
| #[strong API:] #[+api("language") #[code Language]],
|
||||
| #[+api("doc") #[code Doc]]
|
||||
| #[strong Usage:] #[+a("/docs/usage/saving-loading") Saving and loading]
|
||||
|
||||
+h(2, "rule-matcher") Match text with token rules
|
||||
|
@ -148,9 +151,14 @@ p
|
|||
|
||||
nlp = spacy.load('en')
|
||||
matcher = Matcher(nlp.vocab)
|
||||
# match "Google I/O" or "Google i/o"
|
||||
pattern = [{'ORTH': 'Google'}, {'UPPER': 'I'}, {'ORTH': '/'}, {'UPPER': 'O'}]
|
||||
matcher.add('GoogleIO', None, pattern)
|
||||
|
||||
def set_sentiment(matcher, doc, i, matches):
|
||||
doc.sentiment += 0.1
|
||||
|
||||
pattern1 = [{'ORTH': 'Google'}, {'UPPER': 'I'}, {'ORTH': '/'}, {'UPPER': 'O'}]
|
||||
pattern2 = [[{'ORTH': emoji, 'OP': '+'}] for emoji in ['😀', '😂', '🤣', '😍']]
|
||||
matcher.add('GoogleIO', None, pattern1) # match "Google I/O" or "Google i/o"
|
||||
matcher.add('HAPPY', set_sentiment, pattern2) # match one or more happy emoji
|
||||
doc = nlp(LOTS_OF_TEXT)
matches = matcher(doc)  # apply the matcher to the processed Doc
|
||||
|
||||
+infobox
|
||||
|
|
|
@ -11,7 +11,7 @@ p
|
|||
| You can also associate patterns with entity IDs, to allow some basic
|
||||
| entity linking or disambiguation.
|
||||
|
||||
+aside("What about \"real\" regular expressions?")
|
||||
//-+aside("What about \"real\" regular expressions?")
|
||||
|
||||
+h(2, "adding-patterns") Adding patterns
|
||||
|
||||
|
@ -119,7 +119,7 @@ p
|
|||
+code.
|
||||
# Add a new custom flag to the vocab, which is always False by default.
|
||||
# BAD_HTML_FLAG will be the flag ID, which we can use to set it to True on the span.
|
||||
BAD_HTML_FLAG = doc.vocab.add_flag(lambda text: False)
|
||||
BAD_HTML_FLAG = nlp.vocab.add_flag(lambda text: False)
|
||||
|
||||
def merge_and_flag(matcher, doc, i, matches):
|
||||
match_id, start, end = matches[i]
|
||||
|
@ -221,7 +221,7 @@ p
|
|||
+cell match 0 or 1 times
|
||||
+cell optional, max one
|
||||
|
||||
+h(3, "quantifiers-example1") Quantifiers example: Using linguistic annotations
|
||||
+h(2, "example1") Example: Using linguistic annotations
|
||||
|
||||
p
|
||||
| Let's say you're analysing user comments and you want to find out what
|
||||
|
@ -283,7 +283,7 @@ p
|
|||
# set manual=True to make displaCy render straight from a dictionary
|
||||
displacy.serve(matched_sents, style='ent', manual=True)
|
||||
|
||||
+h(3, "quantifiers-example2") Quantifiers example: Phone numbers
|
||||
+h(2, "example2") Example: Phone numbers
|
||||
|
||||
p
|
||||
| Phone numbers can have many different formats and matching them is often
|
||||
|
@ -320,3 +320,114 @@ p
|
|||
| It'll produce more predictable results, is much easier to modify and
|
||||
| extend, and doesn't require any training data – only a set of
|
||||
| test cases.
|
||||
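p
    | For example, a minimal sketch of a token pattern for US-style numbers
    | like "(123) 456-7890", reusing the #[code matcher] from above – the
    | exact shapes and quantifiers will depend on the formats in your data:

+code.
    pattern = [{'ORTH': '('}, {'SHAPE': 'ddd'}, {'ORTH': ')'}, {'SHAPE': 'ddd'},
               {'ORTH': '-', 'OP': '?'}, {'SHAPE': 'dddd'}]
    matcher.add('PHONE_NUMBER', None, pattern)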
|
||||
+h(2, "example3") Example: Hashtags and emoji on social media
|
||||
|
||||
p
|
||||
| Social media posts, especially tweets, can be difficult to work with.
|
||||
| They're very short and often contain various emoji and hashtags. By only
|
||||
| looking at the plain text, you'll lose a lot of valuable semantic
|
||||
| information.
|
||||
|
||||
p
|
||||
| Let's say you've extracted a large sample of social media posts on a
|
||||
| specific topic, for example posts mentioning a brand name or product.
|
||||
| As the first step of your data exploration, you want to filter out posts
|
||||
| containing certain emoji and use them to assign a general sentiment
|
||||
| score, based on whether the expressed emotion is positive or negative,
|
||||
| e.g. #[span.o-icon.o-icon--inline 😀] or #[span.o-icon.o-icon--inline 😞].
|
||||
| You also want to find, merge and label hashtags like
|
||||
| #[code #MondayMotivation], to be able to ignore or analyse them later.
|
||||
|
||||
+aside("Note on sentiment analysis")
|
||||
| Ultimately, sentiment analysis is not always #[em that] easy. In
|
||||
| addition to the emoji, you'll also want to take specific words into
|
||||
| account and check the #[code subtree] for intensifiers like "very", to
|
||||
| increase the sentiment score. At some point, you might also want to train
|
||||
| a sentiment model. However, the approach described in this example is
|
||||
| very useful for #[strong bootstrapping rules to collect training data].
|
||||
| It's also an incredibly fast way to gather first insights into your data
|
||||
| – with about 1 million tweets, you'd be looking at a processing time of
|
||||
| #[strong under 1 minute].
|
||||
|
||||
p
|
||||
| By default, spaCy's tokenizer will split emoji into separate tokens. This
|
||||
| means that you can create a pattern for one or more emoji tokens. In this
|
||||
| case, a sequence of identical emoji should be treated as one instance.
|
||||
| Valid hashtags usually consist of a #[code #], plus a sequence of
|
||||
| ASCII characters with no whitespace, making them easy to match as well.
|
||||
|
||||
+code.
|
||||
from spacy.lang.en import English
|
||||
from spacy.matcher import Matcher
|
||||
|
||||
nlp = English() # we only want the tokenizer, so no need to load a model
|
||||
matcher = Matcher(nlp.vocab)
|
||||
|
||||
pos_emoji = [u'😀', u'😃', u'😂', u'🤣', u'😊', u'😍'] # positive emoji
|
||||
neg_emoji = [u'😞', u'😠', u'😩', u'😢', u'😭', u'😒'] # negative emoji
|
||||
|
||||
# add patterns to match one or more emoji tokens
|
||||
pos_patterns = [[{'ORTH': emoji, 'OP': '+'}] for emoji in pos_emoji]
|
||||
neg_patterns = [[{'ORTH': emoji, 'OP': '+'}] for emoji in neg_emoji]
|
||||
|
||||
matcher.add('HAPPY', label_sentiment, *pos_patterns) # add positive pattern
|
||||
matcher.add('SAD', label_sentiment, *neg_patterns) # add negative pattern
|
||||
|
||||
# add pattern to merge valid hashtag, i.e. '#' plus any ASCII token
|
||||
matcher.add('HASHTAG', merge_hashtag, [{'ORTH': '#'}, {'IS_ASCII': True}])
|
||||
|
||||
p
|
||||
| Because the #[code on_match] callback receives the ID of each match, you
|
||||
| can use the same function to handle the sentiment assignment for both
|
||||
| the positive and negative pattern. To keep it simple, we'll either add
|
||||
| or subtract #[code 0.1] points – this way, the score will also reflect
|
||||
| combinations of emoji, even positive #[em and] negative ones.
|
||||
|
||||
p
|
||||
| With a library like
|
||||
| #[+a("https://github.com/bcongdon/python-emojipedia") Emojipedia],
|
||||
| we can also retrieve a short description for each emoji – for example,
|
||||
| #[span.o-icon.o-icon--inline 😍]'s official title is "Smiling Face With
|
||||
| Heart-Eyes". Assigning it to the merged token's norm will make it
|
||||
| available as #[code token.norm_].
|
||||
|
||||
+code.
|
||||
from emojipedia import Emojipedia # installation: pip install emojipedia
|
||||
|
||||
def label_sentiment(matcher, doc, i, matches):
|
||||
match_id, start, end = matches[i]
|
||||
if doc.vocab.strings[match_id] == 'HAPPY':  # look up the rule's string name
|
||||
doc.sentiment += 0.1 # add 0.1 for positive sentiment
|
||||
elif doc.vocab.strings[match_id] == 'SAD':
|
||||
doc.sentiment -= 0.1 # subtract 0.1 for negative sentiment
|
||||
span = doc[start:end]
|
||||
emoji = Emojipedia.search(span[0].text) # get data for emoji
|
||||
span.merge(norm=emoji.title) # merge span and set NORM to emoji title
|
||||
|
||||
p
|
||||
| To label the hashtags, we first need to add a new custom flag.
|
||||
| #[code IS_HASHTAG] will be the flag's ID, which you can use to assign it
|
||||
| to the hashtag's span, and check its value via a token's
|
||||
| #[+api("token#check_flag") #[code code check_flag()]] method. On each
|
||||
| match, we merge the hashtag and assign the flag.
|
||||
|
||||
+code.
|
||||
# Add a new custom flag to the vocab, which is always False by default
|
||||
IS_HASHTAG = nlp.vocab.add_flag(lambda text: False)
|
||||
|
||||
def merge_hashtag(matcher, doc, i, matches):
|
||||
match_id, start, end = matches[i]
|
||||
span = doc[start:end]
|
||||
span.merge() # merge hashtag
|
||||
span.set_flag(IS_HASHTAG, True) # set IS_HASHTAG to True
|
||||
|
||||
p
|
||||
| To process a stream of social media posts, we can use
|
||||
| #[+api("language#pipe") #[code Language.pipe()]], which will return a
|
||||
| stream of #[code Doc] objects that we can pass to
|
||||
| #[+api("matcher#pipe") #[code Matcher.pipe()]].
|
||||
|
||||
+code.
|
||||
docs = nlp.pipe(LOTS_OF_TWEETS)
|
||||
matches = matcher.pipe(docs)
|
||||
|
|
|
@ -74,16 +74,14 @@ p
|
|||
+aside-code("meta.json", "json").
|
||||
{
|
||||
"name": "example_model",
|
||||
"lang": "en",
|
||||
"version": "1.0.0",
|
||||
"spacy_version": ">=2.0.0,<3.0.0",
|
||||
"description": "Example model for spaCy",
|
||||
"author": "You",
|
||||
"email": "you@example.com",
|
||||
"license": "CC BY-SA 3.0",
|
||||
"setup": {
|
||||
"lang": "en",
|
||||
"pipeline": ["token_vectors", "tagger"]
|
||||
}
|
||||
"pipeline": ["token_vectors", "tagger"]
|
||||
}
|
||||
|
||||
+code(false, "bash").
|
||||
|
@ -110,9 +108,9 @@ p
|
|||
+h(3, "models-custom") Customising the model setup
|
||||
|
||||
p
|
||||
| The meta.json includes a #[code setup] key that lets you customise how
|
||||
| the model should be initialised and loaded. You can define the language
|
||||
| data to be loaded and the
|
||||
| The meta.json includes the model details, like name, requirements and
|
||||
| license, and lets you customise how the model should be initialised and
|
||||
| loaded. You can define the language data to be loaded and the
|
||||
| #[+a("/docs/usage/language-processing-pipeline") processing pipeline] to
|
||||
| execute.
|
||||
|
||||
|
@ -183,9 +181,9 @@ p
|
|||
p
|
||||
| To load a model from a data directory, you can use
|
||||
| #[+api("spacy#load") #[code spacy.load()]] with the local path. This will
|
||||
| look for a meta.json in the directory and use the #[code setup] details
|
||||
| to initialise a #[code Language] class with a processing pipeline and
|
||||
| load in the model data.
|
||||
| look for a meta.json in the directory and use the #[code lang] and
|
||||
| #[code pipeline] settings to initialise a #[code Language] class with a
|
||||
| processing pipeline and load in the model data.
|
||||
|
||||
+code.
|
||||
nlp = spacy.load('/path/to/model')
|
||||
|
|
|
@ -65,7 +65,7 @@ p
|
|||
| spaCy provides a variety of linguistic annotations to give you insights
|
||||
| into a text's grammatical structure. This includes the word types,
|
||||
| i.e. the parts of speech, and how the words are related to each other.
|
||||
| For example, if you're analysing text, it makes a #[em huge] difference
|
||||
| For example, if you're analysing text, it makes a huge difference
|
||||
| whether a noun is the subject of a sentence, or the object – or whether
|
||||
| "google" is used as a verb, or refers to the website or company in a
|
||||
| specific context.
|
||||
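p
    | For example, a minimal sketch of inspecting these annotations, assuming
    | an English model is loaded as #[code nlp]:

+code.
    doc = nlp(u'Apple is looking at buying a U.K. startup')
    for token in doc:
        # print each token's text, part-of-speech tag and dependency label
        print(token.text, token.pos_, token.dep_)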
|
@ -94,9 +94,10 @@ p
|
|||
include _spacy-101/_tokenization
|
||||
|
||||
+infobox
|
||||
| To learn more about how spaCy's tokenizer and its rules work in detail,
|
||||
| how to #[strong customise] it and how to #[strong add your own tokenizer]
|
||||
| to a processing pipeline, see the usage guide on
|
||||
| To learn more about how spaCy's tokenization rules work in detail,
|
||||
| how to #[strong customise and replace] the default tokenizer and how to
|
||||
| #[strong add language-specific data], see the usage guides on
|
||||
| #[+a("/docs/usage/adding-languages") adding languages] and
|
||||
| #[+a("/docs/usage/customizing-tokenizer") customising the tokenizer].
|
||||
|
||||
+h(3, "annotations-pos-deps") Part-of-speech tags and dependencies
|
||||
|
@ -118,9 +119,11 @@ include _spacy-101/_named-entities
|
|||
|
||||
+infobox
|
||||
| To learn more about entity recognition in spaCy, how to
|
||||
| #[strong add your own entities] to a document and how to train and update
|
||||
| the entity predictions of a model, see the usage guide on
|
||||
| #[+a("/docs/usage/entity-recognition") named entity recognition].
|
||||
| #[strong add your own entities] to a document and how to
|
||||
| #[strong train and update] the entity predictions of a model, see the
|
||||
| usage guides on
|
||||
| #[+a("/docs/usage/entity-recognition") named entity recognition] and
|
||||
| #[+a("/docs/usage/training-ner") training the named entity recognizer].
|
||||
|
||||
+h(2, "vectors-similarity") Word vectors and similarity
|
||||
+tag-model("vectors")
|
||||
|
|
|
@ -20,19 +20,18 @@ p
|
|||
nlp = Language(pipeline=['my_factory', mycomponent])
|
||||
|
||||
p
|
||||
| It's now much easier to customise the pipeline with your own components.
|
||||
| Components are functions that receive a #[code Doc] object, modify and
|
||||
| return it. If your component is stateful, you'll want to create a new one
|
||||
| for each pipeline. You can do that by defining and registering a factory
|
||||
| which receives the shared #[code Vocab] object and returns a component.
|
||||
|
||||
p
|
||||
| spaCy's default components – the vectorizer, tagger, parser and entity
|
||||
| recognizer, can be added to your pipeline by using their string IDs.
|
||||
| This way, you won't have to worry about finding and implementing them –
|
||||
| to use the default tagger, simply add #[code "tagger"] to the pipeline,
|
||||
| It's now much easier to #[strong customise the pipeline] with your own
|
||||
| components, functions that receive a #[code Doc] object, modify and
|
||||
| return it. If your component is stateful, you can define and register a
|
||||
| factory which receives the shared #[code Vocab] object and returns a
|
||||
| component. spaCy's default components can be added to your pipeline by
|
||||
| using their string IDs. This way, you won't have to worry about finding
|
||||
| and implementing them – simply add #[code "tagger"] to the pipeline,
|
||||
| and spaCy will know what to do.
|
||||
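p
    | A minimal sketch of such a component – the function name is
    | illustrative:

+code.
    from spacy.language import Language

    def my_component(doc):
        # a component receives a Doc, can modify it and must return it
        print('Processing:', doc.text)
        return doc

    nlp = Language(pipeline=[my_component])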
|
||||
+image
|
||||
include ../../assets/img/docs/pipeline.svg
|
||||
|
||||
+infobox
|
||||
| #[strong API:] #[+api("language") #[code Language]]
|
||||
| #[strong Usage:] #[+a("/docs/usage/language-processing-pipeline") Processing text]
|
||||
|
@ -96,11 +95,10 @@ p
|
|||
| #[code Language] class, or load a model that initialises one. This allows
|
||||
| languages to contain more custom data, e.g. lemmatizer lookup tables, or
|
||||
| complex regular expressions. The language data has also been tidied up
|
||||
| and simplified. It's now also possible to overwrite the functions that
|
||||
| compute lexical attributes like #[code like_num], and supply
|
||||
| language-specific syntax iterators, e.g. to determine noun chunks. spaCy
|
||||
| now also supports simple lookup-based lemmatization. The data is stored
|
||||
| in a dictionary mapping a string to its lemma.
|
||||
| and simplified. spaCy now also supports simple lookup-based lemmatization.
|
||||
|
||||
+image
|
||||
include ../../assets/img/docs/language_data.svg
|
||||
|
||||
+infobox
|
||||
| #[strong API:] #[+api("language") #[code Language]]
|
||||
|
@ -111,13 +109,10 @@ p
|
|||
|
||||
+aside-code("Example").
|
||||
from spacy.matcher import Matcher
|
||||
from spacy.attrs import LOWER, IS_PUNCT
|
||||
matcher = Matcher(nlp.vocab)
|
||||
matcher.add('HelloWorld', None,
|
||||
[{LOWER: 'hello'}, {IS_PUNCT: True}, {LOWER: 'world'}],
|
||||
[{LOWER: 'hello'}, {LOWER: 'world'}])
|
||||
matcher.add('HEARTS', None, [{'ORTH': '❤️', 'OP': '+'}])
|
||||
assert len(matcher) == 1
|
||||
assert 'HelloWorld' in matcher
|
||||
assert 'HEARTS' in matcher
|
||||
|
||||
p
|
||||
| Patterns can now be added to the matcher by calling
|
||||
|
@ -157,28 +152,8 @@ p
|
|||
+cell #[+api("language#to_disk") #[code Language.to_disk]]
|
||||
|
||||
+row
|
||||
+cell #[code Tokenizer.load]
|
||||
+cell
|
||||
| #[+api("tokenizer#from_disk") #[code Tokenizer.from_disk]]
|
||||
| #[+api("tokenizer#from_bytes") #[code Tokenizer.from_bytes]]
|
||||
|
||||
+row
|
||||
+cell #[code Tagger.load]
|
||||
+cell
|
||||
| #[+api("tagger#from_disk") #[code Tagger.from_disk]]
|
||||
| #[+api("tagger#from_bytes") #[code Tagger.from_bytes]]
|
||||
|
||||
+row
|
||||
+cell #[code DependencyParser.load]
|
||||
+cell
|
||||
| #[+api("dependencyparser#from_disk") #[code DependencyParser.from_disk]]
|
||||
| #[+api("dependencyparser#from_bytes") #[code DependencyParser.from_bytes]]
|
||||
|
||||
+row
|
||||
+cell #[code EntityRecognizer.load]
|
||||
+cell
|
||||
| #[+api("entityrecognizer#from_disk") #[code EntityRecognizer.from_disk]]
|
||||
| #[+api("entityrecognizer#from_bytes") #[code EntityRecognizer.from_bytes]]
|
||||
+cell #[code Language.create_make_doc]
|
||||
+cell #[+api("language#attributes") #[code Language.tokenizer]]
|
||||
|
||||
+row
|
||||
+cell
|
||||
|
@ -212,6 +187,28 @@ p
|
|||
| #[+api("stringstore#to_disk") #[code StringStore.to_disk]]
|
||||
| #[+api("stringstore#to_bytes") #[code StringStore.to_bytes]]
|
||||
|
||||
+row
|
||||
+cell #[code Tokenizer.load]
|
||||
+cell -
|
||||
|
||||
+row
|
||||
+cell #[code Tagger.load]
|
||||
+cell
|
||||
| #[+api("tagger#from_disk") #[code Tagger.from_disk]]
|
||||
| #[+api("tagger#from_bytes") #[code Tagger.from_bytes]]
|
||||
|
||||
+row
|
||||
+cell #[code DependencyParser.load]
|
||||
+cell
|
||||
| #[+api("dependencyparser#from_disk") #[code DependencyParser.from_disk]]
|
||||
| #[+api("dependencyparser#from_bytes") #[code DependencyParser.from_bytes]]
|
||||
|
||||
+row
|
||||
+cell #[code EntityRecognizer.load]
|
||||
+cell
|
||||
| #[+api("entityrecognizer#from_disk") #[code EntityRecognizer.from_disk]]
|
||||
| #[+api("entityrecognizer#from_bytes") #[code EntityRecognizer.from_bytes]]
|
||||
|
||||
+row
|
||||
+cell #[code Matcher.load]
|
||||
+cell -
|
||||
|
@ -232,7 +229,7 @@ p
|
|||
|
||||
+row
|
||||
+cell #[code Doc.read_bytes]
|
||||
+cell
|
||||
+cell #[+api("binder") #[code Binder]]
|
||||
|
||||
+row
|
||||
+cell #[code Token.is_ancestor_of]
|
||||
|
|