Merge branch 'develop' of https://github.com/explosion/spaCy into develop
Commit 49235017bf
@@ -4,7 +4,7 @@ from __future__ import unicode_literals
 import importlib
 
 from .compat import basestring_
-from .cli.info import info
+from .cli.info import info as cli_info
 from .glossary import explain
 from .deprecated import resolve_load_name
 from . import util
@@ -20,3 +20,7 @@ def load(name, **overrides):
     overrides['meta'] = meta
     overrides['path'] = model_path
     return cls(**overrides)
+
+
+def info(model=None, markdown=False):
+    return cli_info(None, model, markdown)
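The second hunk re-exports the CLI's `info` command as a top-level helper. A minimal usage sketch, assuming spaCy and an `'en'` model shortcut are installed:

```python
import spacy

# Print details about the spaCy installation and, optionally, a model;
# markdown=True formats the output for pasting into GitHub issues.
spacy.info()
spacy.info('en', markdown=True)  # 'en' is assumed to be an installed shortcut
```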
@@ -24,8 +24,9 @@ CONVERTERS = {
     n_sents=("Number of sentences per doc", "option", "n", float),
     morphology=("Enable appending morphology to tags", "flag", "m", bool)
 )
-def convert(_, input_file, output_dir, n_sents, morphology):
-    """Convert files into JSON format for use with train command and other
+def convert(cmd, input_file, output_dir, n_sents, morphology):
+    """
+    Convert files into JSON format for use with train command and other
     experiment management functions.
     """
     input_path = Path(input_file)
@@ -17,8 +17,9 @@ from .. import about
     direct=("force direct download. Needs model name with version and won't "
             "perform compatibility check", "flag", "d", bool)
 )
-def download(model, direct=False):
-    """Download compatible model from default download path using pip. Model
+def download(cmd, model, direct=False):
+    """
+    Download compatible model from default download path using pip. Model
     can be shortcut, model name or, if --direct flag is set, full model name
     with version.
     """
@@ -31,7 +32,7 @@ def download(model, direct=False):
     version = get_version(model_name, compatibility)
     download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model_name, v=version))
     try:
-        link(model_name, model, force=True)
+        link(None, model_name, model, force=True)
     except:
         # Dirty, but since spacy.download and the auto-linking is mostly
         # a convenience wrapper, it's best to show a success message and
@@ -14,7 +14,7 @@ from .. import util
     model=("optional: shortcut link of model", "positional", None, str),
     markdown=("generate Markdown for GitHub issues", "flag", "md", str)
 )
-def info(model=None, markdown=False):
+def info(cmd, model=None, markdown=False):
     """Print info about spaCy installation. If a model shortcut link is
     speficied as an argument, print model information. Flag --markdown
     prints details in Markdown for easy copy-pasting to GitHub issues.
@@ -14,8 +14,9 @@ from .. import util
     link_name=("name of shortuct link to create", "positional", None, str),
     force=("force overwriting of existing link", "flag", "f", bool)
 )
-def link(origin, link_name, force=False):
-    """Create a symlink for models within the spacy/data directory. Accepts
+def link(cmd, origin, link_name, force=False):
+    """
+    Create a symlink for models within the spacy/data directory. Accepts
     either the name of a pip package, or the local path to the model data
     directory. Linking models allows loading them via spacy.load(link_name).
     """
@@ -18,8 +18,9 @@ from .. import about
     meta=("path to meta.json", "option", "m", str),
     force=("force overwriting of existing folder in output directory", "flag", "f", bool)
 )
-def package(input_dir, output_dir, meta, force):
-    """Generate Python package for model data, including meta and required
+def package(cmd, input_dir, output_dir, meta=None, force=False):
+    """
+    Generate Python package for model data, including meta and required
     installation files. A new directory will be created in the specified
     output directory, and model data will be copied over.
     """
@@ -42,7 +43,7 @@ def package(input_dir, output_dir, meta, force):
         meta = util.read_json(meta_path)
     else:
         meta = generate_meta()
-    validate_meta(meta, ['lang', 'name', 'version'])
+    meta = validate_meta(meta, ['lang', 'name', 'version'])
 
     model_name = meta['lang'] + '_' + meta['name']
     model_name_v = model_name + '-' + meta['version']
@@ -85,20 +86,32 @@ def generate_meta():
                 ('email', 'Author email', False),
                 ('url', 'Author website', False),
                 ('license', 'License', 'CC BY-NC 3.0')]
 
     prints("Enter the package settings for your model.", title="Generating meta.json")
     meta = {}
     for setting, desc, default in settings:
         response = util.get_raw_input(desc, default)
         meta[setting] = default if response == '' and default else response
+    meta['pipeline'] = generate_pipeline()
     return meta
 
 
+def generate_pipeline():
+    prints("If set to 'True', the default pipeline is used. If set to 'False', "
+           "the pipeline will be disabled. Components should be specified as a "
+           "comma-separated list of component names, e.g. vectorizer, tagger, "
+           "parser, ner. For more information, see the docs on processing pipelines.",
+           title="Enter your model's pipeline components")
+    pipeline = util.get_raw_input("Pipeline components", True)
+    replace = {'True': True, 'False': False}
+    return replace[pipeline] if pipeline in replace else pipeline.split(', ')
+
+
 def validate_meta(meta, keys):
     for key in keys:
         if key not in meta or meta[key] == '':
             prints("This setting is required to build your package.",
                    title='No "%s" setting found in meta.json' % key, exits=1)
+    return meta
 
 
 def get_template(filepath):
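The new `generate_pipeline()` prompt accepts either the strings `'True'`/`'False'` or a comma-separated list of component names. A standalone illustration of that parsing rule (a sketch mirroring the logic above, not the spaCy code itself):

```python
def parse_pipeline_input(value):
    # 'True' keeps the default pipeline, 'False' disables it,
    # anything else is read as a comma-separated list of component names.
    replace = {'True': True, 'False': False}
    return replace[value] if value in replace else value.split(', ')

assert parse_pipeline_input('True') is True
assert parse_pipeline_input('False') is False
assert parse_pipeline_input('vectorizer, tagger, parser') == ['vectorizer', 'tagger', 'parser']
```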
@@ -32,9 +32,11 @@ from .. import displacy
     no_parser=("Don't train parser", "flag", "P", bool),
     no_entities=("Don't train NER", "flag", "N", bool)
 )
-def train(_, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
+def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
           use_gpu=False, no_tagger=False, no_parser=False, no_entities=False):
-    """Train a model. Expects data in spaCy's JSON format."""
+    """
+    Train a model. Expects data in spaCy's JSON format.
+    """
     n_sents = n_sents or None
     output_path = util.ensure_path(output_dir)
     train_path = util.ensure_path(train_data)
@@ -70,12 +72,12 @@ def train(_, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
 
     optimizer = nlp.begin_training(lambda: corpus.train_tuples, use_gpu=use_gpu)
 
-    print("Itn.\tDep. Loss\tUAS\tNER P.\tNER R.\tNER F.\tTag %\tToken %")
+    print("Itn.\tLoss\tUAS\tNER P.\tNER R.\tNER F.\tTag %\tToken %")
     try:
         for i in range(n_iter):
             with tqdm.tqdm(total=corpus.count_train(), leave=False) as pbar:
                 train_docs = corpus.train_docs(nlp, projectivize=True,
-                                               gold_preproc=False, max_length=1000)
+                                               gold_preproc=False, max_length=0)
                 losses = {}
                 for batch in minibatch(train_docs, size=batch_sizes):
                     docs, golds = zip(*batch)
@@ -1,8 +1,8 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, UNITS
-from ..char_classes import ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS, QUOTES
+from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS
+from ..char_classes import ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS, QUOTES, UNITS
 
 
 _currency = r"\$|¢|£|€|¥|฿|৳"
@@ -10,16 +10,16 @@ _quotes = QUOTES.replace("'", '')
 _list_punct = LIST_PUNCT + '। ॥'.strip().split()
 
 
-_prefixes = ([r'\+'] + _list_punct + LIST_ELLIPSES + LIST_QUOTES)
+_prefixes = ([r'\+'] + _list_punct + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS)
 
-_suffixes = (_list_punct + LIST_ELLIPSES + LIST_QUOTES +
+_suffixes = (_list_punct + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS +
              [r'(?<=[0-9])\+',
               r'(?<=°[FfCcKk])\.',
               r'(?<=[0-9])(?:{})'.format(_currency),
               r'(?<=[0-9])(?:{})'.format(UNITS),
               r'(?<=[{}(?:{})])\.'.format('|'.join([ALPHA_LOWER, r'%²\-\)\]\+', QUOTES]), _currency)])
 
-_infixes = (LIST_ELLIPSES +
+_infixes = (LIST_ELLIPSES + LIST_ICONS +
             [r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
              r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
              r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA),
@@ -20,7 +20,6 @@ _upper = [_latin_upper]
 _lower = [_latin_lower]
 _uncased = [_bengali, _hebrew]
 
-
 ALPHA = merge_char_classes(_upper + _lower + _uncased)
 ALPHA_LOWER = merge_char_classes(_lower + _uncased)
 ALPHA_UPPER = merge_char_classes(_upper + _uncased)
@@ -33,13 +32,14 @@ _currency = r'\$ £ € ¥ ฿ US\$ C\$ A\$'
 _punct = r'… , : ; \! \? ¿ ¡ \( \) \[ \] \{ \} < > _ # \* &'
 _quotes = r'\' \'\' " ” “ `` ` ‘ ´ ‚ , „ » «'
 _hyphens = '- – — -- ---'
+_other_symbols = r'[\p{So}]'
 
 UNITS = merge_chars(_units)
 CURRENCY = merge_chars(_currency)
 QUOTES = merge_chars(_quotes)
 PUNCT = merge_chars(_punct)
 HYPHENS = merge_chars(_hyphens)
+ICONS = _other_symbols
 
 LIST_UNITS = split_chars(_units)
 LIST_CURRENCY = split_chars(_currency)
@@ -47,3 +47,4 @@ LIST_QUOTES = split_chars(_quotes)
 LIST_PUNCT = split_chars(_punct)
 LIST_HYPHENS = split_chars(_hyphens)
 LIST_ELLIPSES = [r'\.\.+', '…']
+LIST_ICONS = [_other_symbols]
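The new `ICONS` class is built from the Unicode "Symbol, other" category. A quick sketch of what that character class matches, using the third-party `regex` package (which supports `\p{...}` properties; the stdlib `re` module does not):

```python
import regex  # pip install regex; needed for \p{So} support

ICONS = r'[\p{So}]'
assert regex.match(ICONS, '😀')     # emoji fall into the So category
assert regex.match(ICONS, '©')      # so do many other pictographic symbols
assert not regex.match(ICONS, 'a')  # plain letters do not
```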
@@ -2,15 +2,16 @@
 from __future__ import unicode_literals
 
 from .char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY
-from .char_classes import ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS, QUOTES
-from .char_classes import CURRENCY, UNITS
+from .char_classes import LIST_ICONS, ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS
+from .char_classes import QUOTES, CURRENCY, UNITS
 
 
 _prefixes = (['§', '%', '=', r'\+'] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES +
-             LIST_CURRENCY)
+             LIST_CURRENCY + LIST_ICONS)
 
 
-_suffixes = (["'s", "'S", "’s", "’S"] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES +
+_suffixes = (LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS +
+             ["'s", "'S", "’s", "’S"] +
              [r'(?<=[0-9])\+',
               r'(?<=°[FfCcKk])\.',
               r'(?<=[0-9])(?:{})'.format(CURRENCY),
@@ -19,7 +20,7 @@ _suffixes = (["'s", "'S", "’s", "’S"] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES +
               r'(?<=[{a}][{a}])\.'.format(a=ALPHA_UPPER)])
 
 
-_infixes = (LIST_ELLIPSES +
+_infixes = (LIST_ELLIPSES + LIST_ICONS +
             [r'(?<=[0-9])[+\-\*^](?=[0-9-])',
              r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
              r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
@@ -212,18 +212,17 @@ class Language(object):
         """
         tok2vec = self.pipeline[0]
         feats = tok2vec.doc2feats(docs)
-        procs = list(self.pipeline[1:])
-        random.shuffle(procs)
         grads = {}
         def get_grads(W, dW, key=None):
             grads[key] = (W, dW)
-        for proc in procs:
+        for proc in self.pipeline[1:]:
             if not hasattr(proc, 'update'):
                 continue
             tokvecses, bp_tokvecses = tok2vec.model.begin_update(feats, drop=drop)
             d_tokvecses = proc.update((docs, tokvecses), golds,
                                       drop=drop, sgd=get_grads, losses=losses)
-            bp_tokvecses(d_tokvecses, sgd=sgd)
+            if d_tokvecses is not None:
+                bp_tokvecses(d_tokvecses, sgd=sgd)
             for key, (W, dW) in grads.items():
                 sgd(W, dW, key=key)
         # Clear the tensor variable, to free GPU memory.
@@ -432,6 +432,8 @@ cdef class Parser:
                                             0.0)
         todo = [(s, g) for (s, g) in zip(states, golds)
                 if not s.is_final() and g is not None]
+        if not todo:
+            return None
 
         backprops = []
         d_tokvecs = state2vec.ops.allocate(tokvecs.shape)
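The two hunks above work together: `Parser.update()` may now return `None` when a batch contains nothing trainable, and `Language.update()` only backpropagates when a gradient actually came back. A standalone sketch of that contract (hypothetical toy component, not spaCy's own classes):

```python
class DummyComponent(object):
    """Toy component illustrating the 'return None when there is nothing to learn' contract."""
    def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None):
        docs, tokvecses = docs_tokvecs
        if not any(gold is not None for gold in golds):
            return None                                  # nothing trainable in this batch
        return [[0.0] * len(vec) for vec in tokvecses]   # dummy gradient per doc


def run_update(proc, docs, tokvecses, golds, bp_tokvecses, sgd=None):
    d_tokvecses = proc.update((docs, tokvecses), golds)
    if d_tokvecses is not None:                          # the guard added to Language.update()
        bp_tokvecses(d_tokvecses, sgd=sgd)
```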
@@ -1,7 +1,4 @@
 # coding: utf-8
-"""Test that tokenizer exceptions and emoticons are handled correctly."""
-
-
 from __future__ import unicode_literals
 
 import pytest
@@ -39,3 +36,12 @@ def test_tokenizer_handles_emoticons(tokenizer):
 def test_tokenizer_excludes_false_pos_emoticons(tokenizer, text, length):
     tokens = tokenizer(text)
     assert len(tokens) == length
+
+
+@pytest.mark.parametrize('text,length', [('can you still dunk?🍕🍔😵LOL', 8),
+                                         ('i💙you', 3), ('🤘🤘yay!', 4)])
+def test_tokenizer_handles_emoji(tokenizer, text, length):
+    exceptions = ["hu"]
+    tokens = tokenizer(text)
+    if tokens[0].lang_ not in exceptions:
+        assert len(tokens) == length
@@ -19,19 +19,17 @@ p
 
 p
     | When you load a model, spaCy first consults the model's
-    | #[+a("/docs/usage/saving-loading#models-generating") meta.json] for its
-    | #[code setup] details. This typically includes the ID of a language class,
+    | #[+a("/docs/usage/saving-loading#models-generating") meta.json]. The
+    | meta typically includes the model details, the ID of a language class,
     | and an optional list of pipeline components. spaCy then does the
     | following:
 
 +aside-code("meta.json (excerpt)", "json").
     {
         "name": "example_model",
+        "lang": "en"
         "description": "Example model for spaCy",
-        "setup": {
-            "lang": "en",
-            "pipeline": ["token_vectors", "tagger"]
-        }
+        "pipeline": ["token_vectors", "tagger"]
     }
 
 +list("numbers")
@@ -287,17 +285,15 @@ p
 
 p
     | In the model package's meta.json, specify the language class and pipeline
-    | IDs in #[code setup]:
+    | IDs:
 
 +code("meta.json (excerpt)", "json").
     {
-        "name": "my_sentiment_model",
+        "name": "sentiment_model",
+        "lang": "en",
         "version": "1.0.0",
         "spacy_version": ">=2.0.0,<3.0.0",
-        "setup": {
-            "lang": "en",
-            "pipeline": ["vectorizer", "sentiment"]
-        }
+        "pipeline": ["vectorizer", "sentiment"]
     }
 
 p
@@ -307,7 +303,7 @@ p
     | by your custom #[code "sentiment"] factory.
 
 +code.
-    nlp = spacy.load('my_sentiment_model')
+    nlp = spacy.load('en_sentiment_model')
     doc = nlp(u'I love pizza')
     assert doc.sentiment
 
 
@@ -129,13 +129,14 @@ p
 +code.
     import spacy
     from spacy.tokens.doc import Doc
+    from spacy.vocab import Vocab
 
     nlp = spacy.load('en')
     moby_dick = open('moby_dick.txt', 'r')
     doc = nlp(moby_dick)
     doc.to_disk('/moby_dick.bin')
 
-    new_doc = Doc().from_disk('/moby_dick.bin')
+    new_doc = Doc(Vocab()).from_disk('/moby_dick.bin')
 
 +infobox
     | #[strong Usage:] #[+a("/docs/usage/saving-loading") Saving and loading]
@@ -148,9 +149,14 @@ p
 
     nlp = spacy.load('en')
     matcher = Matcher(nlp.vocab)
-    # match "Google I/O" or "Google i/o"
-    pattern = [{'ORTH': 'Google'}, {'UPPER': 'I'}, {'ORTH': '/'}, {'UPPER': 'O'}]
-    matcher.add('GoogleIO', None, pattern)
+    def set_sentiment(matcher, doc, i, matches):
+        doc.sentiment += 0.1
 
+    pattern1 = [{'ORTH': 'Google'}, {'UPPER': 'I'}, {'ORTH': '/'}, {'UPPER': 'O'}]
+    pattern2 = [[{'ORTH': emoji, 'OP': '+'}] for emoji in ['😀', '😂', '🤣', '😍']]
+    matcher.add('GoogleIO', None, pattern1) # match "Google I/O" or "Google i/o"
+    matcher.add('HAPPY', set_sentiment, pattern2) # match one or more happy emoji
     matches = nlp(LOTS_OF TEXT)
 
 +infobox
@@ -11,7 +11,7 @@ p
     | You can also associate patterns with entity IDs, to allow some basic
     | entity linking or disambiguation.
 
-+aside("What about \"real\" regular expressions?")
+//-+aside("What about \"real\" regular expressions?")
 
 +h(2, "adding-patterns") Adding patterns
 
@@ -119,7 +119,7 @@ p
 +code.
     # Add a new custom flag to the vocab, which is always False by default.
     # BAD_HTML_FLAG will be the flag ID, which we can use to set it to True on the span.
-    BAD_HTML_FLAG = doc.vocab.add_flag(lambda text: False)
+    BAD_HTML_FLAG = nlp.vocab.add_flag(lambda text: False)
 
     def merge_and_flag(matcher, doc, i, matches):
         match_id, start, end = matches[i]
@@ -221,7 +221,7 @@ p
         +cell match 0 or 1 times
         +cell optional, max one
 
-+h(3, "quantifiers-example1") Quantifiers example: Using linguistic annotations
++h(2, "example1") Example: Using linguistic annotations
 
 p
     | Let's say you're analysing user comments and you want to find out what
@@ -283,7 +283,7 @@ p
     # set manual=True to make displaCy render straight from a dictionary
     displacy.serve(matched_sents, style='ent', manual=True)
 
-+h(3, "quantifiers-example2") Quantifiers example: Phone numbers
++h(2, "example2") Example: Phone numbers
 
 p
     | Phone numbers can have many different formats and matching them is often
@@ -320,3 +320,114 @@ p
     | It'll produce more predictable results, is much easier to modify and
     | extend, and doesn't require any training data – only a set of
     | test cases.
+
++h(2, "example3") Example: Hashtags and emoji on social media
+
+p
+    | Social media posts, especially tweets, can be difficult to work with.
+    | They're very short and often contain various emoji and hashtags. By only
+    | looking at the plain text, you'll lose a lot of valuable semantic
+    | information.
+
+p
+    | Let's say you've extracted a large sample of social media posts on a
+    | specific topic, for example posts mentioning a brand name or product.
+    | As the first step of your data exploration, you want to filter out posts
+    | containing certain emoji and use them to assign a general sentiment
+    | score, based on whether the expressed emotion is positive or negative,
+    | e.g. #[span.o-icon.o-icon--inline 😀] or #[span.o-icon.o-icon--inline 😞].
+    | You also want to find, merge and label hashtags like
+    | #[code #MondayMotivation], to be able to ignore or analyse them later.
+
++aside("Note on sentiment analysis")
+    | Ultimately, sentiment analysis is not always #[em that] easy. In
+    | addition to the emoji, you'll also want to take specific words into
+    | account and check the #[code subtree] for intensifiers like "very", to
+    | increase the sentiment score. At some point, you might also want to train
+    | a sentiment model. However, the approach described in this example is
+    | very useful for #[strong bootstrapping rules to gather training data].
+    | It's also an incredibly fast way to gather first insights into your data
+    | – with about 1 million tweets, you'd be looking at a processing time of
+    | #[strong under 1 minute].
+
+p
+    | By default, spaCy's tokenizer will split emoji into separate tokens. This
+    | means that you can create a pattern for one or more emoji tokens. In this
+    | case, a sequence of identical emoji should be treated as one instance.
+    | Valid hashtags usually consist of a #[code #], plus a sequence of
+    | ASCII characters with no whitespace, making them easy to match as well.
+
++code.
+    from spacy.lang.en import English
+    from spacy.matcher import Matcher
+
+    nlp = English() # we only want the tokenizer, so no need to load a model
+    matcher = Matcher(nlp.vocab)
+
+    pos_emoji = [u'😀', u'😃', u'😂', u'🤣', u'😊', u'😍'] # positive emoji
+    neg_emoji = [u'😞', u'😠', u'😩', u'😢', u'😭', u'😒'] # negative emoji
+
+    # add patterns to match one or more emoji tokens
+    pos_patterns = [[{'ORTH': emoji, 'OP': '+'}] for emoji in pos_emoji]
+    neg_patterns = [[{'ORTH': emoji, 'OP': '+'}] for emoji in neg_emoji]
+
+    matcher.add('HAPPY', label_sentiment, *pos_patterns) # add positive pattern
+    matcher.add('SAD', label_sentiment, *neg_patterns) # add negative pattern
+
+    # add pattern to merge valid hashtag, i.e. '#' plus any ASCII token
+    matcher.add('HASHTAG', merge_hashtag, [{'ORTH': '#'}, {'IS_ASCII': True}])
+
+p
+    | Because the #[code on_match] callback receives the ID of each match, you
+    | can use the same function to handle the sentiment assignment for both
+    | the positive and negative pattern. To keep it simple, we'll either add
+    | or subtract #[code 0.1] points – this way, the score will also reflect
+    | combinations of emoji, even positive #[em and] negative ones.
+
+p
+    | With a library like
+    | #[+a("https://github.com/bcongdon/python-emojipedia") Emojipedia],
+    | we can also retrieve a short description for each emoji – for example,
+    | #[span.o-icon.o-icon--inline 😍]'s official title is "Smiling Face With
+    | Heart-Eyes". Assigning it to the merged token's norm will make it
+    | available as #[code token.norm_].
+
++code.
+    from emojipedia import Emojipedia # installation: pip install emojipedia
+
+    def label_sentiment(matcher, doc, i, matches):
+        match_id, start, end = matches[i]
+        if match_id is 'HAPPY':
+            doc.sentiment += 0.1 # add 0.1 for positive sentiment
+        elif match_id is 'SAD':
+            doc.sentiment -= 0.1 # subtract 0.1 for negative sentiment
+        span = doc[start : end]
+        emoji = Emojipedia.search(span[0].text) # get data for emoji
+        span.merge(norm=emoji.title) # merge span and set NORM to emoji title
+
+p
+    | To label the hashtags, we first need to add a new custom flag.
+    | #[code IS_HASHTAG] will be the flag's ID, which you can use to assign it
+    | to the hashtag's span, and check its value via a token's
+    | #[+api("token#check_flag") #[code code check_flag()]] method. On each
+    | match, we merge the hashtag and assign the flag.
+
++code.
+    # Add a new custom flag to the vocab, which is always False by default
+    IS_HASHTAG = nlp.vocab.add_flag(lambda text: False)
+
+    def merge_hashtag(matcher, doc, i, matches):
+        match_id, start, end = matches[i]
+        span = doc[start : end]
+        span.merge() # merge hashtag
+        span.set_flag(IS_HASHTAG, True) # set IS_HASHTAG to True
+
+p
+    | To process a stream of social media posts, we can use
+    | #[+api("language#pipe") #[code Language.pipe()]], which will return a
+    | stream of #[code Doc] objects that we can pass to
+    | #[+api("matcher#pipe") #[code Matcher.pipe()]].
+
++code.
+    docs = nlp.pipe(LOTS_OF_TWEETS)
+    matches = matcher.pipe(docs)
@@ -74,16 +74,14 @@ p
 +aside-code("meta.json", "json").
     {
         "name": "example_model",
+        "lang": "en",
         "version": "1.0.0",
         "spacy_version": ">=2.0.0,<3.0.0",
         "description": "Example model for spaCy",
         "author": "You",
         "email": "you@example.com",
         "license": "CC BY-SA 3.0",
-        "setup": {
-            "lang": "en",
-            "pipeline": ["token_vectors", "tagger"]
-        }
+        "pipeline": ["token_vectors", "tagger"]
     }
 
 +code(false, "bash").
@@ -110,9 +108,9 @@ p
 +h(3, "models-custom") Customising the model setup
 
 p
-    | The meta.json includes a #[code setup] key that lets you customise how
-    | the model should be initialised and loaded. You can define the language
-    | data to be loaded and the
+    | The meta.json includes the model details, like name, requirements and
+    | license, and lets you customise how the model should be initialised and
+    | loaded. You can define the language data to be loaded and the
     | #[+a("/docs/usage/language-processing-pipeline") processing pipeline] to
     | execute.
 
@@ -183,9 +181,9 @@ p
 p
     | To load a model from a data directory, you can use
     | #[+api("spacy#load") #[code spacy.load()]] with the local path. This will
-    | look for a meta.json in the directory and use the #[code setup] details
-    | to initialise a #[code Language] class with a processing pipeline and
-    | load in the model data.
+    | look for a meta.json in the directory and use the #[code lang] and
+    | #[code pipeline] settings to initialise a #[code Language] class with a
+    | processing pipeline and load in the model data.
 
 +code.
     nlp = spacy.load('/path/to/model')
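For orientation, a rough sketch of what that directory load involves, per the docs above (simplified, hypothetical helper; not the actual spacy.load implementation):

```python
import json
from pathlib import Path

def sketch_load(model_path):
    # Read the model's meta.json and pull out the two settings the docs mention.
    meta = json.loads((Path(model_path) / 'meta.json').read_text(encoding='utf8'))
    lang = meta['lang']                  # which Language class to initialise
    pipeline = meta.get('pipeline', [])  # which pipeline components to create
    return lang, pipeline
```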