Merge branch 'develop' of https://github.com/explosion/spaCy into develop

Matthew Honnibal 2017-05-27 16:34:28 -05:00
commit 49235017bf
17 changed files with 209 additions and 67 deletions

View File

@@ -4,7 +4,7 @@ from __future__ import unicode_literals
 import importlib
 from .compat import basestring_
-from .cli.info import info
+from .cli.info import info as cli_info
 from .glossary import explain
 from .deprecated import resolve_load_name
 from . import util
@@ -20,3 +20,7 @@ def load(name, **overrides):
     overrides['meta'] = meta
     overrides['path'] = model_path
     return cls(**overrides)
+
+
+def info(model=None, markdown=False):
+    return cli_info(None, model, markdown)
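
The new top-level wrapper re-exposes the CLI's info command as a plain Python helper. A minimal usage sketch, assuming a model shortcut such as 'en' is installed (the shortcut name here is only an example):

    import spacy

    spacy.info()                      # print info about the spaCy installation
    spacy.info('en', markdown=True)   # print details for a linked model, formatted as Markdown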

View File

@@ -24,8 +24,9 @@ CONVERTERS = {
     n_sents=("Number of sentences per doc", "option", "n", float),
     morphology=("Enable appending morphology to tags", "flag", "m", bool)
 )
-def convert(_, input_file, output_dir, n_sents, morphology):
-    """Convert files into JSON format for use with train command and other
+def convert(cmd, input_file, output_dir, n_sents, morphology):
+    """
+    Convert files into JSON format for use with train command and other
     experiment management functions.
     """
     input_path = Path(input_file)

View File

@@ -17,8 +17,9 @@ from .. import about
     direct=("force direct download. Needs model name with version and won't "
             "perform compatibility check", "flag", "d", bool)
 )
-def download(model, direct=False):
-    """Download compatible model from default download path using pip. Model
+def download(cmd, model, direct=False):
+    """
+    Download compatible model from default download path using pip. Model
     can be shortcut, model name or, if --direct flag is set, full model name
     with version.
     """
@@ -31,7 +32,7 @@ def download(model, direct=False):
         version = get_version(model_name, compatibility)
         download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model_name, v=version))
         try:
-            link(model_name, model, force=True)
+            link(None, model_name, model, force=True)
         except:
             # Dirty, but since spacy.download and the auto-linking is mostly
             # a convenience wrapper, it's best to show a success message and

View File

@@ -14,7 +14,7 @@ from .. import util
     model=("optional: shortcut link of model", "positional", None, str),
     markdown=("generate Markdown for GitHub issues", "flag", "md", str)
 )
-def info(model=None, markdown=False):
+def info(cmd, model=None, markdown=False):
     """Print info about spaCy installation. If a model shortcut link is
     speficied as an argument, print model information. Flag --markdown
     prints details in Markdown for easy copy-pasting to GitHub issues.

View File

@@ -14,8 +14,9 @@ from .. import util
     link_name=("name of shortuct link to create", "positional", None, str),
     force=("force overwriting of existing link", "flag", "f", bool)
 )
-def link(origin, link_name, force=False):
-    """Create a symlink for models within the spacy/data directory. Accepts
+def link(cmd, origin, link_name, force=False):
+    """
+    Create a symlink for models within the spacy/data directory. Accepts
     either the name of a pip package, or the local path to the model data
     directory. Linking models allows loading them via spacy.load(link_name).
     """

View File

@@ -18,8 +18,9 @@ from .. import about
     meta=("path to meta.json", "option", "m", str),
     force=("force overwriting of existing folder in output directory", "flag", "f", bool)
 )
-def package(input_dir, output_dir, meta, force):
-    """Generate Python package for model data, including meta and required
+def package(cmd, input_dir, output_dir, meta=None, force=False):
+    """
+    Generate Python package for model data, including meta and required
     installation files. A new directory will be created in the specified
     output directory, and model data will be copied over.
     """
@@ -42,7 +43,7 @@ def package(input_dir, output_dir, meta, force):
         meta = util.read_json(meta_path)
     else:
         meta = generate_meta()
-    validate_meta(meta, ['lang', 'name', 'version'])
+    meta = validate_meta(meta, ['lang', 'name', 'version'])

     model_name = meta['lang'] + '_' + meta['name']
     model_name_v = model_name + '-' + meta['version']
@@ -85,20 +86,32 @@ def generate_meta():
                 ('email', 'Author email', False),
                 ('url', 'Author website', False),
                 ('license', 'License', 'CC BY-NC 3.0')]
     prints("Enter the package settings for your model.", title="Generating meta.json")
     meta = {}
     for setting, desc, default in settings:
         response = util.get_raw_input(desc, default)
         meta[setting] = default if response == '' and default else response
+    meta['pipeline'] = generate_pipeline()
     return meta


+def generate_pipeline():
+    prints("If set to 'True', the default pipeline is used. If set to 'False', "
+           "the pipeline will be disabled. Components should be specified as a "
+           "comma-separated list of component names, e.g. vectorizer, tagger, "
+           "parser, ner. For more information, see the docs on processing pipelines.",
+           title="Enter your model's pipeline components")
+    pipeline = util.get_raw_input("Pipeline components", True)
+    replace = {'True': True, 'False': False}
+    return replace[pipeline] if pipeline in replace else pipeline.split(', ')
+
+
 def validate_meta(meta, keys):
     for key in keys:
         if key not in meta or meta[key] == '':
             prints("This setting is required to build your package.",
                    title='No "%s" setting found in meta.json' % key, exits=1)
+    return meta


 def get_template(filepath):
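
For reference, the new prompt accepts either a boolean-style answer or a comma-separated list of component names. A standalone sketch of the same parsing behaviour (the helper name is illustrative, not part of the commit):

    def parse_pipeline_answer(answer):
        # mirrors the replace-dict logic in generate_pipeline() above
        replace = {'True': True, 'False': False}
        return replace[answer] if answer in replace else answer.split(', ')

    assert parse_pipeline_answer('True') is True     # use the default pipeline
    assert parse_pipeline_answer('False') is False   # disable the pipeline
    assert parse_pipeline_answer('vectorizer, tagger, parser') == ['vectorizer', 'tagger', 'parser']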

View File

@@ -32,9 +32,11 @@ from .. import displacy
     no_parser=("Don't train parser", "flag", "P", bool),
     no_entities=("Don't train NER", "flag", "N", bool)
 )
-def train(_, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
+def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
           use_gpu=False, no_tagger=False, no_parser=False, no_entities=False):
-    """Train a model. Expects data in spaCy's JSON format."""
+    """
+    Train a model. Expects data in spaCy's JSON format.
+    """
     n_sents = n_sents or None
     output_path = util.ensure_path(output_dir)
     train_path = util.ensure_path(train_data)
@@ -70,12 +72,12 @@ def train(_, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
     optimizer = nlp.begin_training(lambda: corpus.train_tuples, use_gpu=use_gpu)
-    print("Itn.\tDep. Loss\tUAS\tNER P.\tNER R.\tNER F.\tTag %\tToken %")
+    print("Itn.\tLoss\tUAS\tNER P.\tNER R.\tNER F.\tTag %\tToken %")
     try:
         for i in range(n_iter):
             with tqdm.tqdm(total=corpus.count_train(), leave=False) as pbar:
                 train_docs = corpus.train_docs(nlp, projectivize=True,
-                                               gold_preproc=False, max_length=1000)
+                                               gold_preproc=False, max_length=0)
                 losses = {}
                 for batch in minibatch(train_docs, size=batch_sizes):
                     docs, golds = zip(*batch)

View File

@@ -1,8 +1,8 @@
 # coding: utf8
 from __future__ import unicode_literals

-from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, UNITS
-from ..char_classes import ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS, QUOTES
+from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS
+from ..char_classes import ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS, QUOTES, UNITS


 _currency = r"\$|¢|£|€|¥|฿|৳"
@@ -10,16 +10,16 @@ _quotes = QUOTES.replace("'", '')
 _list_punct = LIST_PUNCT + '। ॥'.strip().split()

-_prefixes = ([r'\+'] + _list_punct + LIST_ELLIPSES + LIST_QUOTES)
+_prefixes = ([r'\+'] + _list_punct + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS)

-_suffixes = (_list_punct + LIST_ELLIPSES + LIST_QUOTES +
+_suffixes = (_list_punct + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS +
             [r'(?<=[0-9])\+',
              r'(?<=°[FfCcKk])\.',
              r'(?<=[0-9])(?:{})'.format(_currency),
              r'(?<=[0-9])(?:{})'.format(UNITS),
              r'(?<=[{}(?:{})])\.'.format('|'.join([ALPHA_LOWER, r'%²\-\)\]\+', QUOTES]), _currency)])

-_infixes = (LIST_ELLIPSES +
+_infixes = (LIST_ELLIPSES + LIST_ICONS +
            [r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
             r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
             r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA),

View File

@@ -20,7 +20,6 @@ _upper = [_latin_upper]
 _lower = [_latin_lower]
 _uncased = [_bengali, _hebrew]

 ALPHA = merge_char_classes(_upper + _lower + _uncased)
 ALPHA_LOWER = merge_char_classes(_lower + _uncased)
 ALPHA_UPPER = merge_char_classes(_upper + _uncased)
@@ -33,13 +32,14 @@ _currency = r'\$ £ € ¥ ฿ US\$ C\$ A\$'
 _punct = r'… , : ; \! \? ¿ ¡ \( \) \[ \] \{ \} < > _ # \* &'
 _quotes = r'\' \'\' " ” “ `` ` ´ , „ » «'
 _hyphens = '- — -- ---'
+_other_symbols = r'[\p{So}]'

 UNITS = merge_chars(_units)
 CURRENCY = merge_chars(_currency)
 QUOTES = merge_chars(_quotes)
 PUNCT = merge_chars(_punct)
 HYPHENS = merge_chars(_hyphens)
+ICONS = _other_symbols

 LIST_UNITS = split_chars(_units)
 LIST_CURRENCY = split_chars(_currency)
@@ -47,3 +47,4 @@ LIST_QUOTES = split_chars(_quotes)
 LIST_PUNCT = split_chars(_punct)
 LIST_HYPHENS = split_chars(_hyphens)
 LIST_ELLIPSES = [r'\.\.+', '…']
+LIST_ICONS = [_other_symbols]
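
For context: \p{So} is the Unicode "Symbol, other" category, which covers emoji and similar pictographs, so adding the new ICONS class to the prefix, suffix and infix rules lets the tokenizer split emoji off as their own tokens. Unicode property classes like this require a Unicode-aware engine; a small illustrative check, assuming the third-party regex package (which supports \p{..} classes, unlike the standard-library re module) is installed:

    import regex  # pip install regex; stdlib 're' has no \p{So} support

    icons = regex.compile(r'[\p{So}]')
    assert icons.search(u'i💙you') is not None    # 💙 is in the "Symbol, other" category
    assert icons.search(u'plain text') is None    # ordinary letters are not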

View File

@@ -2,15 +2,16 @@
 from __future__ import unicode_literals

 from .char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY
-from .char_classes import ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS, QUOTES
-from .char_classes import CURRENCY, UNITS
+from .char_classes import LIST_ICONS, ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS
+from .char_classes import QUOTES, CURRENCY, UNITS


 _prefixes = (['§', '%', '=', r'\+'] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES +
-             LIST_CURRENCY)
+             LIST_CURRENCY + LIST_ICONS)


-_suffixes = (["'s", "'S", "s", "S"] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES +
+_suffixes = (LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS +
+             ["'s", "'S", "s", "S"] +
              [r'(?<=[0-9])\+',
               r'(?<=°[FfCcKk])\.',
               r'(?<=[0-9])(?:{})'.format(CURRENCY),
@@ -19,7 +20,7 @@ _suffixes = (["'s", "'S", "s", "S"] + LIST_PUNCT + LIST_ELLIPSES + LIST_QU
               r'(?<=[{a}][{a}])\.'.format(a=ALPHA_UPPER)])


-_infixes = (LIST_ELLIPSES +
+_infixes = (LIST_ELLIPSES + LIST_ICONS +
             [r'(?<=[0-9])[+\-\*^](?=[0-9-])',
              r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
              r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),

View File

@@ -212,18 +212,17 @@ class Language(object):
         """
         tok2vec = self.pipeline[0]
         feats = tok2vec.doc2feats(docs)
-        procs = list(self.pipeline[1:])
-        random.shuffle(procs)
         grads = {}
         def get_grads(W, dW, key=None):
             grads[key] = (W, dW)
-        for proc in procs:
+        for proc in self.pipeline[1:]:
             if not hasattr(proc, 'update'):
                 continue
             tokvecses, bp_tokvecses = tok2vec.model.begin_update(feats, drop=drop)
             d_tokvecses = proc.update((docs, tokvecses), golds,
                                       drop=drop, sgd=get_grads, losses=losses)
-            bp_tokvecses(d_tokvecses, sgd=sgd)
+            if d_tokvecses is not None:
+                bp_tokvecses(d_tokvecses, sgd=sgd)
         for key, (W, dW) in grads.items():
             sgd(W, dW, key=key)
         # Clear the tensor variable, to free GPU memory.

View File

@@ -432,6 +432,8 @@ cdef class Parser:
                                       0.0)
         todo = [(s, g) for (s, g) in zip(states, golds)
                 if not s.is_final() and g is not None]
+        if not todo:
+            return None
         backprops = []
         d_tokvecs = state2vec.ops.allocate(tokvecs.shape)
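
Taken together with the Language.update() change above, this means the parser's update() now signals "nothing left to learn from this batch" by returning None, and the caller only backpropagates through the tok2vec layer when a real gradient came back. A minimal sketch of that calling pattern, with hypothetical names standing in for the pipeline internals:

    # hypothetical sketch of the guard pattern introduced above (names are illustrative)
    def update_component(proc, docs, tokvecses, golds, bp_tokvecses, sgd):
        d_tokvecses = proc.update((docs, tokvecses), golds)  # may return None if every state is final
        if d_tokvecses is not None:                          # only backprop real gradients
            bp_tokvecses(d_tokvecses, sgd=sgd)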

View File

@@ -1,7 +1,4 @@
 # coding: utf-8
-"""Test that tokenizer exceptions and emoticons are handled correctly."""
-
 from __future__ import unicode_literals

 import pytest
@@ -39,3 +36,12 @@ def test_tokenizer_handles_emoticons(tokenizer):
 def test_tokenizer_excludes_false_pos_emoticons(tokenizer, text, length):
     tokens = tokenizer(text)
     assert len(tokens) == length
+
+
+@pytest.mark.parametrize('text,length', [('can you still dunk?🍕🍔😵LOL', 8),
+                                         ('i💙you', 3), ('🤘🤘yay!', 4)])
+def test_tokenizer_handles_emoji(tokenizer, text, length):
+    exceptions = ["hu"]
+    tokens = tokenizer(text)
+    if tokens[0].lang_ not in exceptions:
+        assert len(tokens) == length

View File

@@ -19,19 +19,17 @@ p
 p
     | When you load a model, spaCy first consults the model's
-    | #[+a("/docs/usage/saving-loading#models-generating") meta.json] for its
-    | #[code setup] details. This typically includes the ID of a language class,
+    | #[+a("/docs/usage/saving-loading#models-generating") meta.json]. The
+    | meta typically includes the model details, the ID of a language class,
     | and an optional list of pipeline components. spaCy then does the
     | following:

 +aside-code("meta.json (excerpt)", "json").
     {
         "name": "example_model",
+        "lang": "en",
         "description": "Example model for spaCy",
-        "setup": {
-            "lang": "en",
-            "pipeline": ["token_vectors", "tagger"]
-        }
+        "pipeline": ["token_vectors", "tagger"]
     }

 +list("numbers")
@@ -287,17 +285,15 @@ p
 p
     | In the model package's meta.json, specify the language class and pipeline
-    | IDs in #[code setup]:
+    | IDs:

 +code("meta.json (excerpt)", "json").
     {
-        "name": "my_sentiment_model",
+        "name": "sentiment_model",
+        "lang": "en",
         "version": "1.0.0",
         "spacy_version": "&gt;=2.0.0,&lt;3.0.0",
-        "setup": {
-            "lang": "en",
-            "pipeline": ["vectorizer", "sentiment"]
-        }
+        "pipeline": ["vectorizer", "sentiment"]
     }

 p
@@ -307,7 +303,7 @@ p
     | by your custom #[code "sentiment"] factory.

 +code.
-    nlp = spacy.load('my_sentiment_model')
+    nlp = spacy.load('en_sentiment_model')
     doc = nlp(u'I love pizza')
     assert doc.sentiment

View File

@@ -129,13 +129,14 @@ p
 +code.
     import spacy
     from spacy.tokens.doc import Doc
+    from spacy.vocab import Vocab

     nlp = spacy.load('en')
     moby_dick = open('moby_dick.txt', 'r')
     doc = nlp(moby_dick)
     doc.to_disk('/moby_dick.bin')

-    new_doc = Doc().from_disk('/moby_dick.bin')
+    new_doc = Doc(Vocab()).from_disk('/moby_dick.bin')

 +infobox
     | #[strong Usage:] #[+a("/docs/usage/saving-loading") Saving and loading]
@@ -148,9 +149,14 @@ p
     nlp = spacy.load('en')
     matcher = Matcher(nlp.vocab)

-    # match "Google I/O" or "Google i/o"
-    pattern = [{'ORTH': 'Google'}, {'UPPER': 'I'}, {'ORTH': '/'}, {'UPPER': 'O'}]
-    matcher.add('GoogleIO', None, pattern)
+    def set_sentiment(matcher, doc, i, matches):
+        doc.sentiment += 0.1
+
+    pattern1 = [{'ORTH': 'Google'}, {'UPPER': 'I'}, {'ORTH': '/'}, {'UPPER': 'O'}]
+    pattern2 = [[{'ORTH': emoji, 'OP': '+'}] for emoji in ['😀', '😂', '🤣', '😍']]
+    matcher.add('GoogleIO', None, pattern1)  # match "Google I/O" or "Google i/o"
+    matcher.add('HAPPY', set_sentiment, pattern2)  # match one or more happy emoji
     matches = nlp(LOTS_OF_TEXT)

 +infobox

View File

@@ -11,7 +11,7 @@ p
     | You can also associate patterns with entity IDs, to allow some basic
     | entity linking or disambiguation.

-+aside("What about \"real\" regular expressions?")
+//-+aside("What about \"real\" regular expressions?")

 +h(2, "adding-patterns") Adding patterns
@@ -119,7 +119,7 @@ p
 +code.
     # Add a new custom flag to the vocab, which is always False by default.
     # BAD_HTML_FLAG will be the flag ID, which we can use to set it to True on the span.
-    BAD_HTML_FLAG = doc.vocab.add_flag(lambda text: False)
+    BAD_HTML_FLAG = nlp.vocab.add_flag(lambda text: False)

     def merge_and_flag(matcher, doc, i, matches):
         match_id, start, end = matches[i]
@@ -221,7 +221,7 @@ p
         +cell match 0 or 1 times
         +cell optional, max one

-+h(3, "quantifiers-example1") Quantifiers example: Using linguistic annotations
++h(2, "example1") Example: Using linguistic annotations

 p
     | Let's say you're analysing user comments and you want to find out what
@@ -283,7 +283,7 @@ p
     # set manual=True to make displaCy render straight from a dictionary
     displacy.serve(matched_sents, style='ent', manual=True)

-+h(3, "quantifiers-example2") Quantifiers example: Phone numbers
++h(2, "example2") Example: Phone numbers

 p
     | Phone numbers can have many different formats and matching them is often
@@ -320,3 +320,114 @@ p
     | It'll produce more predictable results, is much easier to modify and
     | extend, and doesn't require any training data, only a set of
     | test cases.
+
++h(2, "example3") Example: Hashtags and emoji on social media
+
+p
+    | Social media posts, especially tweets, can be difficult to work with.
+    | They're very short and often contain various emoji and hashtags. By only
+    | looking at the plain text, you'll lose a lot of valuable semantic
+    | information.
+
+p
+    | Let's say you've extracted a large sample of social media posts on a
+    | specific topic, for example posts mentioning a brand name or product.
+    | As the first step of your data exploration, you want to filter out posts
+    | containing certain emoji and use them to assign a general sentiment
+    | score, based on whether the expressed emotion is positive or negative,
+    | e.g. #[span.o-icon.o-icon--inline 😀] or #[span.o-icon.o-icon--inline 😞].
+    | You also want to find, merge and label hashtags like
+    | #[code #MondayMotivation], to be able to ignore or analyse them later.
+
++aside("Note on sentiment analysis")
+    | Ultimately, sentiment analysis is not always #[em that] easy. In
+    | addition to the emoji, you'll also want to take specific words into
+    | account and check the #[code subtree] for intensifiers like "very", to
+    | increase the sentiment score. At some point, you might also want to train
+    | a sentiment model. However, the approach described in this example is
+    | very useful for #[strong bootstrapping rules to gather training data].
+    | It's also an incredibly fast way to gather first insights into your data:
+    | with about 1 million tweets, you'd be looking at a processing time of
+    | #[strong under 1 minute].
+
+p
+    | By default, spaCy's tokenizer will split emoji into separate tokens. This
+    | means that you can create a pattern for one or more emoji tokens. In this
+    | case, a sequence of identical emoji should be treated as one instance.
+    | Valid hashtags usually consist of a #[code #], plus a sequence of
+    | ASCII characters with no whitespace, making them easy to match as well.
+
++code.
+    from spacy.lang.en import English
+    from spacy.matcher import Matcher
+
+    nlp = English()  # we only want the tokenizer, so no need to load a model
+    matcher = Matcher(nlp.vocab)
+
+    pos_emoji = [u'😀', u'😃', u'😂', u'🤣', u'😊', u'😍']  # positive emoji
+    neg_emoji = [u'😞', u'😠', u'😩', u'😢', u'😭', u'😒']  # negative emoji
+
+    # add patterns to match one or more emoji tokens
+    pos_patterns = [[{'ORTH': emoji, 'OP': '+'}] for emoji in pos_emoji]
+    neg_patterns = [[{'ORTH': emoji, 'OP': '+'}] for emoji in neg_emoji]
+
+    matcher.add('HAPPY', label_sentiment, *pos_patterns)  # add positive pattern
+    matcher.add('SAD', label_sentiment, *neg_patterns)  # add negative pattern
+
+    # add pattern to merge valid hashtag, i.e. '#' plus any ASCII token
+    matcher.add('HASHTAG', merge_hashtag, [{'ORTH': '#'}, {'IS_ASCII': True}])
+
+p
+    | Because the #[code on_match] callback receives the ID of each match, you
+    | can use the same function to handle the sentiment assignment for both
+    | the positive and negative pattern. To keep it simple, we'll either add
+    | or subtract #[code 0.1] points; this way, the score will also reflect
+    | combinations of emoji, even positive #[em and] negative ones.
+
+p
+    | With a library like
+    | #[+a("https://github.com/bcongdon/python-emojipedia") Emojipedia],
+    | we can also retrieve a short description for each emoji, for example
+    | #[span.o-icon.o-icon--inline 😍]'s official title is "Smiling Face With
+    | Heart-Eyes". Assigning it to the merged token's norm will make it
+    | available as #[code token.norm_].
+
++code.
+    from emojipedia import Emojipedia  # installation: pip install emojipedia
+
+    def label_sentiment(matcher, doc, i, matches):
+        match_id, start, end = matches[i]
+        if match_id is 'HAPPY':
+            doc.sentiment += 0.1  # add 0.1 for positive sentiment
+        elif match_id is 'SAD':
+            doc.sentiment -= 0.1  # subtract 0.1 for negative sentiment
+        span = doc[start : end]
+        emoji = Emojipedia.search(span[0].text)  # get data for emoji
+        span.merge(norm=emoji.title)  # merge span and set NORM to emoji title
+
+p
+    | To label the hashtags, we first need to add a new custom flag.
+    | #[code IS_HASHTAG] will be the flag's ID, which you can use to assign it
+    | to the hashtag's span, and check its value via a token's
+    | #[+api("token#check_flag") #[code check_flag()]] method. On each
+    | match, we merge the hashtag and assign the flag.
+
++code.
+    # Add a new custom flag to the vocab, which is always False by default
+    IS_HASHTAG = nlp.vocab.add_flag(lambda text: False)
+
+    def merge_hashtag(matcher, doc, i, matches):
+        match_id, start, end = matches[i]
+        span = doc[start : end]
+        span.merge()  # merge hashtag
+        span.set_flag(IS_HASHTAG, True)  # set IS_HASHTAG to True
+
+p
+    | To process a stream of social media posts, we can use
+    | #[+api("language#pipe") #[code Language.pipe()]], which will return a
+    | stream of #[code Doc] objects that we can pass to
+    | #[+api("matcher#pipe") #[code Matcher.pipe()]].
+
++code.
+    docs = nlp.pipe(LOTS_OF_TWEETS)
+    matches = matcher.pipe(docs)

View File

@@ -74,16 +74,14 @@ p
 +aside-code("meta.json", "json").
     {
         "name": "example_model",
+        "lang": "en",
         "version": "1.0.0",
         "spacy_version": "&gt;=2.0.0,&lt;3.0.0",
         "description": "Example model for spaCy",
         "author": "You",
         "email": "you@example.com",
         "license": "CC BY-SA 3.0",
-        "setup": {
-            "lang": "en",
-            "pipeline": ["token_vectors", "tagger"]
-        }
+        "pipeline": ["token_vectors", "tagger"]
     }

 +code(false, "bash").
@@ -110,9 +108,9 @@ p
 +h(3, "models-custom") Customising the model setup

 p
-    | The meta.json includes a #[code setup] key that lets you customise how
-    | the model should be initialised and loaded. You can define the language
-    | data to be loaded and the
+    | The meta.json includes the model details, like name, requirements and
+    | license, and lets you customise how the model should be initialised and
+    | loaded. You can define the language data to be loaded and the
     | #[+a("/docs/usage/language-processing-pipeline") processing pipeline] to
     | execute.
@@ -183,9 +181,9 @@ p
 p
     | To load a model from a data directory, you can use
     | #[+api("spacy#load") #[code spacy.load()]] with the local path. This will
-    | look for a meta.json in the directory and use the #[code setup] details
-    | to initialise a #[code Language] class with a processing pipeline and
-    | load in the model data.
+    | look for a meta.json in the directory and use the #[code lang] and
+    | #[code pipeline] settings to initialise a #[code Language] class with a
+    | processing pipeline and load in the model data.

 +code.
     nlp = spacy.load('/path/to/model')