Merge branch 'develop' of https://github.com/explosion/spaCy into develop

Matthew Honnibal 2017-05-27 16:34:28 -05:00
commit 49235017bf
17 changed files with 209 additions and 67 deletions

View File

@@ -4,7 +4,7 @@ from __future__ import unicode_literals
import importlib
from .compat import basestring_
from .cli.info import info
from .cli.info import info as cli_info
from .glossary import explain
from .deprecated import resolve_load_name
from . import util
@@ -20,3 +20,7 @@ def load(name, **overrides):
overrides['meta'] = meta
overrides['path'] = model_path
return cls(**overrides)
def info(model=None, markdown=False):
return cli_info(None, model, markdown)
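A standalone sketch, not part of this diff: with the wrapper above, the CLI's info command becomes callable straight from Python, without plac injecting the first argument (assuming an 'en' shortcut link already exists):

import spacy

spacy.info()              # print details about the spaCy installation
spacy.info('en')          # print details about the linked 'en' model
spacy.info(markdown=True) # format the details for a GitHub issue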

View File

@@ -24,8 +24,9 @@ CONVERTERS = {
n_sents=("Number of sentences per doc", "option", "n", float),
morphology=("Enable appending morphology to tags", "flag", "m", bool)
)
def convert(_, input_file, output_dir, n_sents, morphology):
"""Convert files into JSON format for use with train command and other
def convert(cmd, input_file, output_dir, n_sents, morphology):
"""
Convert files into JSON format for use with train command and other
experiment management functions.
"""
input_path = Path(input_file)

View File

@@ -17,8 +17,9 @@ from .. import about
direct=("force direct download. Needs model name with version and won't "
"perform compatibility check", "flag", "d", bool)
)
def download(model, direct=False):
"""Download compatible model from default download path using pip. Model
def download(cmd, model, direct=False):
"""
Download compatible model from default download path using pip. Model
can be shortcut, model name or, if --direct flag is set, full model name
with version.
"""
@@ -31,7 +32,7 @@ def download(model, direct=False):
version = get_version(model_name, compatibility)
download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model_name, v=version))
try:
link(model_name, model, force=True)
link(None, model_name, model, force=True)
except:
# Dirty, but since spacy.download and the auto-linking is mostly
# a convenience wrapper, it's best to show a success message and

View File

@@ -14,7 +14,7 @@ from .. import util
model=("optional: shortcut link of model", "positional", None, str),
markdown=("generate Markdown for GitHub issues", "flag", "md", str)
)
def info(model=None, markdown=False):
def info(cmd, model=None, markdown=False):
"""Print info about spaCy installation. If a model shortcut link is
specified as an argument, print model information. Flag --markdown
prints details in Markdown for easy copy-pasting to GitHub issues.

View File

@@ -14,8 +14,9 @@ from .. import util
link_name=("name of shortcut link to create", "positional", None, str),
force=("force overwriting of existing link", "flag", "f", bool)
)
def link(origin, link_name, force=False):
"""Create a symlink for models within the spacy/data directory. Accepts
def link(cmd, origin, link_name, force=False):
"""
Create a symlink for models within the spacy/data directory. Accepts
either the name of a pip package, or the local path to the model data
directory. Linking models allows loading them via spacy.load(link_name).
"""

View File

@@ -18,8 +18,9 @@ from .. import about
meta=("path to meta.json", "option", "m", str),
force=("force overwriting of existing folder in output directory", "flag", "f", bool)
)
def package(input_dir, output_dir, meta, force):
"""Generate Python package for model data, including meta and required
def package(cmd, input_dir, output_dir, meta=None, force=False):
"""
Generate Python package for model data, including meta and required
installation files. A new directory will be created in the specified
output directory, and model data will be copied over.
"""
@@ -42,7 +43,7 @@ def package(input_dir, output_dir, meta, force):
meta = util.read_json(meta_path)
else:
meta = generate_meta()
validate_meta(meta, ['lang', 'name', 'version'])
meta = validate_meta(meta, ['lang', 'name', 'version'])
model_name = meta['lang'] + '_' + meta['name']
model_name_v = model_name + '-' + meta['version']
@@ -85,20 +86,32 @@ def generate_meta():
('email', 'Author email', False),
('url', 'Author website', False),
('license', 'License', 'CC BY-NC 3.0')]
prints("Enter the package settings for your model.", title="Generating meta.json")
meta = {}
for setting, desc, default in settings:
response = util.get_raw_input(desc, default)
meta[setting] = default if response == '' and default else response
meta['pipeline'] = generate_pipeline()
return meta
def generate_pipeline():
prints("If set to 'True', the default pipeline is used. If set to 'False', "
"the pipeline will be disabled. Components should be specified as a "
"comma-separated list of component names, e.g. vectorizer, tagger, "
"parser, ner. For more information, see the docs on processing pipelines.",
title="Enter your model's pipeline components")
pipeline = util.get_raw_input("Pipeline components", True)
replace = {'True': True, 'False': False}
return replace[pipeline] if pipeline in replace else pipeline.split(', ')
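# For illustration only (not part of the commit): how responses to the
# pipeline prompt above are interpreted by the line before this comment:
#   'True'               -> True   (use the default pipeline)
#   'False'              -> False  (disable the pipeline)
#   'vectorizer, tagger' -> ['vectorizer', 'tagger']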
def validate_meta(meta, keys):
for key in keys:
if key not in meta or meta[key] == '':
prints("This setting is required to build your package.",
title='No "%s" setting found in meta.json' % key, exits=1)
return meta
def get_template(filepath):

View File

@@ -32,9 +32,11 @@ from .. import displacy
no_parser=("Don't train parser", "flag", "P", bool),
no_entities=("Don't train NER", "flag", "N", bool)
)
def train(_, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
use_gpu=False, no_tagger=False, no_parser=False, no_entities=False):
"""Train a model. Expects data in spaCy's JSON format."""
"""
Train a model. Expects data in spaCy's JSON format.
"""
n_sents = n_sents or None
output_path = util.ensure_path(output_dir)
train_path = util.ensure_path(train_data)
@@ -70,12 +72,12 @@ def train(_, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
optimizer = nlp.begin_training(lambda: corpus.train_tuples, use_gpu=use_gpu)
print("Itn.\tDep. Loss\tUAS\tNER P.\tNER R.\tNER F.\tTag %\tToken %")
print("Itn.\tLoss\tUAS\tNER P.\tNER R.\tNER F.\tTag %\tToken %")
try:
for i in range(n_iter):
with tqdm.tqdm(total=corpus.count_train(), leave=False) as pbar:
train_docs = corpus.train_docs(nlp, projectivize=True,
gold_preproc=False, max_length=1000)
gold_preproc=False, max_length=0)
losses = {}
for batch in minibatch(train_docs, size=batch_sizes):
docs, golds = zip(*batch)
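A standalone sketch, not part of this diff: batch_sizes is defined outside this hunk, but a compounding schedule like the one below (hypothetical start, stop and compound values) is the kind of value minibatch expects for size:

def compounding(start, stop, compound):
    # yield an infinite series of compounding values, capped at stop
    curr = float(start)
    while True:
        yield min(curr, stop)
        curr *= compound

batch_sizes = compounding(1., 32., 1.001)  # batches grow from 1 towards 32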

View File

@@ -1,8 +1,8 @@
# coding: utf8
from __future__ import unicode_literals
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, UNITS
from ..char_classes import ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS, QUOTES
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS
from ..char_classes import ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS, QUOTES, UNITS
_currency = r"\$|¢|£|€|¥|฿|৳"
@@ -10,16 +10,16 @@ _quotes = QUOTES.replace("'", '')
_list_punct = LIST_PUNCT + '। ॥'.strip().split()
_prefixes = ([r'\+'] + _list_punct + LIST_ELLIPSES + LIST_QUOTES)
_prefixes = ([r'\+'] + _list_punct + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS)
_suffixes = (_list_punct + LIST_ELLIPSES + LIST_QUOTES +
_suffixes = (_list_punct + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS +
[r'(?<=[0-9])\+',
r'(?<=°[FfCcKk])\.',
r'(?<=[0-9])(?:{})'.format(_currency),
r'(?<=[0-9])(?:{})'.format(UNITS),
r'(?<=[{}(?:{})])\.'.format('|'.join([ALPHA_LOWER, r'%²\-\)\]\+', QUOTES]), _currency)])
_infixes = (LIST_ELLIPSES +
_infixes = (LIST_ELLIPSES + LIST_ICONS +
[r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA),

View File

@@ -20,7 +20,6 @@ _upper = [_latin_upper]
_lower = [_latin_lower]
_uncased = [_bengali, _hebrew]
ALPHA = merge_char_classes(_upper + _lower + _uncased)
ALPHA_LOWER = merge_char_classes(_lower + _uncased)
ALPHA_UPPER = merge_char_classes(_upper + _uncased)
@@ -33,13 +32,14 @@ _currency = r'\$ £ € ¥ ฿ US\$ C\$ A\$'
_punct = r'… , : ; \! \? ¿ ¡ \( \) \[ \] \{ \} < > _ # \* &'
_quotes = r'\' \'\' " ” “ `` ` ´ , „ » «'
_hyphens = '- — -- ---'
_other_symbols = r'[\p{So}]'
UNITS = merge_chars(_units)
CURRENCY = merge_chars(_currency)
QUOTES = merge_chars(_quotes)
PUNCT = merge_chars(_punct)
HYPHENS = merge_chars(_hyphens)
ICONS = _other_symbols
LIST_UNITS = split_chars(_units)
LIST_CURRENCY = split_chars(_currency)
@@ -47,3 +47,4 @@ LIST_QUOTES = split_chars(_quotes)
LIST_PUNCT = split_chars(_punct)
LIST_HYPHENS = split_chars(_hyphens)
LIST_ELLIPSES = [r'\.\.+', '…']
LIST_ICONS = [_other_symbols]
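A standalone sketch, not part of this diff: the new ICONS class is built on \p{So}, the Unicode "Symbol, other" category covering emoji and pictographs, which needs a Unicode-property-aware engine such as the third-party regex module:

import regex

icons = regex.compile(r'[\p{So}]')
assert icons.search(u'🤘yay!')          # emoji fall under Symbol, other
assert icons.search(u'a + b') is None   # '+' is Symbol, math, not \p{So}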

View File

@@ -2,15 +2,16 @@
from __future__ import unicode_literals
from .char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY
from .char_classes import ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS, QUOTES
from .char_classes import CURRENCY, UNITS
from .char_classes import LIST_ICONS, ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS
from .char_classes import QUOTES, CURRENCY, UNITS
_prefixes = (['§', '%', '=', r'\+'] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES +
LIST_CURRENCY)
LIST_CURRENCY + LIST_ICONS)
_suffixes = (["'s", "'S", "s", "S"] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES +
_suffixes = (LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS +
["'s", "'S", "s", "S"] +
[r'(?<=[0-9])\+',
r'(?<=°[FfCcKk])\.',
r'(?<=[0-9])(?:{})'.format(CURRENCY),
@@ -19,7 +20,7 @@ _suffixes = (["'s", "'S", "s", "S"] + LIST_PUNCT + LIST_ELLIPSES + LIST_QU
r'(?<=[{a}][{a}])\.'.format(a=ALPHA_UPPER)])
_infixes = (LIST_ELLIPSES +
_infixes = (LIST_ELLIPSES + LIST_ICONS +
[r'(?<=[0-9])[+\-\*^](?=[0-9-])',
r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),

View File

@@ -212,18 +212,17 @@ class Language(object):
"""
tok2vec = self.pipeline[0]
feats = tok2vec.doc2feats(docs)
procs = list(self.pipeline[1:])
random.shuffle(procs)
grads = {}
def get_grads(W, dW, key=None):
grads[key] = (W, dW)
for proc in procs:
for proc in self.pipeline[1:]:
if not hasattr(proc, 'update'):
continue
tokvecses, bp_tokvecses = tok2vec.model.begin_update(feats, drop=drop)
d_tokvecses = proc.update((docs, tokvecses), golds,
drop=drop, sgd=get_grads, losses=losses)
bp_tokvecses(d_tokvecses, sgd=sgd)
if d_tokvecses is not None:
bp_tokvecses(d_tokvecses, sgd=sgd)
for key, (W, dW) in grads.items():
sgd(W, dW, key=key)
# Clear the tensor variable, to free GPU memory.

View File

@@ -432,6 +432,8 @@ cdef class Parser:
0.0)
todo = [(s, g) for (s, g) in zip(states, golds)
if not s.is_final() and g is not None]
if not todo:
return None
backprops = []
d_tokvecs = state2vec.ops.allocate(tokvecs.shape)

View File

@@ -1,7 +1,4 @@
# coding: utf-8
"""Test that tokenizer exceptions and emoticons are handled correctly."""
from __future__ import unicode_literals
import pytest
@@ -39,3 +36,12 @@ def test_tokenizer_handles_emoticons(tokenizer):
def test_tokenizer_excludes_false_pos_emoticons(tokenizer, text, length):
tokens = tokenizer(text)
assert len(tokens) == length
@pytest.mark.parametrize('text,length', [('can you still dunk?🍕🍔😵LOL', 8),
('i💙you', 3), ('🤘🤘yay!', 4)])
def test_tokenizer_handles_emoji(tokenizer, text, length):
exceptions = ["hu"]
tokens = tokenizer(text)
if tokens[0].lang_ not in exceptions:
assert len(tokens) == length
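A standalone sketch, not part of this diff, of the behaviour the new test asserts, checked directly against the English tokenizer (the exceptions list above guards languages such as Hungarian, whose rules differ):

from spacy.lang.en import English

nlp = English()  # tokenizer only, no model needed
tokens = nlp(u'i💙you')
assert [t.text for t in tokens] == [u'i', u'💙', u'you']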

View File

@@ -19,19 +19,17 @@ p
p
| When you load a model, spaCy first consults the model's
| #[+a("/docs/usage/saving-loading#models-generating") meta.json] for its
| #[code setup] details. This typically includes the ID of a language class,
| #[+a("/docs/usage/saving-loading#models-generating") meta.json]. The
| meta typically includes the model details, the ID of a language class,
| and an optional list of pipeline components. spaCy then does the
| following:
+aside-code("meta.json (excerpt)", "json").
{
"name": "example_model",
"lang": "en"
"description": "Example model for spaCy",
"setup": {
"lang": "en",
"pipeline": ["token_vectors", "tagger"]
}
"pipeline": ["token_vectors", "tagger"]
}
+list("numbers")
@@ -287,17 +285,15 @@ p
p
| In the model package's meta.json, specify the language class and pipeline
| IDs in #[code setup]:
| IDs:
+code("meta.json (excerpt)", "json").
{
"name": "my_sentiment_model",
"name": "sentiment_model",
"lang": "en",
"version": "1.0.0",
"spacy_version": "&gt;=2.0.0,&lt;3.0.0",
"setup": {
"lang": "en",
"pipeline": ["vectorizer", "sentiment"]
}
"pipeline": ["vectorizer", "sentiment"]
}
p
@@ -307,7 +303,7 @@ p
| by your custom #[code "sentiment"] factory.
+code.
nlp = spacy.load('my_sentiment_model')
nlp = spacy.load('en_sentiment_model')
doc = nlp(u'I love pizza')
assert doc.sentiment

View File

@@ -129,13 +129,14 @@ p
+code.
import spacy
from spacy.tokens.doc import Doc
from spacy.vocab import Vocab
nlp = spacy.load('en')
moby_dick = open('moby_dick.txt', 'r').read()
doc = nlp(moby_dick)
doc.to_disk('/moby_dick.bin')
new_doc = Doc().from_disk('/moby_dick.bin')
new_doc = Doc(Vocab()).from_disk('/moby_dick.bin')
+infobox
| #[strong Usage:] #[+a("/docs/usage/saving-loading") Saving and loading]
@@ -148,9 +149,14 @@ p
nlp = spacy.load('en')
matcher = Matcher(nlp.vocab)
# match "Google I/O" or "Google i/o"
pattern = [{'ORTH': 'Google'}, {'UPPER': 'I'}, {'ORTH': '/'}, {'UPPER': 'O'}]
matcher.add('GoogleIO', None, pattern)
def set_sentiment(matcher, doc, i, matches):
doc.sentiment += 0.1
pattern1 = [{'ORTH': 'Google'}, {'UPPER': 'I'}, {'ORTH': '/'}, {'UPPER': 'O'}]
pattern2 = [[{'ORTH': emoji, 'OP': '+'}] for emoji in ['😀', '😂', '🤣', '😍']]
matcher.add('GoogleIO', None, pattern1) # match "Google I/O" or "Google i/o"
matcher.add('HAPPY', set_sentiment, pattern2) # match one or more happy emoji
doc = nlp(LOTS_OF_TEXT)
matches = matcher(doc)
+infobox

View File

@@ -11,7 +11,7 @@ p
| You can also associate patterns with entity IDs, to allow some basic
| entity linking or disambiguation.
+aside("What about \"real\" regular expressions?")
//-+aside("What about \"real\" regular expressions?")
+h(2, "adding-patterns") Adding patterns
@@ -119,7 +119,7 @@ p
+code.
# Add a new custom flag to the vocab, which is always False by default.
# BAD_HTML_FLAG will be the flag ID, which we can use to set it to True on the span.
BAD_HTML_FLAG = doc.vocab.add_flag(lambda text: False)
BAD_HTML_FLAG = nlp.vocab.add_flag(lambda text: False)
def merge_and_flag(matcher, doc, i, matches):
match_id, start, end = matches[i]
@@ -221,7 +221,7 @@ p
+cell match 0 or 1 times
+cell optional, max one
+h(3, "quantifiers-example1") Quantifiers example: Using linguistic annotations
+h(2, "example1") Example: Using linguistic annotations
p
| Let's say you're analysing user comments and you want to find out what
@@ -283,7 +283,7 @@ p
# set manual=True to make displaCy render straight from a dictionary
displacy.serve(matched_sents, style='ent', manual=True)
+h(3, "quantifiers-example2") Quantifiers example: Phone numbers
+h(2, "example2") Example: Phone numbers
p
| Phone numbers can have many different formats and matching them is often
@@ -320,3 +320,114 @@ p
| It'll produce more predictable results, is much easier to modify and
| extend, and doesn't require any training data, only a set of
| test cases.
+h(2, "example3") Example: Hashtags and emoji on social media
p
| Social media posts, especially tweets, can be difficult to work with.
| They're very short and often contain various emoji and hashtags. By only
| looking at the plain text, you'll lose a lot of valuable semantic
| information.
p
| Let's say you've extracted a large sample of social media posts on a
| specific topic, for example posts mentioning a brand name or product.
| As the first step of your data exploration, you want to filter out posts
| containing certain emoji and use them to assign a general sentiment
| score, based on whether the expressed emotion is positive or negative,
| e.g. #[span.o-icon.o-icon--inline 😀] or #[span.o-icon.o-icon--inline 😞].
| You also want to find, merge and label hashtags like
| #[code #MondayMotivation], to be able to ignore or analyse them later.
+aside("Note on sentiment analysis")
| Ultimately, sentiment analysis is not always #[em that] easy. In
| addition to the emoji, you'll also want to take specific words into
| account and check the #[code subtree] for intensifiers like "very", to
| increase the sentiment score. At some point, you might also want to train
| a sentiment model. However, the approach described in this example is
| very useful for #[strong bootstrapping rules to gather training data].
| It's also an incredibly fast way to gather first insights into your data:
| with about 1 million tweets, you'd be looking at a processing time of
| #[strong under 1 minute].
p
| By default, spaCy's tokenizer will split emoji into separate tokens. This
| means that you can create a pattern for one or more emoji tokens. In this
| case, a sequence of identical emoji should be treated as one instance.
| Valid hashtags usually consist of a #[code #], plus a sequence of
| ASCII characters with no whitespace, making them easy to match as well.
+code.
from spacy.lang.en import English
from spacy.matcher import Matcher
nlp = English() # we only want the tokenizer, so no need to load a model
matcher = Matcher(nlp.vocab)
pos_emoji = [u'😀', u'😃', u'😂', u'🤣', u'😊', u'😍'] # positive emoji
neg_emoji = [u'😞', u'😠', u'😩', u'😢', u'😭', u'😒'] # negative emoji
# add patterns to match one or more emoji tokens
pos_patterns = [[{'ORTH': emoji, 'OP': '+'}] for emoji in pos_emoji]
neg_patterns = [[{'ORTH': emoji, 'OP': '+'}] for emoji in neg_emoji]
matcher.add('HAPPY', label_sentiment, *pos_patterns) # add positive pattern
matcher.add('SAD', label_sentiment, *neg_patterns) # add negative pattern
# add pattern to merge valid hashtag, i.e. '#' plus any ASCII token
matcher.add('HASHTAG', merge_hashtag, [{'ORTH': '#'}, {'IS_ASCII': True}])
p
| Because the #[code on_match] callback receives the ID of each match, you
| can use the same function to handle the sentiment assignment for both
| the positive and negative pattern. To keep it simple, we'll either add
| or subtract #[code 0.1] points; this way, the score will also reflect
| combinations of emoji, even positive #[em and] negative ones.
p
| With a library like
| #[+a("https://github.com/bcongdon/python-emojipedia") Emojipedia],
| we can also retrieve a short description for each emoji; for example,
| #[span.o-icon.o-icon--inline 😍]'s official title is "Smiling Face With
| Heart-Eyes". Assigning it to the merged token's norm will make it
| available as #[code token.norm_].
+code.
from emojipedia import Emojipedia # installation: pip install emojipedia
def label_sentiment(matcher, doc, i, matches):
match_id, start, end = matches[i]
if match_id == 'HAPPY':
doc.sentiment += 0.1 # add 0.1 for positive sentiment
elif match_id == 'SAD':
doc.sentiment -= 0.1 # subtract 0.1 for negative sentiment
span = doc[start : end]
emoji = Emojipedia.search(span[0].text) # get data for emoji
span.merge(norm=emoji.title) # merge span and set NORM to emoji title
p
| To label the hashtags, we first need to add a new custom flag.
| #[code IS_HASHTAG] will be the flag's ID, which you can use to assign it
| to the hashtag's span, and check its value via a token's
| #[+api("token#check_flag") #[code code check_flag()]] method. On each
| match, we merge the hashtag and assign the flag.
+code.
# Add a new custom flag to the vocab, which is always False by default
IS_HASHTAG = nlp.vocab.add_flag(lambda text: False)
def merge_hashtag(matcher, doc, i, matches):
match_id, start, end = matches[i]
span = doc[start : end]
span.merge() # merge hashtag
span.set_flag(IS_HASHTAG, True) # set IS_HASHTAG to True
p
| To process a stream of social media posts, we can use
| #[+api("language#pipe") #[code Language.pipe()]], which will return a
| stream of #[code Doc] objects that we can pass to
| #[+api("matcher#pipe") #[code Matcher.pipe()]].
+code.
docs = nlp.pipe(LOTS_OF_TWEETS)
matches = matcher.pipe(docs)

View File

@@ -74,16 +74,14 @@ p
+aside-code("meta.json", "json").
{
"name": "example_model",
"lang": "en",
"version": "1.0.0",
"spacy_version": "&gt;=2.0.0,&lt;3.0.0",
"description": "Example model for spaCy",
"author": "You",
"email": "you@example.com",
"license": "CC BY-SA 3.0",
"setup": {
"lang": "en",
"pipeline": ["token_vectors", "tagger"]
}
"pipeline": ["token_vectors", "tagger"]
}
+code(false, "bash").
@@ -110,9 +108,9 @@ p
+h(3, "models-custom") Customising the model setup
p
| The meta.json includes a #[code setup] key that lets you customise how
| the model should be initialised and loaded. You can define the language
| data to be loaded and the
| The meta.json includes the model details, like name, requirements and
| license, and lets you customise how the model should be initialised and
| loaded. You can define the language data to be loaded and the
| #[+a("/docs/usage/language-processing-pipeline") processing pipeline] to
| execute.
@@ -183,9 +181,9 @@ p
p
| To load a model from a data directory, you can use
| #[+api("spacy#load") #[code spacy.load()]] with the local path. This will
| look for a meta.json in the directory and use the #[code setup] details
| to initialise a #[code Language] class with a processing pipeline and
| load in the model data.
| look for a meta.json in the directory and use the #[code lang] and
| #[code pipeline] settings to initialise a #[code Language] class with a
| processing pipeline and load in the model data.
+code.
nlp = spacy.load('/path/to/model')