mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 09:14:32 +03:00
Merge branch 'develop' of https://github.com/explosion/spaCy into develop
This commit is contained in:
commit
49235017bf
|
@ -4,7 +4,7 @@ from __future__ import unicode_literals
|
|||
import importlib
|
||||
|
||||
from .compat import basestring_
|
||||
from .cli.info import info
|
||||
from .cli.info import info as cli_info
|
||||
from .glossary import explain
|
||||
from .deprecated import resolve_load_name
|
||||
from . import util
|
||||
|
@ -20,3 +20,7 @@ def load(name, **overrides):
|
|||
overrides['meta'] = meta
|
||||
overrides['path'] = model_path
|
||||
return cls(**overrides)
|
||||
|
||||
|
||||
def info(model=None, markdown=False):
    """Package-level shortcut for the CLI ``info`` command.

    Both arguments are handed to ``cli_info`` unchanged and its return value
    is passed back to the caller.
    """
    # The first positional slot of cli_info is filled with None here; the
    # remaining two are forwarded in order.
    cli_result = cli_info(None, model, markdown)
    return cli_result
|
||||
|
|
|
@ -24,8 +24,9 @@ CONVERTERS = {
|
|||
n_sents=("Number of sentences per doc", "option", "n", float),
|
||||
morphology=("Enable appending morphology to tags", "flag", "m", bool)
|
||||
)
|
||||
def convert(_, input_file, output_dir, n_sents, morphology):
|
||||
"""Convert files into JSON format for use with train command and other
|
||||
def convert(cmd, input_file, output_dir, n_sents, morphology):
|
||||
"""
|
||||
Convert files into JSON format for use with train command and other
|
||||
experiment management functions.
|
||||
"""
|
||||
input_path = Path(input_file)
|
||||
|
|
|
@ -17,8 +17,9 @@ from .. import about
|
|||
direct=("force direct download. Needs model name with version and won't "
|
||||
"perform compatibility check", "flag", "d", bool)
|
||||
)
|
||||
def download(model, direct=False):
|
||||
"""Download compatible model from default download path using pip. Model
|
||||
def download(cmd, model, direct=False):
|
||||
"""
|
||||
Download compatible model from default download path using pip. Model
|
||||
can be shortcut, model name or, if --direct flag is set, full model name
|
||||
with version.
|
||||
"""
|
||||
|
@ -31,7 +32,7 @@ def download(model, direct=False):
|
|||
version = get_version(model_name, compatibility)
|
||||
download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model_name, v=version))
|
||||
try:
|
||||
link(model_name, model, force=True)
|
||||
link(None, model_name, model, force=True)
|
||||
except:
|
||||
# Dirty, but since spacy.download and the auto-linking is mostly
|
||||
# a convenience wrapper, it's best to show a success message and
|
||||
|
|
|
@ -14,7 +14,7 @@ from .. import util
|
|||
model=("optional: shortcut link of model", "positional", None, str),
|
||||
markdown=("generate Markdown for GitHub issues", "flag", "md", str)
|
||||
)
|
||||
def info(model=None, markdown=False):
|
||||
def info(cmd, model=None, markdown=False):
|
||||
"""Print info about spaCy installation. If a model shortcut link is
|
||||
specified as an argument, print model information. Flag --markdown
|
||||
prints details in Markdown for easy copy-pasting to GitHub issues.
|
||||
|
|
|
@ -14,8 +14,9 @@ from .. import util
|
|||
link_name=("name of shortuct link to create", "positional", None, str),
|
||||
force=("force overwriting of existing link", "flag", "f", bool)
|
||||
)
|
||||
def link(origin, link_name, force=False):
|
||||
"""Create a symlink for models within the spacy/data directory. Accepts
|
||||
def link(cmd, origin, link_name, force=False):
|
||||
"""
|
||||
Create a symlink for models within the spacy/data directory. Accepts
|
||||
either the name of a pip package, or the local path to the model data
|
||||
directory. Linking models allows loading them via spacy.load(link_name).
|
||||
"""
|
||||
|
|
|
@ -18,8 +18,9 @@ from .. import about
|
|||
meta=("path to meta.json", "option", "m", str),
|
||||
force=("force overwriting of existing folder in output directory", "flag", "f", bool)
|
||||
)
|
||||
def package(input_dir, output_dir, meta, force):
|
||||
"""Generate Python package for model data, including meta and required
|
||||
def package(cmd, input_dir, output_dir, meta=None, force=False):
|
||||
"""
|
||||
Generate Python package for model data, including meta and required
|
||||
installation files. A new directory will be created in the specified
|
||||
output directory, and model data will be copied over.
|
||||
"""
|
||||
|
@ -42,7 +43,7 @@ def package(input_dir, output_dir, meta, force):
|
|||
meta = util.read_json(meta_path)
|
||||
else:
|
||||
meta = generate_meta()
|
||||
validate_meta(meta, ['lang', 'name', 'version'])
|
||||
meta = validate_meta(meta, ['lang', 'name', 'version'])
|
||||
|
||||
model_name = meta['lang'] + '_' + meta['name']
|
||||
model_name_v = model_name + '-' + meta['version']
|
||||
|
@ -85,20 +86,32 @@ def generate_meta():
|
|||
('email', 'Author email', False),
|
||||
('url', 'Author website', False),
|
||||
('license', 'License', 'CC BY-NC 3.0')]
|
||||
|
||||
prints("Enter the package settings for your model.", title="Generating meta.json")
|
||||
meta = {}
|
||||
for setting, desc, default in settings:
|
||||
response = util.get_raw_input(desc, default)
|
||||
meta[setting] = default if response == '' and default else response
|
||||
meta['pipeline'] = generate_pipeline()
|
||||
return meta
|
||||
|
||||
|
||||
def generate_pipeline():
    """Ask the user for the model's pipeline setting and parse the answer.

    Returns True or False when the answer is the literal text 'True' or
    'False', True when no answer is given (the default shown in the prompt),
    and otherwise a list of the comma-separated component names.
    """
    prints("If set to 'True', the default pipeline is used. If set to 'False', "
           "the pipeline will be disabled. Components should be specified as a "
           "comma-separated list of component names, e.g. vectorizer, tagger, "
           "parser, ner. For more information, see the docs on processing pipelines.",
           title="Enter your model's pipeline components")
    response = util.get_raw_input("Pipeline components", True)
    # An empty answer means the user accepted the default. Without this guard
    # the empty string would be split into [''] instead of yielding True.
    # NOTE(review): assumes get_raw_input returns '' (or the default itself)
    # when nothing is entered -- confirm against util.get_raw_input.
    if response is True or response == '':
        return True
    literals = {'True': True, 'False': False}
    if response in literals:
        return literals[response]
    # Split on the bare comma and trim each name so that "tagger,parser" and
    # "tagger, parser" parse identically; blank entries are dropped.
    return [name.strip() for name in response.split(',') if name.strip()]
|
||||
|
||||
|
||||
def validate_meta(meta, keys):
    """Check that every key in `keys` is present in `meta` with a non-empty value.

    For each key that is missing or set to '', an error is reported through
    `prints` with exits=1. The `meta` object itself is returned.
    """
    for required in keys:
        has_value = required in meta and meta[required] != ''
        if has_value:
            continue
        prints("This setting is required to build your package.",
               title='No "%s" setting found in meta.json' % required, exits=1)
    return meta
|
||||
|
||||
|
||||
def get_template(filepath):
|
||||
|
|
|
@ -32,9 +32,11 @@ from .. import displacy
|
|||
no_parser=("Don't train parser", "flag", "P", bool),
|
||||
no_entities=("Don't train NER", "flag", "N", bool)
|
||||
)
|
||||
def train(_, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
|
||||
def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
|
||||
use_gpu=False, no_tagger=False, no_parser=False, no_entities=False):
|
||||
"""Train a model. Expects data in spaCy's JSON format."""
|
||||
"""
|
||||
Train a model. Expects data in spaCy's JSON format.
|
||||
"""
|
||||
n_sents = n_sents or None
|
||||
output_path = util.ensure_path(output_dir)
|
||||
train_path = util.ensure_path(train_data)
|
||||
|
@ -70,12 +72,12 @@ def train(_, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
|
|||
|
||||
optimizer = nlp.begin_training(lambda: corpus.train_tuples, use_gpu=use_gpu)
|
||||
|
||||
print("Itn.\tDep. Loss\tUAS\tNER P.\tNER R.\tNER F.\tTag %\tToken %")
|
||||
print("Itn.\tLoss\tUAS\tNER P.\tNER R.\tNER F.\tTag %\tToken %")
|
||||
try:
|
||||
for i in range(n_iter):
|
||||
with tqdm.tqdm(total=corpus.count_train(), leave=False) as pbar:
|
||||
train_docs = corpus.train_docs(nlp, projectivize=True,
|
||||
gold_preproc=False, max_length=1000)
|
||||
gold_preproc=False, max_length=0)
|
||||
losses = {}
|
||||
for batch in minibatch(train_docs, size=batch_sizes):
|
||||
docs, golds = zip(*batch)
|
||||
|
|
|
@ -1,8 +1,8 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, UNITS
|
||||
from ..char_classes import ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS, QUOTES
|
||||
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS
|
||||
from ..char_classes import ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS, QUOTES, UNITS
|
||||
|
||||
|
||||
_currency = r"\$|¢|£|€|¥|฿|৳"
|
||||
|
@ -10,16 +10,16 @@ _quotes = QUOTES.replace("'", '')
|
|||
_list_punct = LIST_PUNCT + '। ॥'.strip().split()
|
||||
|
||||
|
||||
_prefixes = ([r'\+'] + _list_punct + LIST_ELLIPSES + LIST_QUOTES)
|
||||
_prefixes = ([r'\+'] + _list_punct + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS)
|
||||
|
||||
_suffixes = (_list_punct + LIST_ELLIPSES + LIST_QUOTES +
|
||||
_suffixes = (_list_punct + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS +
|
||||
[r'(?<=[0-9])\+',
|
||||
r'(?<=°[FfCcKk])\.',
|
||||
r'(?<=[0-9])(?:{})'.format(_currency),
|
||||
r'(?<=[0-9])(?:{})'.format(UNITS),
|
||||
r'(?<=[{}(?:{})])\.'.format('|'.join([ALPHA_LOWER, r'%²\-\)\]\+', QUOTES]), _currency)])
|
||||
|
||||
_infixes = (LIST_ELLIPSES +
|
||||
_infixes = (LIST_ELLIPSES + LIST_ICONS +
|
||||
[r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
|
||||
r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
|
||||
r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA),
|
||||
|
|
|
@ -20,7 +20,6 @@ _upper = [_latin_upper]
|
|||
_lower = [_latin_lower]
|
||||
_uncased = [_bengali, _hebrew]
|
||||
|
||||
|
||||
ALPHA = merge_char_classes(_upper + _lower + _uncased)
|
||||
ALPHA_LOWER = merge_char_classes(_lower + _uncased)
|
||||
ALPHA_UPPER = merge_char_classes(_upper + _uncased)
|
||||
|
@ -33,13 +32,14 @@ _currency = r'\$ £ € ¥ ฿ US\$ C\$ A\$'
|
|||
_punct = r'… , : ; \! \? ¿ ¡ \( \) \[ \] \{ \} < > _ # \* &'
|
||||
_quotes = r'\' \'\' " ” “ `` ` ‘ ´ ‚ , „ » «'
|
||||
_hyphens = '- – — -- ---'
|
||||
|
||||
_other_symbols = r'[\p{So}]'
|
||||
|
||||
UNITS = merge_chars(_units)
|
||||
CURRENCY = merge_chars(_currency)
|
||||
QUOTES = merge_chars(_quotes)
|
||||
PUNCT = merge_chars(_punct)
|
||||
HYPHENS = merge_chars(_hyphens)
|
||||
ICONS = _other_symbols
|
||||
|
||||
LIST_UNITS = split_chars(_units)
|
||||
LIST_CURRENCY = split_chars(_currency)
|
||||
|
@ -47,3 +47,4 @@ LIST_QUOTES = split_chars(_quotes)
|
|||
LIST_PUNCT = split_chars(_punct)
|
||||
LIST_HYPHENS = split_chars(_hyphens)
|
||||
LIST_ELLIPSES = [r'\.\.+', '…']
|
||||
LIST_ICONS = [_other_symbols]
|
||||
|
|
|
@ -2,15 +2,16 @@
|
|||
from __future__ import unicode_literals
|
||||
|
||||
from .char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY
|
||||
from .char_classes import ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS, QUOTES
|
||||
from .char_classes import CURRENCY, UNITS
|
||||
from .char_classes import LIST_ICONS, ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS
|
||||
from .char_classes import QUOTES, CURRENCY, UNITS
|
||||
|
||||
|
||||
_prefixes = (['§', '%', '=', r'\+'] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES +
|
||||
LIST_CURRENCY)
|
||||
LIST_CURRENCY + LIST_ICONS)
|
||||
|
||||
|
||||
_suffixes = (["'s", "'S", "’s", "’S"] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES +
|
||||
_suffixes = (LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS +
|
||||
["'s", "'S", "’s", "’S"] +
|
||||
[r'(?<=[0-9])\+',
|
||||
r'(?<=°[FfCcKk])\.',
|
||||
r'(?<=[0-9])(?:{})'.format(CURRENCY),
|
||||
|
@ -19,7 +20,7 @@ _suffixes = (["'s", "'S", "’s", "’S"] + LIST_PUNCT + LIST_ELLIPSES + LIST_QU
|
|||
r'(?<=[{a}][{a}])\.'.format(a=ALPHA_UPPER)])
|
||||
|
||||
|
||||
_infixes = (LIST_ELLIPSES +
|
||||
_infixes = (LIST_ELLIPSES + LIST_ICONS +
|
||||
[r'(?<=[0-9])[+\-\*^](?=[0-9-])',
|
||||
r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
|
||||
r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
|
||||
|
|
|
@ -212,18 +212,17 @@ class Language(object):
|
|||
"""
|
||||
tok2vec = self.pipeline[0]
|
||||
feats = tok2vec.doc2feats(docs)
|
||||
procs = list(self.pipeline[1:])
|
||||
random.shuffle(procs)
|
||||
grads = {}
|
||||
def get_grads(W, dW, key=None):
|
||||
grads[key] = (W, dW)
|
||||
for proc in procs:
|
||||
for proc in self.pipeline[1:]:
|
||||
if not hasattr(proc, 'update'):
|
||||
continue
|
||||
tokvecses, bp_tokvecses = tok2vec.model.begin_update(feats, drop=drop)
|
||||
d_tokvecses = proc.update((docs, tokvecses), golds,
|
||||
drop=drop, sgd=get_grads, losses=losses)
|
||||
bp_tokvecses(d_tokvecses, sgd=sgd)
|
||||
if d_tokvecses is not None:
|
||||
bp_tokvecses(d_tokvecses, sgd=sgd)
|
||||
for key, (W, dW) in grads.items():
|
||||
sgd(W, dW, key=key)
|
||||
# Clear the tensor variable, to free GPU memory.
|
||||
|
|
|
@ -432,6 +432,8 @@ cdef class Parser:
|
|||
0.0)
|
||||
todo = [(s, g) for (s, g) in zip(states, golds)
|
||||
if not s.is_final() and g is not None]
|
||||
if not todo:
|
||||
return None
|
||||
|
||||
backprops = []
|
||||
d_tokvecs = state2vec.ops.allocate(tokvecs.shape)
|
||||
|
|
|
@ -1,7 +1,4 @@
|
|||
# coding: utf-8
|
||||
"""Test that tokenizer exceptions and emoticons are handled correctly."""
|
||||
|
||||
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
|
@ -39,3 +36,12 @@ def test_tokenizer_handles_emoticons(tokenizer):
|
|||
def test_tokenizer_excludes_false_pos_emoticons(tokenizer, text, length):
|
||||
tokens = tokenizer(text)
|
||||
assert len(tokens) == length
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text,length', [
    ('can you still dunk?🍕🍔😵LOL', 8),
    ('i💙you', 3),
    ('🤘🤘yay!', 4),
])
def test_tokenizer_handles_emoji(tokenizer, text, length):
    """Check the token count for texts that contain emoji.

    The count is not asserted when the first token's language is listed in
    `skipped_langs`.
    """
    skipped_langs = ["hu"]
    tokens = tokenizer(text)
    if tokens[0].lang_ in skipped_langs:
        return
    assert len(tokens) == length
|
||||
|
|
|
@ -19,19 +19,17 @@ p
|
|||
|
||||
p
|
||||
| When you load a model, spaCy first consults the model's
|
||||
| #[+a("/docs/usage/saving-loading#models-generating") meta.json] for its
|
||||
| #[code setup] details. This typically includes the ID of a language class,
|
||||
| #[+a("/docs/usage/saving-loading#models-generating") meta.json]. The
|
||||
| meta typically includes the model details, the ID of a language class,
|
||||
| and an optional list of pipeline components. spaCy then does the
|
||||
| following:
|
||||
|
||||
+aside-code("meta.json (excerpt)", "json").
|
||||
{
|
||||
"name": "example_model",
|
||||
"lang": "en",
|
||||
"description": "Example model for spaCy",
|
||||
"setup": {
|
||||
"lang": "en",
|
||||
"pipeline": ["token_vectors", "tagger"]
|
||||
}
|
||||
"pipeline": ["token_vectors", "tagger"]
|
||||
}
|
||||
|
||||
+list("numbers")
|
||||
|
@ -287,17 +285,15 @@ p
|
|||
|
||||
p
|
||||
| In the model package's meta.json, specify the language class and pipeline
|
||||
| IDs in #[code setup]:
|
||||
| IDs:
|
||||
|
||||
+code("meta.json (excerpt)", "json").
|
||||
{
|
||||
"name": "my_sentiment_model",
|
||||
"name": "sentiment_model",
|
||||
"lang": "en",
|
||||
"version": "1.0.0",
|
||||
"spacy_version": ">=2.0.0,<3.0.0",
|
||||
"setup": {
|
||||
"lang": "en",
|
||||
"pipeline": ["vectorizer", "sentiment"]
|
||||
}
|
||||
"pipeline": ["vectorizer", "sentiment"]
|
||||
}
|
||||
|
||||
p
|
||||
|
@ -307,7 +303,7 @@ p
|
|||
| by your custom #[code "sentiment"] factory.
|
||||
|
||||
+code.
|
||||
nlp = spacy.load('my_sentiment_model')
|
||||
nlp = spacy.load('en_sentiment_model')
|
||||
doc = nlp(u'I love pizza')
|
||||
assert doc.sentiment
|
||||
|
||||
|
|
|
@ -129,13 +129,14 @@ p
|
|||
+code.
|
||||
import spacy
|
||||
from spacy.tokens.doc import Doc
|
||||
from spacy.vocab import Vocab
|
||||
|
||||
nlp = spacy.load('en')
|
||||
moby_dick = open('moby_dick.txt', 'r').read()
|
||||
doc = nlp(moby_dick)
|
||||
doc.to_disk('/moby_dick.bin')
|
||||
|
||||
new_doc = Doc().from_disk('/moby_dick.bin')
|
||||
new_doc = Doc(Vocab()).from_disk('/moby_dick.bin')
|
||||
|
||||
+infobox
|
||||
| #[strong Usage:] #[+a("/docs/usage/saving-loading") Saving and loading]
|
||||
|
@ -148,9 +149,14 @@ p
|
|||
|
||||
nlp = spacy.load('en')
|
||||
matcher = Matcher(nlp.vocab)
|
||||
# match "Google I/O" or "Google i/o"
|
||||
pattern = [{'ORTH': 'Google'}, {'UPPER': 'I'}, {'ORTH': '/'}, {'UPPER': 'O'}]
|
||||
matcher.add('GoogleIO', None, pattern)
|
||||
|
||||
def set_sentiment(matcher, doc, i, matches):
    """on_match callback: raise the doc's sentiment score by 0.1."""
    doc.sentiment = doc.sentiment + 0.1
|
||||
|
||||
pattern1 = [{'ORTH': 'Google'}, {'UPPER': 'I'}, {'ORTH': '/'}, {'UPPER': 'O'}]
|
||||
pattern2 = [[{'ORTH': emoji, 'OP': '+'}] for emoji in ['😀', '😂', '🤣', '😍']]
|
||||
matcher.add('GoogleIO', None, pattern1) # match "Google I/O" or "Google i/o"
|
||||
matcher.add('HAPPY', set_sentiment, *pattern2) # match one or more happy emoji
|
||||
matches = nlp(LOTS_OF_TEXT)
|
||||
|
||||
+infobox
|
||||
|
|
|
@ -11,7 +11,7 @@ p
|
|||
| You can also associate patterns with entity IDs, to allow some basic
|
||||
| entity linking or disambiguation.
|
||||
|
||||
+aside("What about \"real\" regular expressions?")
|
||||
//-+aside("What about \"real\" regular expressions?")
|
||||
|
||||
+h(2, "adding-patterns") Adding patterns
|
||||
|
||||
|
@ -119,7 +119,7 @@ p
|
|||
+code.
|
||||
# Add a new custom flag to the vocab, which is always False by default.
|
||||
# BAD_HTML_FLAG will be the flag ID, which we can use to set it to True on the span.
|
||||
BAD_HTML_FLAG = doc.vocab.add_flag(lambda text: False)
|
||||
BAD_HTML_FLAG = nlp.vocab.add_flag(lambda text: False)
|
||||
|
||||
def merge_and_flag(matcher, doc, i, matches):
|
||||
match_id, start, end = matches[i]
|
||||
|
@ -221,7 +221,7 @@ p
|
|||
+cell match 0 or 1 times
|
||||
+cell optional, max one
|
||||
|
||||
+h(3, "quantifiers-example1") Quantifiers example: Using linguistic annotations
|
||||
+h(2, "example1") Example: Using linguistic annotations
|
||||
|
||||
p
|
||||
| Let's say you're analysing user comments and you want to find out what
|
||||
|
@ -283,7 +283,7 @@ p
|
|||
# set manual=True to make displaCy render straight from a dictionary
|
||||
displacy.serve(matched_sents, style='ent', manual=True)
|
||||
|
||||
+h(3, "quantifiers-example2") Quantifiers example: Phone numbers
|
||||
+h(2, "example2") Example: Phone numbers
|
||||
|
||||
p
|
||||
| Phone numbers can have many different formats and matching them is often
|
||||
|
@ -320,3 +320,114 @@ p
|
|||
| It'll produce more predictable results, is much easier to modify and
|
||||
| extend, and doesn't require any training data – only a set of
|
||||
| test cases.
|
||||
|
||||
+h(2, "example3") Example: Hashtags and emoji on social media
|
||||
|
||||
p
|
||||
| Social media posts, especially tweets, can be difficult to work with.
|
||||
| They're very short and often contain various emoji and hashtags. By only
|
||||
| looking at the plain text, you'll lose a lot of valuable semantic
|
||||
| information.
|
||||
|
||||
p
|
||||
| Let's say you've extracted a large sample of social media posts on a
|
||||
| specific topic, for example posts mentioning a brand name or product.
|
||||
| As the first step of your data exploration, you want to filter out posts
|
||||
| containing certain emoji and use them to assign a general sentiment
|
||||
| score, based on whether the expressed emotion is positive or negative,
|
||||
| e.g. #[span.o-icon.o-icon--inline 😀] or #[span.o-icon.o-icon--inline 😞].
|
||||
| You also want to find, merge and label hashtags like
|
||||
| #[code #MondayMotivation], to be able to ignore or analyse them later.
|
||||
|
||||
+aside("Note on sentiment analysis")
|
||||
| Ultimately, sentiment analysis is not always #[em that] easy. In
|
||||
| addition to the emoji, you'll also want to take specific words into
|
||||
| account and check the #[code subtree] for intensifiers like "very", to
|
||||
| increase the sentiment score. At some point, you might also want to train
|
||||
| a sentiment model. However, the approach described in this example is
|
||||
| very useful for #[strong bootstrapping rules to gather training data].
|
||||
| It's also an incredibly fast way to gather first insights into your data
|
||||
| – with about 1 million tweets, you'd be looking at a processing time of
|
||||
| #[strong under 1 minute].
|
||||
|
||||
p
|
||||
| By default, spaCy's tokenizer will split emoji into separate tokens. This
|
||||
| means that you can create a pattern for one or more emoji tokens. In this
|
||||
| case, a sequence of identical emoji should be treated as one instance.
|
||||
| Valid hashtags usually consist of a #[code #], plus a sequence of
|
||||
| ASCII characters with no whitespace, making them easy to match as well.
|
||||
|
||||
+code.
|
||||
from spacy.lang.en import English
|
||||
from spacy.matcher import Matcher
|
||||
|
||||
nlp = English() # we only want the tokenizer, so no need to load a model
|
||||
matcher = Matcher(nlp.vocab)
|
||||
|
||||
pos_emoji = [u'😀', u'😃', u'😂', u'🤣', u'😊', u'😍'] # positive emoji
|
||||
neg_emoji = [u'😞', u'😠', u'😩', u'😢', u'😭', u'😒'] # negative emoji
|
||||
|
||||
# add patterns to match one or more emoji tokens
|
||||
pos_patterns = [[{'ORTH': emoji, 'OP': '+'}] for emoji in pos_emoji]
|
||||
neg_patterns = [[{'ORTH': emoji, 'OP': '+'}] for emoji in neg_emoji]
|
||||
|
||||
matcher.add('HAPPY', label_sentiment, *pos_patterns) # add positive pattern
|
||||
matcher.add('SAD', label_sentiment, *neg_patterns) # add negative pattern
|
||||
|
||||
# add pattern to merge valid hashtag, i.e. '#' plus any ASCII token
|
||||
matcher.add('HASHTAG', merge_hashtag, [{'ORTH': '#'}, {'IS_ASCII': True}])
|
||||
|
||||
p
|
||||
| Because the #[code on_match] callback receives the ID of each match, you
|
||||
| can use the same function to handle the sentiment assignment for both
|
||||
| the positive and negative pattern. To keep it simple, we'll either add
|
||||
| or subtract #[code 0.1] points – this way, the score will also reflect
|
||||
| combinations of emoji, even positive #[em and] negative ones.
|
||||
|
||||
p
|
||||
| With a library like
|
||||
| #[+a("https://github.com/bcongdon/python-emojipedia") Emojipedia],
|
||||
| we can also retrieve a short description for each emoji – for example,
|
||||
| #[span.o-icon.o-icon--inline 😍]'s official title is "Smiling Face With
|
||||
| Heart-Eyes". Assigning it to the merged token's norm will make it
|
||||
| available as #[code token.norm_].
|
||||
|
||||
+code.
|
||||
from emojipedia import Emojipedia # installation: pip install emojipedia
|
||||
|
||||
def label_sentiment(matcher, doc, i, matches):
    """on_match callback: score an emoji match and merge it into one token.

    Adds 0.1 to doc.sentiment for a 'HAPPY' match and subtracts 0.1 for a
    'SAD' match, then merges the matched span and sets its norm to the
    title that Emojipedia returns for the span's first token.
    """
    match_id, start, end = matches[i]
    # Compare by value: `is` tests object identity, which is not a reliable
    # way to compare strings.
    # NOTE(review): if the matcher reports hashed IDs rather than the string
    # key, resolve it first via doc.vocab.strings[match_id] -- confirm.
    if match_id == 'HAPPY':
        doc.sentiment += 0.1  # add 0.1 for positive sentiment
    elif match_id == 'SAD':
        doc.sentiment -= 0.1  # subtract 0.1 for negative sentiment
    span = doc[start : end]
    emoji = Emojipedia.search(span[0].text)  # get data for emoji
    span.merge(norm=emoji.title)  # merge span and set NORM to emoji title
|
||||
|
||||
p
|
||||
| To label the hashtags, we first need to add a new custom flag.
|
||||
| #[code IS_HASHTAG] will be the flag's ID, which you can use to assign it
|
||||
| to the hashtag's span, and check its value via a token's
|
||||
| #[+api("token#check_flag") #[code check_flag()]] method. On each
|
||||
| match, we merge the hashtag and assign the flag.
|
||||
|
||||
+code.
|
||||
# Add a new custom flag to the vocab, which is always False by default
|
||||
IS_HASHTAG = nlp.vocab.add_flag(lambda text: False)
|
||||
|
||||
def merge_hashtag(matcher, doc, i, matches):
    """on_match callback: merge the matched hashtag span and flag it.

    The span covered by match `i` is merged, then IS_HASHTAG is set to True
    on it.
    """
    _, first, last = matches[i]
    hashtag = doc[first:last]
    hashtag.merge()  # collapse the matched tokens into one
    hashtag.set_flag(IS_HASHTAG, True)  # mark the merged span as a hashtag
|
||||
|
||||
p
|
||||
| To process a stream of social media posts, we can use
|
||||
| #[+api("language#pipe") #[code Language.pipe()]], which will return a
|
||||
| stream of #[code Doc] objects that we can pass to
|
||||
| #[+api("matcher#pipe") #[code Matcher.pipe()]].
|
||||
|
||||
+code.
|
||||
docs = nlp.pipe(LOTS_OF_TWEETS)
|
||||
matches = matcher.pipe(docs)
|
||||
|
|
|
@ -74,16 +74,14 @@ p
|
|||
+aside-code("meta.json", "json").
|
||||
{
|
||||
"name": "example_model",
|
||||
"lang": "en",
|
||||
"version": "1.0.0",
|
||||
"spacy_version": ">=2.0.0,<3.0.0",
|
||||
"description": "Example model for spaCy",
|
||||
"author": "You",
|
||||
"email": "you@example.com",
|
||||
"license": "CC BY-SA 3.0",
|
||||
"setup": {
|
||||
"lang": "en",
|
||||
"pipeline": ["token_vectors", "tagger"]
|
||||
}
|
||||
"pipeline": ["token_vectors", "tagger"]
|
||||
}
|
||||
|
||||
+code(false, "bash").
|
||||
|
@ -110,9 +108,9 @@ p
|
|||
+h(3, "models-custom") Customising the model setup
|
||||
|
||||
p
|
||||
| The meta.json includes a #[code setup] key that lets you customise how
|
||||
| the model should be initialised and loaded. You can define the language
|
||||
| data to be loaded and the
|
||||
| The meta.json includes the model details, like name, requirements and
|
||||
| license, and lets you customise how the model should be initialised and
|
||||
| loaded. You can define the language data to be loaded and the
|
||||
| #[+a("/docs/usage/language-processing-pipeline") processing pipeline] to
|
||||
| execute.
|
||||
|
||||
|
@ -183,9 +181,9 @@ p
|
|||
p
|
||||
| To load a model from a data directory, you can use
|
||||
| #[+api("spacy#load") #[code spacy.load()]] with the local path. This will
|
||||
| look for a meta.json in the directory and use the #[code setup] details
|
||||
| to initialise a #[code Language] class with a processing pipeline and
|
||||
| load in the model data.
|
||||
| look for a meta.json in the directory and use the #[code lang] and
|
||||
| #[code pipeline] settings to initialise a #[code Language] class with a
|
||||
| processing pipeline and load in the model data.
|
||||
|
||||
+code.
|
||||
nlp = spacy.load('/path/to/model')
|
||||
|
|
Loading…
Reference in New Issue
Block a user