diff --git a/spacy/__init__.py b/spacy/__init__.py index 8dc0937f5..6beb7955e 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -4,7 +4,7 @@ from __future__ import unicode_literals import importlib from .compat import basestring_ -from .cli.info import info +from .cli.info import info as cli_info from .glossary import explain from .deprecated import resolve_load_name from . import util @@ -20,3 +20,7 @@ def load(name, **overrides): overrides['meta'] = meta overrides['path'] = model_path return cls(**overrides) + + +def info(model=None, markdown=False): + return cli_info(None, model, markdown) diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py index ac608a64a..a0a76e5ec 100644 --- a/spacy/cli/convert.py +++ b/spacy/cli/convert.py @@ -24,8 +24,9 @@ CONVERTERS = { n_sents=("Number of sentences per doc", "option", "n", float), morphology=("Enable appending morphology to tags", "flag", "m", bool) ) -def convert(_, input_file, output_dir, n_sents, morphology): - """Convert files into JSON format for use with train command and other +def convert(cmd, input_file, output_dir, n_sents, morphology): + """ + Convert files into JSON format for use with train command and other experiment management functions. """ input_path = Path(input_file) diff --git a/spacy/cli/download.py b/spacy/cli/download.py index fdcacb891..b6e5549da 100644 --- a/spacy/cli/download.py +++ b/spacy/cli/download.py @@ -17,8 +17,9 @@ from .. import about direct=("force direct download. Needs model name with version and won't " "perform compatibility check", "flag", "d", bool) ) -def download(model, direct=False): - """Download compatible model from default download path using pip. Model +def download(cmd, model, direct=False): + """ + Download compatible model from default download path using pip. Model can be shortcut, model name or, if --direct flag is set, full model name with version. """ @@ -31,7 +32,7 @@ def download(model, direct=False): version = get_version(model_name, compatibility) download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model_name, v=version)) try: - link(model_name, model, force=True) + link(None, model_name, model, force=True) except: # Dirty, but since spacy.download and the auto-linking is mostly # a convenience wrapper, it's best to show a success message and diff --git a/spacy/cli/info.py b/spacy/cli/info.py index 6f7467521..75aac10c7 100644 --- a/spacy/cli/info.py +++ b/spacy/cli/info.py @@ -14,7 +14,7 @@ from .. import util model=("optional: shortcut link of model", "positional", None, str), markdown=("generate Markdown for GitHub issues", "flag", "md", str) ) -def info(model=None, markdown=False): +def info(cmd, model=None, markdown=False): """Print info about spaCy installation. If a model shortcut link is speficied as an argument, print model information. Flag --markdown prints details in Markdown for easy copy-pasting to GitHub issues. diff --git a/spacy/cli/link.py b/spacy/cli/link.py index 1feef8bce..9aecdabfe 100644 --- a/spacy/cli/link.py +++ b/spacy/cli/link.py @@ -14,8 +14,9 @@ from .. import util link_name=("name of shortuct link to create", "positional", None, str), force=("force overwriting of existing link", "flag", "f", bool) ) -def link(origin, link_name, force=False): - """Create a symlink for models within the spacy/data directory. Accepts +def link(cmd, origin, link_name, force=False): + """ + Create a symlink for models within the spacy/data directory. Accepts either the name of a pip package, or the local path to the model data directory. 
Linking models allows loading them via spacy.load(link_name). """ diff --git a/spacy/cli/package.py b/spacy/cli/package.py index 9acd0a2fa..e78a4eeb4 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -18,8 +18,9 @@ from .. import about meta=("path to meta.json", "option", "m", str), force=("force overwriting of existing folder in output directory", "flag", "f", bool) ) -def package(input_dir, output_dir, meta, force): - """Generate Python package for model data, including meta and required +def package(cmd, input_dir, output_dir, meta=None, force=False): + """ + Generate Python package for model data, including meta and required installation files. A new directory will be created in the specified output directory, and model data will be copied over. """ @@ -42,7 +43,7 @@ def package(input_dir, output_dir, meta, force): meta = util.read_json(meta_path) else: meta = generate_meta() - validate_meta(meta, ['lang', 'name', 'version']) + meta = validate_meta(meta, ['lang', 'name', 'version']) model_name = meta['lang'] + '_' + meta['name'] model_name_v = model_name + '-' + meta['version'] @@ -85,20 +86,32 @@ def generate_meta(): ('email', 'Author email', False), ('url', 'Author website', False), ('license', 'License', 'CC BY-NC 3.0')] - prints("Enter the package settings for your model.", title="Generating meta.json") meta = {} for setting, desc, default in settings: response = util.get_raw_input(desc, default) meta[setting] = default if response == '' and default else response + meta['pipeline'] = generate_pipeline() return meta +def generate_pipeline(): + prints("If set to 'True', the default pipeline is used. If set to 'False', " + "the pipeline will be disabled. Components should be specified as a " + "comma-separated list of component names, e.g. vectorizer, tagger, " + "parser, ner. For more information, see the docs on processing pipelines.", + title="Enter your model's pipeline components") + pipeline = util.get_raw_input("Pipeline components", True) + replace = {'True': True, 'False': False} + return replace[pipeline] if pipeline in replace else pipeline.split(', ') + + def validate_meta(meta, keys): for key in keys: if key not in meta or meta[key] == '': prints("This setting is required to build your package.", title='No "%s" setting found in meta.json' % key, exits=1) + return meta def get_template(filepath): diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 7bbda5a47..b1e9446ed 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -32,9 +32,11 @@ from .. import displacy no_parser=("Don't train parser", "flag", "P", bool), no_entities=("Don't train NER", "flag", "N", bool) ) -def train(_, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0, +def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0, use_gpu=False, no_tagger=False, no_parser=False, no_entities=False): - """Train a model. Expects data in spaCy's JSON format.""" + """ + Train a model. Expects data in spaCy's JSON format. + """ n_sents = n_sents or None output_path = util.ensure_path(output_dir) train_path = util.ensure_path(train_data) @@ -70,12 +72,12 @@ def train(_, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0, optimizer = nlp.begin_training(lambda: corpus.train_tuples, use_gpu=use_gpu) - print("Itn.\tDep. 
Loss\tUAS\tNER P.\tNER R.\tNER F.\tTag %\tToken %") + print("Itn.\tLoss\tUAS\tNER P.\tNER R.\tNER F.\tTag %\tToken %") try: for i in range(n_iter): with tqdm.tqdm(total=corpus.count_train(), leave=False) as pbar: train_docs = corpus.train_docs(nlp, projectivize=True, - gold_preproc=False, max_length=1000) + gold_preproc=False, max_length=0) losses = {} for batch in minibatch(train_docs, size=batch_sizes): docs, golds = zip(*batch) diff --git a/spacy/lang/bn/punctuation.py b/spacy/lang/bn/punctuation.py index 66b7d967c..96485dd55 100644 --- a/spacy/lang/bn/punctuation.py +++ b/spacy/lang/bn/punctuation.py @@ -1,8 +1,8 @@ # coding: utf8 from __future__ import unicode_literals -from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, UNITS -from ..char_classes import ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS, QUOTES +from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS +from ..char_classes import ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS, QUOTES, UNITS _currency = r"\$|¢|£|€|¥|฿|৳" @@ -10,16 +10,16 @@ _quotes = QUOTES.replace("'", '') _list_punct = LIST_PUNCT + '। ॥'.strip().split() -_prefixes = ([r'\+'] + _list_punct + LIST_ELLIPSES + LIST_QUOTES) +_prefixes = ([r'\+'] + _list_punct + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS) -_suffixes = (_list_punct + LIST_ELLIPSES + LIST_QUOTES + +_suffixes = (_list_punct + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS + [r'(?<=[0-9])\+', r'(?<=°[FfCcKk])\.', r'(?<=[0-9])(?:{})'.format(_currency), r'(?<=[0-9])(?:{})'.format(UNITS), r'(?<=[{}(?:{})])\.'.format('|'.join([ALPHA_LOWER, r'%²\-\)\]\+', QUOTES]), _currency)]) -_infixes = (LIST_ELLIPSES + +_infixes = (LIST_ELLIPSES + LIST_ICONS + [r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER), r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA), r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA), diff --git a/spacy/lang/char_classes.py b/spacy/lang/char_classes.py index 5b81eddde..bec685646 100644 --- a/spacy/lang/char_classes.py +++ b/spacy/lang/char_classes.py @@ -20,7 +20,6 @@ _upper = [_latin_upper] _lower = [_latin_lower] _uncased = [_bengali, _hebrew] - ALPHA = merge_char_classes(_upper + _lower + _uncased) ALPHA_LOWER = merge_char_classes(_lower + _uncased) ALPHA_UPPER = merge_char_classes(_upper + _uncased) @@ -33,13 +32,14 @@ _currency = r'\$ £ € ¥ ฿ US\$ C\$ A\$' _punct = r'… , : ; \! \? 
¿ ¡ \( \) \[ \] \{ \} < > _ # \* &' _quotes = r'\' \'\' " ” “ `` ` ‘ ´ ‚ , „ » «' _hyphens = '- – — -- ---' - +_other_symbols = r'[\p{So}]' UNITS = merge_chars(_units) CURRENCY = merge_chars(_currency) QUOTES = merge_chars(_quotes) PUNCT = merge_chars(_punct) HYPHENS = merge_chars(_hyphens) +ICONS = _other_symbols LIST_UNITS = split_chars(_units) LIST_CURRENCY = split_chars(_currency) @@ -47,3 +47,4 @@ LIST_QUOTES = split_chars(_quotes) LIST_PUNCT = split_chars(_punct) LIST_HYPHENS = split_chars(_hyphens) LIST_ELLIPSES = [r'\.\.+', '…'] +LIST_ICONS = [_other_symbols] diff --git a/spacy/lang/punctuation.py b/spacy/lang/punctuation.py index 74bb28f5f..680f5cff0 100644 --- a/spacy/lang/punctuation.py +++ b/spacy/lang/punctuation.py @@ -2,15 +2,16 @@ from __future__ import unicode_literals from .char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY -from .char_classes import ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS, QUOTES -from .char_classes import CURRENCY, UNITS +from .char_classes import LIST_ICONS, ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS +from .char_classes import QUOTES, CURRENCY, UNITS _prefixes = (['§', '%', '=', r'\+'] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + - LIST_CURRENCY) + LIST_CURRENCY + LIST_ICONS) -_suffixes = (["'s", "'S", "’s", "’S"] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + +_suffixes = (LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS + + ["'s", "'S", "’s", "’S"] + [r'(?<=[0-9])\+', r'(?<=°[FfCcKk])\.', r'(?<=[0-9])(?:{})'.format(CURRENCY), @@ -19,7 +20,7 @@ _suffixes = (["'s", "'S", "’s", "’S"] + LIST_PUNCT + LIST_ELLIPSES + LIST_QU r'(?<=[{a}][{a}])\.'.format(a=ALPHA_UPPER)]) -_infixes = (LIST_ELLIPSES + +_infixes = (LIST_ELLIPSES + LIST_ICONS + [r'(?<=[0-9])[+\-\*^](?=[0-9-])', r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER), r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA), diff --git a/spacy/language.py b/spacy/language.py index e4c18f8ca..7adae0ed5 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -212,18 +212,17 @@ class Language(object): """ tok2vec = self.pipeline[0] feats = tok2vec.doc2feats(docs) - procs = list(self.pipeline[1:]) - random.shuffle(procs) grads = {} def get_grads(W, dW, key=None): grads[key] = (W, dW) - for proc in procs: + for proc in self.pipeline[1:]: if not hasattr(proc, 'update'): continue tokvecses, bp_tokvecses = tok2vec.model.begin_update(feats, drop=drop) d_tokvecses = proc.update((docs, tokvecses), golds, drop=drop, sgd=get_grads, losses=losses) - bp_tokvecses(d_tokvecses, sgd=sgd) + if d_tokvecses is not None: + bp_tokvecses(d_tokvecses, sgd=sgd) for key, (W, dW) in grads.items(): sgd(W, dW, key=key) # Clear the tensor variable, to free GPU memory. 
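A rough illustration of what the LIST_ICONS / \p{So} additions above are meant to do, using one of the strings from the new test_tokenizer_handles_emoji test further down. This is only a sketch and assumes the blank English tokenizer from spacy.lang.en, with no model loaded:

    from spacy.lang.en import English

    nlp = English()        # tokenizer only, no model required
    doc = nlp(u'i💙you')    # same string as the new test case
    # the emoji is picked up by the new \p{So} icon rules and split off
    assert [t.text for t in doc] == [u'i', u'💙', u'you']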
diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx
index 6db6e5ae1..095dcc5e7 100644
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -432,6 +432,8 @@ cdef class Parser:
                                              0.0)
         todo = [(s, g) for (s, g) in zip(states, golds)
                 if not s.is_final() and g is not None]
+        if not todo:
+            return None
 
         backprops = []
         d_tokvecs = state2vec.ops.allocate(tokvecs.shape)
diff --git a/spacy/tests/tokenizer/test_exceptions.py b/spacy/tests/tokenizer/test_exceptions.py
index aab27714e..70fb103dc 100644
--- a/spacy/tests/tokenizer/test_exceptions.py
+++ b/spacy/tests/tokenizer/test_exceptions.py
@@ -1,7 +1,4 @@
 # coding: utf-8
-"""Test that tokenizer exceptions and emoticons are handled correctly."""
-
-
 from __future__ import unicode_literals
 
 import pytest
@@ -39,3 +36,12 @@ def test_tokenizer_handles_emoticons(tokenizer):
 def test_tokenizer_excludes_false_pos_emoticons(tokenizer, text, length):
     tokens = tokenizer(text)
     assert len(tokens) == length
+
+
+@pytest.mark.parametrize('text,length', [('can you still dunk?🍕🍔😵LOL', 8),
+                                         ('i💙you', 3), ('🤘🤘yay!', 4)])
+def test_tokenizer_handles_emoji(tokenizer, text, length):
+    exceptions = ["hu"]
+    tokens = tokenizer(text)
+    if tokens[0].lang_ not in exceptions:
+        assert len(tokens) == length
diff --git a/website/docs/usage/language-processing-pipeline.jade b/website/docs/usage/language-processing-pipeline.jade
index ce23a1666..1392fc2f8 100644
--- a/website/docs/usage/language-processing-pipeline.jade
+++ b/website/docs/usage/language-processing-pipeline.jade
@@ -19,19 +19,17 @@ p
 
 p
     | When you load a model, spaCy first consults the model's
-    | #[+a("/docs/usage/saving-loading#models-generating") meta.json] for its
-    | #[code setup] details. This typically includes the ID of a language class,
+    | #[+a("/docs/usage/saving-loading#models-generating") meta.json]. The
+    | meta typically includes the model details, the ID of a language class,
     | and an optional list of pipeline components. spaCy then does the
     | following:
 
 +aside-code("meta.json (excerpt)", "json").
     {
         "name": "example_model",
+        "lang": "en",
         "description": "Example model for spaCy",
-        "setup": {
-            "lang": "en",
-            "pipeline": ["token_vectors", "tagger"]
-        }
+        "pipeline": ["token_vectors", "tagger"]
     }
 
 +list("numbers")
@@ -287,17 +285,15 @@ p
 
 p
     | In the model package's meta.json, specify the language class and pipeline
-    | IDs in #[code setup]:
+    | IDs:
 
 +code("meta.json (excerpt)", "json").
     {
-        "name": "my_sentiment_model",
+        "name": "sentiment_model",
+        "lang": "en",
         "version": "1.0.0",
         "spacy_version": ">=2.0.0,<3.0.0",
-        "setup": {
-            "lang": "en",
-            "pipeline": ["vectorizer", "sentiment"]
-        }
+        "pipeline": ["vectorizer", "sentiment"]
     }
 
 p
@@ -307,7 +303,7 @@ p
     | by your custom #[code "sentiment"] factory.
 
 +code.
-    nlp = spacy.load('my_sentiment_model')
+    nlp = spacy.load('en_sentiment_model')
     doc = nlp(u'I love pizza')
     assert doc.sentiment
 
diff --git a/website/docs/usage/lightning-tour.jade b/website/docs/usage/lightning-tour.jade
index 4a9a2315f..7de486070 100644
--- a/website/docs/usage/lightning-tour.jade
+++ b/website/docs/usage/lightning-tour.jade
@@ -129,13 +129,14 @@ p
 
 +code.
     import spacy
     from spacy.tokens.doc import Doc
+    from spacy.vocab import Vocab
 
     nlp = spacy.load('en')
     moby_dick = open('moby_dick.txt', 'r')
     doc = nlp(moby_dick)
     doc.to_disk('/moby_dick.bin')
-    new_doc = Doc().from_disk('/moby_dick.bin')
+    new_doc = Doc(Vocab()).from_disk('/moby_dick.bin')
 
 +infobox
     | #[strong Usage:] #[+a("/docs/usage/saving-loading") Saving and loading]
@@ -148,9 +149,14 @@ p
 
     nlp = spacy.load('en')
     matcher = Matcher(nlp.vocab)
-    # match "Google I/O" or "Google i/o"
-    pattern = [{'ORTH': 'Google'}, {'UPPER': 'I'}, {'ORTH': '/'}, {'UPPER': 'O'}]
-    matcher.add('GoogleIO', None, pattern)
+
+    def set_sentiment(matcher, doc, i, matches):
+        doc.sentiment += 0.1
+
+    pattern1 = [{'ORTH': 'Google'}, {'UPPER': 'I'}, {'ORTH': '/'}, {'UPPER': 'O'}]
+    pattern2 = [[{'ORTH': emoji, 'OP': '+'}] for emoji in ['😀', '😂', '🤣', '😍']]
+    matcher.add('GoogleIO', None, pattern1)  # match "Google I/O" or "Google i/o"
+    matcher.add('HAPPY', set_sentiment, *pattern2)  # match one or more happy emoji
     matches = nlp(LOTS_OF_TEXT)
 
 +infobox
diff --git a/website/docs/usage/rule-based-matching.jade b/website/docs/usage/rule-based-matching.jade
index a54b70b89..fde6da6ef 100644
--- a/website/docs/usage/rule-based-matching.jade
+++ b/website/docs/usage/rule-based-matching.jade
@@ -11,7 +11,7 @@ p
     | You can also associate patterns with entity IDs, to allow some basic
     | entity linking or disambiguation.
 
-+aside("What about \"real\" regular expressions?")
+//-+aside("What about \"real\" regular expressions?")
 
 +h(2, "adding-patterns") Adding patterns
 
@@ -119,7 +119,7 @@ p
 +code.
     # Add a new custom flag to the vocab, which is always False by default.
     # BAD_HTML_FLAG will be the flag ID, which we can use to set it to True on the span.
-    BAD_HTML_FLAG = doc.vocab.add_flag(lambda text: False)
+    BAD_HTML_FLAG = nlp.vocab.add_flag(lambda text: False)
 
     def merge_and_flag(matcher, doc, i, matches):
         match_id, start, end = matches[i]
@@ -221,7 +221,7 @@ p
         +cell match 0 or 1 times
         +cell optional, max one
 
-+h(3, "quantifiers-example1") Quantifiers example: Using linguistic annotations
++h(2, "example1") Example: Using linguistic annotations
 
 p
     | Let's say you're analysing user comments and you want to find out what
@@ -283,7 +283,7 @@ p
     # set manual=True to make displaCy render straight from a dictionary
     displacy.serve(matched_sents, style='ent', manual=True)
 
-+h(3, "quantifiers-example2") Quantifiers example: Phone numbers
++h(2, "example2") Example: Phone numbers
 
 p
     | Phone numbers can have many different formats and matching them is often
@@ -320,3 +320,114 @@ p
     | It'll produce more predictable results, is much easier to modify and
     | extend, and doesn't require any training data – only a set of
     | test cases.
+
++h(2, "example3") Example: Hashtags and emoji on social media
+
+p
+    | Social media posts, especially tweets, can be difficult to work with.
+    | They're very short and often contain various emoji and hashtags. By only
+    | looking at the plain text, you'll lose a lot of valuable semantic
+    | information.
+
+p
+    | Let's say you've extracted a large sample of social media posts on a
+    | specific topic, for example posts mentioning a brand name or product.
+    | As the first step of your data exploration, you want to filter out posts
+    | containing certain emoji and use them to assign a general sentiment
+    | score, based on whether the expressed emotion is positive or negative,
+    | e.g. #[span.o-icon.o-icon--inline 😀] or #[span.o-icon.o-icon--inline 😞].
+    | You also want to find, merge and label hashtags like
+    | #[code #MondayMotivation], to be able to ignore or analyse them later.
+
++aside("Note on sentiment analysis")
+    | Ultimately, sentiment analysis is not always #[em that] easy. In
+    | addition to the emoji, you'll also want to take specific words into
+    | account and check the #[code subtree] for intensifiers like "very", to
+    | increase the sentiment score. At some point, you might also want to train
+    | a sentiment model. However, the approach described in this example is
+    | very useful for #[strong bootstrapping rules to gather training data].
+    | It's also an incredibly fast way to gather first insights into your data
+    | – with about 1 million tweets, you'd be looking at a processing time of
+    | #[strong under 1 minute].
+
+p
+    | By default, spaCy's tokenizer will split emoji into separate tokens. This
+    | means that you can create a pattern for one or more emoji tokens. In this
+    | case, a sequence of identical emoji should be treated as one instance.
+    | Valid hashtags usually consist of a #[code #], plus a sequence of
+    | ASCII characters with no whitespace, making them easy to match as well.
+
++code.
+    from spacy.lang.en import English
+    from spacy.matcher import Matcher
+
+    nlp = English()  # we only want the tokenizer, so no need to load a model
+    matcher = Matcher(nlp.vocab)
+
+    pos_emoji = [u'😀', u'😃', u'😂', u'🤣', u'😊', u'😍']  # positive emoji
+    neg_emoji = [u'😞', u'😠', u'😩', u'😢', u'😭', u'😒']  # negative emoji
+
+    # add patterns to match one or more emoji tokens
+    pos_patterns = [[{'ORTH': emoji, 'OP': '+'}] for emoji in pos_emoji]
+    neg_patterns = [[{'ORTH': emoji, 'OP': '+'}] for emoji in neg_emoji]
+
+    matcher.add('HAPPY', label_sentiment, *pos_patterns)  # add positive pattern
+    matcher.add('SAD', label_sentiment, *neg_patterns)  # add negative pattern
+
+    # add pattern to merge valid hashtag, i.e. '#' plus any ASCII token
+    matcher.add('HASHTAG', merge_hashtag, [{'ORTH': '#'}, {'IS_ASCII': True}])
+
+p
+    | Because the #[code on_match] callback receives the ID of each match, you
+    | can use the same function to handle the sentiment assignment for both
+    | the positive and negative pattern. To keep it simple, we'll either add
+    | or subtract #[code 0.1] points – this way, the score will also reflect
+    | combinations of emoji, even positive #[em and] negative ones.
+
+p
+    | With a library like
+    | #[+a("https://github.com/bcongdon/python-emojipedia") Emojipedia],
+    | we can also retrieve a short description for each emoji – for example,
+    | #[span.o-icon.o-icon--inline 😍]'s official title is "Smiling Face With
+    | Heart-Eyes". Assigning it to the merged token's norm will make it
+    | available as #[code token.norm_].
+
++code.
+    from emojipedia import Emojipedia  # installation: pip install emojipedia
+
+    def label_sentiment(matcher, doc, i, matches):
+        match_id, start, end = matches[i]
+        if doc.vocab.strings[match_id] == 'HAPPY':
+            doc.sentiment += 0.1  # add 0.1 for positive sentiment
+        elif doc.vocab.strings[match_id] == 'SAD':
+            doc.sentiment -= 0.1  # subtract 0.1 for negative sentiment
+        span = doc[start : end]
+        emoji = Emojipedia.search(span[0].text)  # get data for emoji
+        span.merge(norm=emoji.title)  # merge span and set NORM to emoji title
+
+p
+    | To label the hashtags, we first need to add a new custom flag.
+    | #[code IS_HASHTAG] will be the flag's ID, which you can use to assign it
+    | to the hashtag's span, and check its value via a token's
+    | #[+api("token#check_flag") #[code check_flag()]] method. On each
+    | match, we merge the hashtag and assign the flag.
+
++code.
+    # Add a new custom flag to the vocab, which is always False by default
+    IS_HASHTAG = nlp.vocab.add_flag(lambda text: False)
+
+    def merge_hashtag(matcher, doc, i, matches):
+        match_id, start, end = matches[i]
+        span = doc[start : end]
+        span.merge()  # merge hashtag
+        span.set_flag(IS_HASHTAG, True)  # set IS_HASHTAG to True
+
+p
+    | To process a stream of social media posts, we can use
+    | #[+api("language#pipe") #[code Language.pipe()]], which will return a
+    | stream of #[code Doc] objects that we can pass to
+    | #[+api("matcher#pipe") #[code Matcher.pipe()]].
+
++code.
+    docs = nlp.pipe(LOTS_OF_TWEETS)
+    matches = matcher.pipe(docs)
diff --git a/website/docs/usage/saving-loading.jade b/website/docs/usage/saving-loading.jade
index 477db925c..1ecb7d7ee 100644
--- a/website/docs/usage/saving-loading.jade
+++ b/website/docs/usage/saving-loading.jade
@@ -74,16 +74,14 @@ p
 +aside-code("meta.json", "json").
     {
         "name": "example_model",
+        "lang": "en",
         "version": "1.0.0",
         "spacy_version": ">=2.0.0,<3.0.0",
         "description": "Example model for spaCy",
         "author": "You",
         "email": "you@example.com",
         "license": "CC BY-SA 3.0",
-        "setup": {
-            "lang": "en",
-            "pipeline": ["token_vectors", "tagger"]
-        }
+        "pipeline": ["token_vectors", "tagger"]
     }
 
 +code(false, "bash").
@@ -110,9 +108,9 @@ p
 +h(3, "models-custom") Customising the model setup
 
 p
-    | The meta.json includes a #[code setup] key that lets you customise how
-    | the model should be initialised and loaded. You can define the language
-    | data to be loaded and the
+    | The meta.json includes the model details, like name, requirements and
+    | license, and lets you customise how the model should be initialised and
+    | loaded. You can define the language data to be loaded and the
     | #[+a("/docs/usage/language-processing-pipeline") processing pipeline] to
     | execute.
 
@@ -183,9 +181,9 @@ p
 p
     | To load a model from a data directory, you can use
     | #[+api("spacy#load") #[code spacy.load()]] with the local path. This will
-    | look for a meta.json in the directory and use the #[code setup] details
-    | to initialise a #[code Language] class with a processing pipeline and
-    | load in the model data.
+    | look for a meta.json in the directory and use the #[code lang] and
+    | #[code pipeline] settings to initialise a #[code Language] class with a
+    | processing pipeline and load in the model data.
 
 +code.
     nlp = spacy.load('/path/to/model')
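The signature changes above make every CLI command take the plac command object as its first argument; callers that invoke these functions from Python pass None instead, which is why download() now calls link(None, ...) and the new module-level spacy.info() forwards to cli_info(None, model, markdown). A minimal usage sketch of that wrapper, assuming an 'en' shortcut link is installed:

    import spacy

    spacy.info()                      # prints details about the spaCy installation
    spacy.info('en', markdown=True)   # prints model details as Markdown for GitHub issues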