From a8e58e04efc5b57a2425595eaf1e049c23a37352 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 27 May 2017 17:57:10 +0200 Subject: [PATCH 01/25] Add symbols class to punctuation rules to handle emoji (see #1088) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently doesn't work for Hungarian, because of conflicts with the custom punctuation rules. Also doesn't take multi-character emoji like πŸ‘©πŸ½β€πŸ’» into account. --- spacy/lang/bn/punctuation.py | 10 +++++----- spacy/lang/char_classes.py | 5 +++-- spacy/lang/punctuation.py | 11 ++++++----- spacy/tests/tokenizer/test_exceptions.py | 12 +++++++++--- 4 files changed, 23 insertions(+), 15 deletions(-) diff --git a/spacy/lang/bn/punctuation.py b/spacy/lang/bn/punctuation.py index 66b7d967c..96485dd55 100644 --- a/spacy/lang/bn/punctuation.py +++ b/spacy/lang/bn/punctuation.py @@ -1,8 +1,8 @@ # coding: utf8 from __future__ import unicode_literals -from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, UNITS -from ..char_classes import ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS, QUOTES +from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS +from ..char_classes import ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS, QUOTES, UNITS _currency = r"\$|Β’|Β£|€|Β₯|ΰΈΏ|৳" @@ -10,16 +10,16 @@ _quotes = QUOTES.replace("'", '') _list_punct = LIST_PUNCT + 'ΰ₯€ ΰ₯₯'.strip().split() -_prefixes = ([r'\+'] + _list_punct + LIST_ELLIPSES + LIST_QUOTES) +_prefixes = ([r'\+'] + _list_punct + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS) -_suffixes = (_list_punct + LIST_ELLIPSES + LIST_QUOTES + +_suffixes = (_list_punct + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS + [r'(?<=[0-9])\+', r'(?<=Β°[FfCcKk])\.', r'(?<=[0-9])(?:{})'.format(_currency), r'(?<=[0-9])(?:{})'.format(UNITS), r'(?<=[{}(?:{})])\.'.format('|'.join([ALPHA_LOWER, r'%Β²\-\)\]\+', QUOTES]), _currency)]) -_infixes = (LIST_ELLIPSES + +_infixes = (LIST_ELLIPSES + LIST_ICONS + [r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER), r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA), r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA), diff --git a/spacy/lang/char_classes.py b/spacy/lang/char_classes.py index 5b81eddde..bec685646 100644 --- a/spacy/lang/char_classes.py +++ b/spacy/lang/char_classes.py @@ -20,7 +20,6 @@ _upper = [_latin_upper] _lower = [_latin_lower] _uncased = [_bengali, _hebrew] - ALPHA = merge_char_classes(_upper + _lower + _uncased) ALPHA_LOWER = merge_char_classes(_lower + _uncased) ALPHA_UPPER = merge_char_classes(_upper + _uncased) @@ -33,13 +32,14 @@ _currency = r'\$ Β£ € Β₯ ΰΈΏ US\$ C\$ A\$' _punct = r'… , : ; \! \? 
ΒΏ Β‘ \( \) \[ \] \{ \} < > _ # \* &' _quotes = r'\' \'\' " ” β€œ `` ` β€˜ Β΄ β€š , β€ž Β» Β«' _hyphens = '- – β€” -- ---' - +_other_symbols = r'[\p{So}]' UNITS = merge_chars(_units) CURRENCY = merge_chars(_currency) QUOTES = merge_chars(_quotes) PUNCT = merge_chars(_punct) HYPHENS = merge_chars(_hyphens) +ICONS = _other_symbols LIST_UNITS = split_chars(_units) LIST_CURRENCY = split_chars(_currency) @@ -47,3 +47,4 @@ LIST_QUOTES = split_chars(_quotes) LIST_PUNCT = split_chars(_punct) LIST_HYPHENS = split_chars(_hyphens) LIST_ELLIPSES = [r'\.\.+', '…'] +LIST_ICONS = [_other_symbols] diff --git a/spacy/lang/punctuation.py b/spacy/lang/punctuation.py index 74bb28f5f..680f5cff0 100644 --- a/spacy/lang/punctuation.py +++ b/spacy/lang/punctuation.py @@ -2,15 +2,16 @@ from __future__ import unicode_literals from .char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY -from .char_classes import ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS, QUOTES -from .char_classes import CURRENCY, UNITS +from .char_classes import LIST_ICONS, ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS +from .char_classes import QUOTES, CURRENCY, UNITS _prefixes = (['Β§', '%', '=', r'\+'] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + - LIST_CURRENCY) + LIST_CURRENCY + LIST_ICONS) -_suffixes = (["'s", "'S", "’s", "’S"] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + +_suffixes = (LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS + + ["'s", "'S", "’s", "’S"] + [r'(?<=[0-9])\+', r'(?<=Β°[FfCcKk])\.', r'(?<=[0-9])(?:{})'.format(CURRENCY), @@ -19,7 +20,7 @@ _suffixes = (["'s", "'S", "’s", "’S"] + LIST_PUNCT + LIST_ELLIPSES + LIST_QU r'(?<=[{a}][{a}])\.'.format(a=ALPHA_UPPER)]) -_infixes = (LIST_ELLIPSES + +_infixes = (LIST_ELLIPSES + LIST_ICONS + [r'(?<=[0-9])[+\-\*^](?=[0-9-])', r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER), r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA), diff --git a/spacy/tests/tokenizer/test_exceptions.py b/spacy/tests/tokenizer/test_exceptions.py index aab27714e..70fb103dc 100644 --- a/spacy/tests/tokenizer/test_exceptions.py +++ b/spacy/tests/tokenizer/test_exceptions.py @@ -1,7 +1,4 @@ # coding: utf-8 -"""Test that tokenizer exceptions and emoticons are handled correctly.""" - - from __future__ import unicode_literals import pytest @@ -39,3 +36,12 @@ def test_tokenizer_handles_emoticons(tokenizer): def test_tokenizer_excludes_false_pos_emoticons(tokenizer, text, length): tokens = tokenizer(text) assert len(tokens) == length + + +@pytest.mark.parametrize('text,length', [('can you still dunk?πŸ•πŸ”πŸ˜΅LOL', 8), + ('iπŸ’™you', 3), ('🀘🀘yay!', 4)]) +def test_tokenizer_handles_emoji(tokenizer, text, length): + exceptions = ["hu"] + tokens = tokenizer(text) + if tokens[0].lang_ not in exceptions: + assert len(tokens) == length From e05bcd6aa838a7098c699a920e92628296961927 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 27 May 2017 17:57:46 +0200 Subject: [PATCH 02/25] Update docs to reflect flattened model meta.json Don't use "setup" key and instead, keep "lang" on root level and add "pipeline". 
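For reference, a minimal sketch of the flattened layout this change documents (the field values below are illustrative placeholders taken from the docs excerpt in the diff, not a real model):

    {
        "name": "example_model",
        "lang": "en",
        "pipeline": ["token_vectors", "tagger"]
    }

"lang" and "pipeline" now live at the root of meta.json rather than under a "setup" key.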
--- .../usage/language-processing-pipeline.jade | 22 ++++++++----------- website/docs/usage/saving-loading.jade | 18 +++++++-------- 2 files changed, 17 insertions(+), 23 deletions(-) diff --git a/website/docs/usage/language-processing-pipeline.jade b/website/docs/usage/language-processing-pipeline.jade index ce23a1666..1392fc2f8 100644 --- a/website/docs/usage/language-processing-pipeline.jade +++ b/website/docs/usage/language-processing-pipeline.jade @@ -19,19 +19,17 @@ p p | When you load a model, spaCy first consults the model's - | #[+a("/docs/usage/saving-loading#models-generating") meta.json] for its - | #[code setup] details. This typically includes the ID of a language class, + | #[+a("/docs/usage/saving-loading#models-generating") meta.json]. The + | meta typically includes the model details, the ID of a language class, | and an optional list of pipeline components. spaCy then does the | following: +aside-code("meta.json (excerpt)", "json"). { "name": "example_model", + "lang": "en" "description": "Example model for spaCy", - "setup": { - "lang": "en", - "pipeline": ["token_vectors", "tagger"] - } + "pipeline": ["token_vectors", "tagger"] } +list("numbers") @@ -287,17 +285,15 @@ p p | In the model package's meta.json, specify the language class and pipeline - | IDs in #[code setup]: + | IDs: +code("meta.json (excerpt)", "json"). { - "name": "my_sentiment_model", + "name": "sentiment_model", + "lang": "en", "version": "1.0.0", "spacy_version": ">=2.0.0,<3.0.0", - "setup": { - "lang": "en", - "pipeline": ["vectorizer", "sentiment"] - } + "pipeline": ["vectorizer", "sentiment"] } p @@ -307,7 +303,7 @@ p | by your custom #[code "sentiment"] factory. +code. - nlp = spacy.load('my_sentiment_model') + nlp = spacy.load('en_sentiment_model') doc = nlp(u'I love pizza') assert doc.sentiment diff --git a/website/docs/usage/saving-loading.jade b/website/docs/usage/saving-loading.jade index 477db925c..1ecb7d7ee 100644 --- a/website/docs/usage/saving-loading.jade +++ b/website/docs/usage/saving-loading.jade @@ -74,16 +74,14 @@ p +aside-code("meta.json", "json"). { "name": "example_model", + "lang": "en", "version": "1.0.0", "spacy_version": ">=2.0.0,<3.0.0", "description": "Example model for spaCy", "author": "You", "email": "you@example.com", "license": "CC BY-SA 3.0", - "setup": { - "lang": "en", - "pipeline": ["token_vectors", "tagger"] - } + "pipeline": ["token_vectors", "tagger"] } +code(false, "bash"). @@ -110,9 +108,9 @@ p +h(3, "models-custom") Customising the model setup p - | The meta.json includes a #[code setup] key that lets you customise how - | the model should be initialised and loaded. You can define the language - | data to be loaded and the + | The meta.json includes the model details, like name, requirements and + | license, and lets you customise how the model should be initialised and + | loaded. You can define the language data to be loaded and the | #[+a("/docs/usage/language-processing-pipeline") processing pipeline] to | execute. @@ -183,9 +181,9 @@ p p | To load a model from a data directory, you can use | #[+api("spacy#load") #[code spacy.load()]] with the local path. This will - | look for a meta.json in the directory and use the #[code setup] details - | to initialise a #[code Language] class with a processing pipeline and - | load in the model data. + | look for a meta.json in the directory and use the #[code lang] and + | #[code pipeline] settings to initialise a #[code Language] class with a + | processing pipeline and load in the model data. +code. 
nlp = spacy.load('/path/to/model') From 0d33ead507bfc79ac341fd9b0bbe3a1e8aacc1d9 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 27 May 2017 17:58:06 +0200 Subject: [PATCH 03/25] Fix initialisation of Doc in lightning tour example --- website/docs/usage/lightning-tour.jade | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/website/docs/usage/lightning-tour.jade b/website/docs/usage/lightning-tour.jade index 4a9a2315f..eefb7a11a 100644 --- a/website/docs/usage/lightning-tour.jade +++ b/website/docs/usage/lightning-tour.jade @@ -129,13 +129,14 @@ p +code. import spacy from spacy.tokens.doc import Doc + from spacy.vocab import Vocab nlp = spacy.load('en') moby_dick = open('moby_dick.txt', 'r') doc = nlp(moby_dick) doc.to_disk('/moby_dick.bin') - new_doc = Doc().from_disk('/moby_dick.bin') + new_doc = Doc(Vocab()).from_disk('/moby_dick.bin') +infobox | #[strong Usage:] #[+a("/docs/usage/saving-loading") Saving and loading] From 22bf5f63bfb4a37fc8b01724c121d2abbfecaf6e Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 27 May 2017 17:58:18 +0200 Subject: [PATCH 04/25] Update Matcher docs and add social media analysis example --- website/docs/usage/rule-based-matching.jade | 119 +++++++++++++++++++- 1 file changed, 115 insertions(+), 4 deletions(-) diff --git a/website/docs/usage/rule-based-matching.jade b/website/docs/usage/rule-based-matching.jade index a54b70b89..fde6da6ef 100644 --- a/website/docs/usage/rule-based-matching.jade +++ b/website/docs/usage/rule-based-matching.jade @@ -11,7 +11,7 @@ p | You can also associate patterns with entity IDs, to allow some basic | entity linking or disambiguation. -+aside("What about \"real\" regular expressions?") +//-+aside("What about \"real\" regular expressions?") +h(2, "adding-patterns") Adding patterns @@ -119,7 +119,7 @@ p +code. # Add a new custom flag to the vocab, which is always False by default. # BAD_HTML_FLAG will be the flag ID, which we can use to set it to True on the span. - BAD_HTML_FLAG = doc.vocab.add_flag(lambda text: False) + BAD_HTML_FLAG = nlp.vocab.add_flag(lambda text: False) def merge_and_flag(matcher, doc, i, matches): match_id, start, end = matches[i] @@ -221,7 +221,7 @@ p +cell match 0 or 1 times +cell optional, max one -+h(3, "quantifiers-example1") Quantifiers example: Using linguistic annotations ++h(2, "example1") Example: Using linguistic annotations p | Let's say you're analysing user comments and you want to find out what @@ -283,7 +283,7 @@ p # set manual=True to make displaCy render straight from a dictionary displacy.serve(matched_sents, style='ent', manual=True) -+h(3, "quantifiers-example2") Quantifiers example: Phone numbers ++h(2, "example2") Example: Phone numbers p | Phone numbers can have many different formats and matching them is often @@ -320,3 +320,114 @@ p | It'll produce more predictable results, is much easier to modify and | extend, and doesn't require any training data – only a set of | test cases. + ++h(2, "example3") Example: Hashtags and emoji on social media + +p + | Social media posts, especially tweets, can be difficult to work with. + | They're very short and often contain various emoji and hashtags. By only + | looking at the plain text, you'll lose a lot of valuable semantic + | information. + +p + | Let's say you've extracted a large sample of social media posts on a + | specific topic, for example posts mentioning a brand name or product. 
+ | As the first step of your data exploration, you want to filter out posts + | containing certain emoji and use them to assign a general sentiment + | score, based on whether the expressed emotion is positive or negative, + | e.g. #[span.o-icon.o-icon--inline πŸ˜€] or #[span.o-icon.o-icon--inline 😞]. + | You also want to find, merge and label hashtags like + | #[code #MondayMotivation], to be able to ignore or analyse them later. + ++aside("Note on sentiment analysis") + | Ultimately, sentiment analysis is not always #[em that] easy. In + | addition to the emoji, you'll also want to take specific words into + | account and check the #[code subtree] for intensifiers like "very", to + | increase the sentiment score. At some point, you might also want to train + | a sentiment model. However, the approach described in this example is + | very useful for #[strong bootstrapping rules to gather training data]. + | It's also an incredibly fast way to gather first insights into your data + | – with about 1 million tweets, you'd be looking at a processing time of + | #[strong under 1 minute]. + +p + | By default, spaCy's tokenizer will split emoji into separate tokens. This + | means that you can create a pattern for one or more emoji tokens. In this + | case, a sequence of identical emoji should be treated as one instance. + | Valid hashtags usually consist of a #[code #], plus a sequence of + | ASCII characters with no whitespace, making them easy to match as well. + ++code. + from spacy.lang.en import English + from spacy.matcher import Matcher + + nlp = English() # we only want the tokenizer, so no need to load a model + matcher = Matcher(nlp.vocab) + + pos_emoji = [u'πŸ˜€', u'πŸ˜ƒ', u'πŸ˜‚', u'🀣', u'😊', u'😍'] # positive emoji + neg_emoji = [u'😞', u'😠', u'😩', u'😒', u'😭', u'πŸ˜’'] # negative emoji + + # add patterns to match one or more emoji tokens + pos_patterns = [[{'ORTH': emoji, 'OP': '+'}] for emoji in pos_emoji] + neg_patterns = [[{'ORTH': emoji, 'OP': '+'}] for emoji in neg_emoji] + + matcher.add('HAPPY', label_sentiment, *pos_patterns) # add positive pattern + matcher.add('SAD', label_sentiment, *neg_patterns) # add negative pattern + + # add pattern to merge valid hashtag, i.e. '#' plus any ASCII token + matcher.add('HASHTAG', merge_hashtag, [{'ORTH': '#'}, {'IS_ASCII': True}]) + +p + | Because the #[code on_match] callback receives the ID of each match, you + | can use the same function to handle the sentiment assignment for both + | the positive and negative pattern. To keep it simple, we'll either add + | or subtract #[code 0.1] points – this way, the score will also reflect + | combinations of emoji, even positive #[em and] negative ones. + +p + | With a library like + | #[+a("https://github.com/bcongdon/python-emojipedia") Emojipedia], + | we can also retrieve a short description for each emoji – for example, + | #[span.o-icon.o-icon--inline 😍]'s official title is "Smiling Face With + | Heart-Eyes". Assigning it to the merged token's norm will make it + | available as #[code token.norm_]. + ++code. 
+ from emojipedia import Emojipedia # installation: pip install emojipedia + + def label_sentiment(matcher, doc, i, matches): + match_id, start, end = matches[i] + if match_id is 'HAPPY': + doc.sentiment += 0.1 # add 0.1 for positive sentiment + elif match_id is 'SAD': + doc.sentiment -= 0.1 # subtract 0.1 for negative sentiment + span = doc[start : end] + emoji = Emojipedia.search(span[0].text) # get data for emoji + span.merge(norm=emoji.title) # merge span and set NORM to emoji title + +p + | To label the hashtags, we first need to add a new custom flag. + | #[code IS_HASHTAG] will be the flag's ID, which you can use to assign it + | to the hashtag's span, and check its value via a token's + | #[+api("token#check_flag") #[code code check_flag()]] method. On each + | match, we merge the hashtag and assign the flag. + ++code. + # Add a new custom flag to the vocab, which is always False by default + IS_HASHTAG = nlp.vocab.add_flag(lambda text: False) + + def merge_hashtag(matcher, doc, i, matches): + match_id, start, end = matches[i] + span = doc[start : end] + span.merge() # merge hashtag + span.set_flag(IS_HASHTAG, True) # set IS_HASHTAG to True + +p + | To process a stream of social media posts, we can use + | #[+api("language#pipe") #[code Language.pipe()]], which will return a + | stream of #[code Doc] objects that we can pass to + | #[+api("matcher#pipe") #[code Matcher.pipe()]]. + ++code. + docs = nlp.pipe(LOTS_OF_TWEETS) + matches = matcher.pipe(docs) From 086a06e7d750da5852a447effdb32a376bd86ec7 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 27 May 2017 20:01:46 +0200 Subject: [PATCH 05/25] Fix CLI docstrings and add command as first argument Workaround for Plac --- spacy/__init__.py | 6 +++++- spacy/cli/convert.py | 5 +++-- spacy/cli/download.py | 7 ++++--- spacy/cli/info.py | 2 +- spacy/cli/link.py | 5 +++-- spacy/cli/package.py | 5 +++-- spacy/cli/train.py | 6 ++++-- 7 files changed, 23 insertions(+), 13 deletions(-) diff --git a/spacy/__init__.py b/spacy/__init__.py index 8dc0937f5..6beb7955e 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -4,7 +4,7 @@ from __future__ import unicode_literals import importlib from .compat import basestring_ -from .cli.info import info +from .cli.info import info as cli_info from .glossary import explain from .deprecated import resolve_load_name from . import util @@ -20,3 +20,7 @@ def load(name, **overrides): overrides['meta'] = meta overrides['path'] = model_path return cls(**overrides) + + +def info(model=None, markdown=False): + return cli_info(None, model, markdown) diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py index e95ffd08b..82b39bba2 100644 --- a/spacy/cli/convert.py +++ b/spacy/cli/convert.py @@ -24,8 +24,9 @@ CONVERTERS = { n_sents=("Number of sentences per doc", "option", "n", float), morphology=("Enable appending morphology to tags", "flag", "m", bool) ) -def convert(_, input_file, output_dir, n_sents, morphology): - """Convert files into JSON format for use with train command and other +def convert(cmd, input_file, output_dir, n_sents, morphology): + """ + Convert files into JSON format for use with train command and other experiment management functions. """ input_path = Path(input_file) diff --git a/spacy/cli/download.py b/spacy/cli/download.py index fdcacb891..b6e5549da 100644 --- a/spacy/cli/download.py +++ b/spacy/cli/download.py @@ -17,8 +17,9 @@ from .. import about direct=("force direct download. 
Needs model name with version and won't " "perform compatibility check", "flag", "d", bool) ) -def download(model, direct=False): - """Download compatible model from default download path using pip. Model +def download(cmd, model, direct=False): + """ + Download compatible model from default download path using pip. Model can be shortcut, model name or, if --direct flag is set, full model name with version. """ @@ -31,7 +32,7 @@ def download(model, direct=False): version = get_version(model_name, compatibility) download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model_name, v=version)) try: - link(model_name, model, force=True) + link(None, model_name, model, force=True) except: # Dirty, but since spacy.download and the auto-linking is mostly # a convenience wrapper, it's best to show a success message and diff --git a/spacy/cli/info.py b/spacy/cli/info.py index 6f7467521..75aac10c7 100644 --- a/spacy/cli/info.py +++ b/spacy/cli/info.py @@ -14,7 +14,7 @@ from .. import util model=("optional: shortcut link of model", "positional", None, str), markdown=("generate Markdown for GitHub issues", "flag", "md", str) ) -def info(model=None, markdown=False): +def info(cmd, model=None, markdown=False): """Print info about spaCy installation. If a model shortcut link is speficied as an argument, print model information. Flag --markdown prints details in Markdown for easy copy-pasting to GitHub issues. diff --git a/spacy/cli/link.py b/spacy/cli/link.py index 1feef8bce..9aecdabfe 100644 --- a/spacy/cli/link.py +++ b/spacy/cli/link.py @@ -14,8 +14,9 @@ from .. import util link_name=("name of shortuct link to create", "positional", None, str), force=("force overwriting of existing link", "flag", "f", bool) ) -def link(origin, link_name, force=False): - """Create a symlink for models within the spacy/data directory. Accepts +def link(cmd, origin, link_name, force=False): + """ + Create a symlink for models within the spacy/data directory. Accepts either the name of a pip package, or the local path to the model data directory. Linking models allows loading them via spacy.load(link_name). """ diff --git a/spacy/cli/package.py b/spacy/cli/package.py index 9acd0a2fa..1c3128d99 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -18,8 +18,9 @@ from .. import about meta=("path to meta.json", "option", "m", str), force=("force overwriting of existing folder in output directory", "flag", "f", bool) ) -def package(input_dir, output_dir, meta, force): - """Generate Python package for model data, including meta and required +def package(cmd, input_dir, output_dir, meta=None, force=False): + """ + Generate Python package for model data, including meta and required installation files. A new directory will be created in the specified output directory, and model data will be copied over. """ diff --git a/spacy/cli/train.py b/spacy/cli/train.py index ed146cb24..25b53e49d 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -32,9 +32,11 @@ from .. import displacy no_parser=("Don't train parser", "flag", "P", bool), no_entities=("Don't train NER", "flag", "N", bool) ) -def train(_, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0, +def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0, use_gpu=False, no_tagger=False, no_parser=False, no_entities=False): - """Train a model. Expects data in spaCy's JSON format.""" + """ + Train a model. Expects data in spaCy's JSON format. 
+ """ n_sents = n_sents or None output_path = util.ensure_path(output_dir) train_path = util.ensure_path(train_data) From 1203959625954fc1164485883ff49e9b5f3b43c3 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 27 May 2017 20:02:01 +0200 Subject: [PATCH 06/25] Add pipeline setting to meta.json generator --- spacy/cli/package.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/spacy/cli/package.py b/spacy/cli/package.py index 1c3128d99..e78a4eeb4 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -43,7 +43,7 @@ def package(cmd, input_dir, output_dir, meta=None, force=False): meta = util.read_json(meta_path) else: meta = generate_meta() - validate_meta(meta, ['lang', 'name', 'version']) + meta = validate_meta(meta, ['lang', 'name', 'version']) model_name = meta['lang'] + '_' + meta['name'] model_name_v = model_name + '-' + meta['version'] @@ -86,20 +86,32 @@ def generate_meta(): ('email', 'Author email', False), ('url', 'Author website', False), ('license', 'License', 'CC BY-NC 3.0')] - prints("Enter the package settings for your model.", title="Generating meta.json") meta = {} for setting, desc, default in settings: response = util.get_raw_input(desc, default) meta[setting] = default if response == '' and default else response + meta['pipeline'] = generate_pipeline() return meta +def generate_pipeline(): + prints("If set to 'True', the default pipeline is used. If set to 'False', " + "the pipeline will be disabled. Components should be specified as a " + "comma-separated list of component names, e.g. vectorizer, tagger, " + "parser, ner. For more information, see the docs on processing pipelines.", + title="Enter your model's pipeline components") + pipeline = util.get_raw_input("Pipeline components", True) + replace = {'True': True, 'False': False} + return replace[pipeline] if pipeline in replace else pipeline.split(', ') + + def validate_meta(meta, keys): for key in keys: if key not in meta or meta[key] == '': prints("This setting is required to build your package.", title='No "%s" setting found in meta.json' % key, exits=1) + return meta def get_template(filepath): From ae11c8d60f07f5f9257a347f51b72d93aaea3699 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 27 May 2017 20:02:20 +0200 Subject: [PATCH 07/25] Add emoji sentiment to lightning tour matcher example --- website/docs/usage/lightning-tour.jade | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/website/docs/usage/lightning-tour.jade b/website/docs/usage/lightning-tour.jade index eefb7a11a..7de486070 100644 --- a/website/docs/usage/lightning-tour.jade +++ b/website/docs/usage/lightning-tour.jade @@ -149,9 +149,14 @@ p nlp = spacy.load('en') matcher = Matcher(nlp.vocab) - # match "Google I/O" or "Google i/o" - pattern = [{'ORTH': 'Google'}, {'UPPER': 'I'}, {'ORTH': '/'}, {'UPPER': 'O'}] - matcher.add('GoogleIO', None, pattern) + + def set_sentiment(matcher, doc, i, matches): + doc.sentiment += 0.1 + + pattern1 = [{'ORTH': 'Google'}, {'UPPER': 'I'}, {'ORTH': '/'}, {'UPPER': 'O'}] + pattern2 = [[{'ORTH': emoji, 'OP': '+'}] for emoji in ['πŸ˜€', 'πŸ˜‚', '🀣', '😍']] + matcher.add('GoogleIO', None, pattern1) # match "Google I/O" or "Google i/o" + matcher.add('HAPPY', set_sentiment, pattern2) # match one or more happy emoji matches = nlp(LOTS_OF TEXT) +infobox From 7cc9c3e9a6f28422485eb2a054d12850481aeb71 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 27 May 2017 15:44:42 -0500 Subject: [PATCH 08/25] Fix convert CLI --- spacy/cli/convert.py | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py index e95ffd08b..ac608a64a 100644 --- a/spacy/cli/convert.py +++ b/spacy/cli/convert.py @@ -39,4 +39,4 @@ def convert(_, input_file, output_dir, n_sents, morphology): prints("Can't find converter for %s" % input_path.parts[-1], title="Unknown format", exits=1) CONVERTERS[file_ext](input_path, output_path, - n_sents=n_sents, morphology=morphology) + n_sents=n_sents, use_morphology=morphology) From 34bbad8e0e115e412e857c71d5f4d0b3ab339681 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 27 May 2017 15:46:06 -0500 Subject: [PATCH 09/25] Add __reduce__ methods on parser subclasses. Fixes pickling. --- spacy/pipeline.pyx | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx index 98b79d709..724891c9b 100644 --- a/spacy/pipeline.pyx +++ b/spacy/pipeline.pyx @@ -335,6 +335,9 @@ cdef class NeuralDependencyParser(NeuralParser): name = 'parser' TransitionSystem = ArcEager + def __reduce__(self): + return (NeuralDependencyParser, (self.vocab, self.moves, self.model), None, None) + cdef class NeuralEntityRecognizer(NeuralParser): name = 'entity' @@ -342,6 +345,10 @@ cdef class NeuralEntityRecognizer(NeuralParser): nr_feature = 6 + def __reduce__(self): + return (NeuralEntityRecognizer, (self.vocab, self.moves, self.model), None, None) + + cdef class BeamDependencyParser(BeamParser): TransitionSystem = ArcEager From 5e4312feede7c2511b4d61a5723077c1b16c142d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 27 May 2017 15:47:02 -0500 Subject: [PATCH 10/25] Evaluate loaded class, to ensure save/load works --- spacy/cli/train.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index b25cdcbd5..7bbda5a47 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -84,11 +84,11 @@ def train(_, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0, pbar.update(len(docs)) with nlp.use_params(optimizer.averages): - scorer = nlp.evaluate(corpus.dev_docs(nlp, gold_preproc=False)) with (output_path / ('model%d.pickle' % i)).open('wb') as file_: dill.dump(nlp, file_, -1) - - + with (output_path / ('model%d.pickle' % i)).open('rb') as file_: + nlp_loaded = dill.load(file_) + scorer = nlp_loaded.evaluate(corpus.dev_docs(nlp_loaded, gold_preproc=False)) print_progress(i, losses, scorer.scores) finally: print("Saving model...") From 655ca58c16880c50661039c4db7181b4700cd0e5 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 27 May 2017 15:49:37 -0500 Subject: [PATCH 11/25] Clarifying change to StateC.clone --- spacy/syntax/_state.pxd | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/syntax/_state.pxd b/spacy/syntax/_state.pxd index 4b2b47270..0b29412bf 100644 --- a/spacy/syntax/_state.pxd +++ b/spacy/syntax/_state.pxd @@ -335,17 +335,18 @@ cdef cppclass StateC: this._break = this._b_i void clone(const StateC* src) nogil: + this.length = src.length memcpy(this._sent, src._sent, this.length * sizeof(TokenC)) memcpy(this._stack, src._stack, this.length * sizeof(int)) memcpy(this._buffer, src._buffer, this.length * sizeof(int)) memcpy(this._ents, src._ents, this.length * sizeof(Entity)) memcpy(this.shifted, src.shifted, this.length * sizeof(this.shifted[0])) - this.length = src.length this._b_i = src._b_i this._s_i = src._s_i this._e_i = src._e_i this._break = src._break this.offset = src.offset + this._empty_token = src._empty_token void fast_forward() nogil: # space 
token attachement policy: From 99316fa631efd86a5ab5d68b11654c7366ece650 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 27 May 2017 15:50:21 -0500 Subject: [PATCH 12/25] Use ordered dict to specify actions --- spacy/syntax/arc_eager.pyx | 14 ++++++++------ spacy/syntax/ner.pyx | 31 ++++++++++++++++++++++--------- 2 files changed, 30 insertions(+), 15 deletions(-) diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index f7c1c7922..2e424c1a9 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -9,6 +9,7 @@ import ctypes from libc.stdint cimport uint32_t from libc.string cimport memcpy from cymem.cymem cimport Pool +from collections import OrderedDict from .stateclass cimport StateClass from ._state cimport StateC, is_space_token @@ -312,12 +313,13 @@ cdef class ArcEager(TransitionSystem): @classmethod def get_actions(cls, **kwargs): actions = kwargs.get('actions', - { - SHIFT: [''], - REDUCE: [''], - RIGHT: [], - LEFT: [], - BREAK: ['ROOT']}) + OrderedDict(( + (SHIFT, ['']), + (REDUCE, ['']), + (RIGHT, []), + (LEFT, []), + (BREAK, ['ROOT']) + ))) seen_actions = set() for label in kwargs.get('left_labels', []): if label.upper() != 'ROOT': diff --git a/spacy/syntax/ner.pyx b/spacy/syntax/ner.pyx index af42eded4..f8db0a433 100644 --- a/spacy/syntax/ner.pyx +++ b/spacy/syntax/ner.pyx @@ -2,6 +2,7 @@ from __future__ import unicode_literals from thinc.typedefs cimport weight_t +from collections import OrderedDict from .stateclass cimport StateClass from ._state cimport StateC @@ -51,17 +52,29 @@ cdef bint _entity_is_sunk(StateClass st, Transition* golds) nogil: cdef class BiluoPushDown(TransitionSystem): + def __init__(self, *args, **kwargs): + TransitionSystem.__init__(self, *args, **kwargs) + + def __reduce__(self): + labels_by_action = OrderedDict() + cdef Transition t + for trans in self.c[:self.n_moves]: + label_str = self.strings[trans.label] + labels_by_action.setdefault(trans.move, []).append(label_str) + return (BiluoPushDown, (self.strings, labels_by_action), + None, None) + @classmethod def get_actions(cls, **kwargs): actions = kwargs.get('actions', - { - MISSING: [''], - BEGIN: [], - IN: [], - LAST: [], - UNIT: [], - OUT: [''] - }) + OrderedDict(( + (MISSING, ['']), + (BEGIN, []), + (IN, []), + (LAST, []), + (UNIT, []), + (OUT, ['']) + ))) seen_entities = set() for entity_type in kwargs.get('entity_types', []): if entity_type in seen_entities: @@ -90,7 +103,7 @@ cdef class BiluoPushDown(TransitionSystem): def move_name(self, int move, int label): if move == OUT: return 'O' - elif move == 'MISSING': + elif move == MISSING: return 'M' else: return MOVE_NAMES[move] + '-' + self.strings[label] From 8de9829f094fbf1ed418c527236218667baa1989 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 27 May 2017 15:50:40 -0500 Subject: [PATCH 13/25] Don't overwrite model in initialization, when loading --- spacy/_ml.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/spacy/_ml.py b/spacy/_ml.py index f589704a6..ac7849bbb 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -19,6 +19,8 @@ import numpy def _init_for_precomputed(W, ops): + if (W**2).sum() != 0.: + return reshaped = W.reshape((W.shape[1], W.shape[0] * W.shape[2])) ops.xavier_uniform_init(reshaped) W[:] = reshaped.reshape(W.shape) @@ -247,6 +249,7 @@ def doc2feats(cols=None): model.cols = cols return model + def print_shape(prefix): def forward(X, drop=0.): return X, lambda dX, **kwargs: dX From 3eea5383a1adc179ed7d7feb2c957b1d78f0171b Mon Sep 17 00:00:00 2001 From: Matthew 
Honnibal Date: Sat, 27 May 2017 15:51:55 -0500 Subject: [PATCH 14/25] Add move_names property to parser --- spacy/syntax/nn_parser.pyx | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 35966d536..6db6e5ae1 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -518,6 +518,14 @@ cdef class Parser: xp.add.at(d_tokvecs, ids, d_state_features * active_feats) + @property + def move_names(self): + names = [] + for i in range(self.moves.n_moves): + name = self.moves.move_name(self.moves.c[i].move, self.moves.c[i].label) + names.append(name) + return names + def get_batch_model(self, batch_size, tokvecs, stream, dropout): lower, upper = self.model state2vec = precompute_hiddens(batch_size, tokvecs, From 7ebd26b8aae34464c3b02cbc9b497bfe0ebfa7d2 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 27 May 2017 15:52:20 -0500 Subject: [PATCH 15/25] Use ordered dict to specify transitions --- spacy/syntax/transition_system.pyx | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/spacy/syntax/transition_system.pyx b/spacy/syntax/transition_system.pyx index 07102aeb0..211b2c950 100644 --- a/spacy/syntax/transition_system.pyx +++ b/spacy/syntax/transition_system.pyx @@ -5,7 +5,7 @@ from __future__ import unicode_literals from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF from cymem.cymem cimport Pool from thinc.typedefs cimport weight_t -from collections import defaultdict +from collections import defaultdict, OrderedDict from ..structs cimport TokenC from .stateclass cimport StateClass @@ -26,7 +26,7 @@ cdef void* _init_state(Pool mem, int length, void* tokens) except NULL: cdef class TransitionSystem: - def __init__(self, StringStore string_table, dict labels_by_action): + def __init__(self, StringStore string_table, labels_by_action): self.mem = Pool() self.strings = string_table self.n_moves = 0 @@ -34,14 +34,14 @@ cdef class TransitionSystem: self.c = self.mem.alloc(self._size, sizeof(Transition)) - for action, label_strs in sorted(labels_by_action.items()): + for action, label_strs in labels_by_action.items(): for label_str in label_strs: self.add_action(int(action), label_str) self.root_label = self.strings['ROOT'] self.init_beam_state = _init_state def __reduce__(self): - labels_by_action = {} + labels_by_action = OrderedDict() cdef Transition t for trans in self.c[:self.n_moves]: label_str = self.strings[trans.label] @@ -77,6 +77,11 @@ cdef class TransitionSystem: history.append(i) action.do(state.c, action.label) break + else: + print(gold.words) + print(gold.ner) + print(history) + raise ValueError("Could not find gold move") return history cdef int initialize_state(self, StateC* state) nogil: From b03fb2d7b068f4752fda7cb5783d3c08dd0adb63 Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 28 May 2017 00:03:16 +0200 Subject: [PATCH 16/25] Update 101 and usage docs --- website/assets/img/docs/pipeline.svg | 2 +- website/docs/usage/_spacy-101/_vocab-stringstore.jade | 4 +++- website/docs/usage/lightning-tour.jade | 2 ++ website/docs/usage/rule-based-matching.jade | 2 +- 4 files changed, 7 insertions(+), 3 deletions(-) diff --git a/website/assets/img/docs/pipeline.svg b/website/assets/img/docs/pipeline.svg index e42c2362f..2ff00d787 100644 --- a/website/assets/img/docs/pipeline.svg +++ b/website/assets/img/docs/pipeline.svg @@ -2,7 +2,7 @@ diff --git a/website/docs/usage/_spacy-101/_vocab-stringstore.jade b/website/docs/usage/_spacy-101/_vocab-stringstore.jade index 
3f551c9e1..dd300b5b9 100644 --- a/website/docs/usage/_spacy-101/_vocab-stringstore.jade +++ b/website/docs/usage/_spacy-101/_vocab-stringstore.jade @@ -89,4 +89,6 @@ p p | Even though both #[code Doc] objects contain the same words, the internal - | integer IDs are very different. + | integer IDs are very different. The same applies for all other strings, + | like the annotation scheme. To avoid mismatched IDs, spaCy will always + | export the vocab if you save a #[code Doc] or #[code nlp] object. diff --git a/website/docs/usage/lightning-tour.jade b/website/docs/usage/lightning-tour.jade index 7de486070..8cf651be0 100644 --- a/website/docs/usage/lightning-tour.jade +++ b/website/docs/usage/lightning-tour.jade @@ -139,6 +139,8 @@ p new_doc = Doc(Vocab()).from_disk('/moby_dick.bin') +infobox + | #[strong API:] #[+api("language") #[code Language]], + | #[+api("doc") #[code Doc]] | #[strong Usage:] #[+a("/docs/usage/saving-loading") Saving and loading] +h(2, "rule-matcher") Match text with token rules diff --git a/website/docs/usage/rule-based-matching.jade b/website/docs/usage/rule-based-matching.jade index fde6da6ef..1fd398ad9 100644 --- a/website/docs/usage/rule-based-matching.jade +++ b/website/docs/usage/rule-based-matching.jade @@ -345,7 +345,7 @@ p | account and check the #[code subtree] for intensifiers like "very", to | increase the sentiment score. At some point, you might also want to train | a sentiment model. However, the approach described in this example is - | very useful for #[strong bootstrapping rules to gather training data]. + | very useful for #[strong bootstrapping rules to collect training data]. | It's also an incredibly fast way to gather first insights into your data | – with about 1 million tweets, you'd be looking at a processing time of | #[strong under 1 minute]. From db116cbedabccb65a100898a3d285e1c2ee804a6 Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 28 May 2017 00:03:31 +0200 Subject: [PATCH 17/25] Update tokenization 101 and add illustration --- website/assets/img/docs/tokenization.svg | 123 ++++++++++++++++++ .../docs/usage/_spacy-101/_tokenization.jade | 44 +++++++ website/docs/usage/spacy-101.jade | 7 +- 3 files changed, 171 insertions(+), 3 deletions(-) create mode 100644 website/assets/img/docs/tokenization.svg diff --git a/website/assets/img/docs/tokenization.svg b/website/assets/img/docs/tokenization.svg new file mode 100644 index 000000000..cc185a3a7 --- /dev/null +++ b/website/assets/img/docs/tokenization.svg @@ -0,0 +1,123 @@ + + + + + β€œLet’s + + + go + + + to + + + N.Y.!” + + + β€œ + + + Let’s + + + go + + + to + + + N.Y.!” + + β€œ + + + Let + + + go + + + to + + + N.Y.!” + + + ’s + + + β€œ + + + Let + + + go + + + to + + + N.Y.! + + + ’s + + + ” + + + β€œ + + + Let + + + go + + + to + + + N.Y. + + + ’s + + + ” + + + ! + + β€œ + + Let + + go + + to + + N.Y. + + ’s + + ” + + ! + + EXCEPTION + + PREFIX + + SUFFIX + + SUFFIX + + EXCEPTION + + DONE + diff --git a/website/docs/usage/_spacy-101/_tokenization.jade b/website/docs/usage/_spacy-101/_tokenization.jade index 64e3f5881..95a9cc520 100644 --- a/website/docs/usage/_spacy-101/_tokenization.jade +++ b/website/docs/usage/_spacy-101/_tokenization.jade @@ -16,3 +16,47 @@ p +row for cell in ["Apple", "is", "looking", "at", "buying", "U.K.", "startup", "for", "$", "1", "billion"] +cell=cell + +p + | Fist, the raw text is split on whitespace characters, similar to + | #[code text.split(' ')]. Then, the tokenizer processes the text from + | left to right. 
On each substring, it performs two checks: + ++list("numbers") + +item + | #[strong Does the substring match a tokenizer exception rule?] For + | example, "don't" does not contain whitespace, but should be split + | into two tokens, "do" and "n't", while "U.K." should always + | remain one token. + +item + | #[strong Can a prefix, suffix or infixes be split off?]. For example + | punctuation like commas, periods, hyphens or quotes. + +p + | If there's a match, the rule is applied and the tokenizer continues its + | loop, starting with the newly split substrings. This way, spaCy can split + | #[strong complex, nested tokens] like combinations of abbreviations and + | multiple punctuation marks. + ++aside + | #[strong Tokenizer exception:] Special-case rule to split a string into + | several tokens or prevent a token from being split when punctuation rules + | are applied.#[br] + | #[strong Prefix:] Character(s) at the beginning, e.g. + | #[code $], #[code (], #[code β€œ], #[code ΒΏ].#[br] + | #[strong Suffix:] Character(s) at the end, e.g. + | #[code km], #[code )], #[code ”], #[code !].#[br] + | #[strong Infix:] Character(s) in between, e.g. + | #[code -], #[code --], #[code /], #[code …].#[br] + ++image + include ../../../assets/img/docs/tokenization.svg + .u-text-right + +button("/assets/img/docs/tokenization.svg", false, "secondary").u-text-tag View large graphic + +p + | While punctuation rules are usually pretty general, tokenizer exceptions + | strongly depend on the specifics of the individual language. This is + | why each #[+a("/docs/api/language-models") available language] has its + | own subclass like #[code English] or #[code German], that loads in lists + | of hard-coded data and exception rules. diff --git a/website/docs/usage/spacy-101.jade b/website/docs/usage/spacy-101.jade index 7c6525004..8b2d0c17e 100644 --- a/website/docs/usage/spacy-101.jade +++ b/website/docs/usage/spacy-101.jade @@ -94,9 +94,10 @@ p include _spacy-101/_tokenization +infobox - | To learn more about how spaCy's tokenizer and its rules work in detail, - | how to #[strong customise] it and how to #[strong add your own tokenizer] - | to a processing pipeline, see the usage guide on + | To learn more about how spaCy's tokenization rules work in detail, + | how to #[strong customise and replace] the default tokenizer and how to + | #[strong add language-specific data], see the usage guides on + | #[+a("/docs/usage/adding-languages") adding languages] and | #[+a("/docs/usage/customizing-tokenizer") customising the tokenizer]. +h(3, "annotations-pos-deps") Part-of-speech tags and dependencies From c8543c823792710dae5b0c6d77dc31c53fec177c Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 28 May 2017 00:04:04 +0200 Subject: [PATCH 18/25] Fix formatting and docstrings and remove deprecated function --- spacy/util.py | 22 +++++++++------------- spacy/vocab.pyx | 2 -- 2 files changed, 9 insertions(+), 15 deletions(-) diff --git a/spacy/util.py b/spacy/util.py index e42bde810..a30b35a06 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -177,10 +177,13 @@ def get_async(stream, numpy_array): def itershuffle(iterable, bufsize=1000): """Shuffle an iterator. This works by holding `bufsize` items back - and yielding them sometime later. Obviously, this is not unbiased -- + and yielding them sometime later. Obviously, this is not unbiased – but should be good enough for batching. Larger bufsize means less bias. - From https://gist.github.com/andres-erbsen/1307752 + + iterable (iterable): Iterator to shuffle. 
+ bufsize (int): Items to hold back. + YIELDS (iterable): The shuffled iterator. """ iterable = iter(iterable) buf = [] @@ -315,17 +318,16 @@ def normalize_slice(length, start, stop, step=None): def compounding(start, stop, compound): - '''Yield an infinite series of compounding values. Each time the + """Yield an infinite series of compounding values. Each time the generator is called, a value is produced by multiplying the previous value by the compound rate. - EXAMPLE - + EXAMPLE: >>> sizes = compounding(1., 10., 1.5) >>> assert next(sizes) == 1. >>> assert next(sizes) == 1 * 1.5 >>> assert next(sizes) == 1.5 * 1.5 - ''' + """ def clip(value): return max(value, stop) if (start>stop) else min(value, stop) curr = float(start) @@ -335,7 +337,7 @@ def compounding(start, stop, compound): def decaying(start, stop, decay): - '''Yield an infinite series of linearly decaying values.''' + """Yield an infinite series of linearly decaying values.""" def clip(value): return max(value, stop) if (start>stop) else min(value, stop) nr_upd = 1. @@ -344,12 +346,6 @@ def decaying(start, stop, decay): nr_upd += 1 -def check_renamed_kwargs(renamed, kwargs): - for old, new in renamed.items(): - if old in kwargs: - raise TypeError("Keyword argument %s now renamed to %s" % (old, new)) - - def read_json(location): """Open and load JSON from file. diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index d7d27a3e4..55fde0123 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -53,8 +53,6 @@ cdef class Vocab: vice versa. RETURNS (Vocab): The newly constructed vocab object. """ - util.check_renamed_kwargs({'get_lex_attr': 'lex_attr_getters'}, deprecated_kwargs) - lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {} tag_map = tag_map if tag_map is not None else {} if lemmatizer in (None, True, False): From c1983621fbe34659b9243b1af603ed9b85495ac6 Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 28 May 2017 00:22:00 +0200 Subject: [PATCH 19/25] Update util functions for model loading --- spacy/__init__.py | 12 +--- spacy/cli/info.py | 10 +++- spacy/cli/link.py | 2 +- spacy/util.py | 111 +++++++++++++++++++++++++------------ website/docs/api/util.jade | 90 ++++++++++++++++-------------- 5 files changed, 132 insertions(+), 93 deletions(-) diff --git a/spacy/__init__.py b/spacy/__init__.py index 6beb7955e..f9e29037f 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -1,9 +1,6 @@ # coding: utf8 from __future__ import unicode_literals -import importlib - -from .compat import basestring_ from .cli.info import info as cli_info from .glossary import explain from .deprecated import resolve_load_name @@ -12,14 +9,7 @@ from . import util def load(name, **overrides): name = resolve_load_name(name, **overrides) - model_path = util.resolve_model_path(name) - meta = util.parse_package_meta(model_path) - if 'lang' not in meta: - raise IOError('No language setting found in model meta.') - cls = util.get_lang_class(meta['lang']) - overrides['meta'] = meta - overrides['path'] = model_path - return cls(**overrides) + return util.load_model(name) def info(model=None, markdown=False): diff --git a/spacy/cli/info.py b/spacy/cli/info.py index 75aac10c7..70f054d84 100644 --- a/spacy/cli/info.py +++ b/spacy/cli/info.py @@ -20,8 +20,14 @@ def info(cmd, model=None, markdown=False): prints details in Markdown for easy copy-pasting to GitHub issues. 
""" if model: - model_path = util.resolve_model_path(model) - meta = util.parse_package_meta(model_path) + if util.is_package(model): + model_path = util.get_package_path(model) + else: + model_path = util.get_data_path() / model + meta_path = model_path / 'meta.json' + if not meta_path.is_file(): + prints(meta_path, title="Can't find model meta.json", exits=1) + meta = read_json(meta_path) if model_path.resolve() != model_path: meta['link'] = path2str(model_path) meta['source'] = path2str(model_path.resolve()) diff --git a/spacy/cli/link.py b/spacy/cli/link.py index 9aecdabfe..66824c042 100644 --- a/spacy/cli/link.py +++ b/spacy/cli/link.py @@ -21,7 +21,7 @@ def link(cmd, origin, link_name, force=False): directory. Linking models allows loading them via spacy.load(link_name). """ if util.is_package(origin): - model_path = util.get_model_package_path(origin) + model_path = util.get_package_path(model) else: model_path = Path(origin) if not model_path.exists(): diff --git a/spacy/util.py b/spacy/util.py index a30b35a06..25fe198f4 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -78,27 +78,86 @@ def ensure_path(path): return path -def resolve_model_path(name): - """Resolve a model name or string to a model path. +def load_model(name): + """Load a model from a shortcut link, package or data path. name (unicode): Package name, shortcut link or model path. - RETURNS (Path): Path to model data directory. + RETURNS (Language): `Language` class with the loaded model. """ data_path = get_data_path() if not data_path or not data_path.exists(): raise IOError("Can't find spaCy data path: %s" % path2str(data_path)) if isinstance(name, basestring_): - if (data_path / name).exists(): # in data dir or shortcut link - return (data_path / name) - if is_package(name): # installed as a package - return get_model_package_path(name) - if Path(name).exists(): # path to model - return Path(name) - elif hasattr(name, 'exists'): # Path or Path-like object - return name + if (data_path / name).exists(): # in data dir or shortcut + return load_model_from_path(data_path / name) + if is_package(name): # installed as package + return load_model_from_pkg(name) + if Path(name).exists(): # path to model data directory + return load_data_from_path(Path(name)) + elif hasattr(name, 'exists'): # Path or Path-like to model data + return load_data_from_path(name) raise IOError("Can't find model '%s'" % name) +def load_model_from_init_py(init_file): + """Helper function to use in the `load()` method of a model package's + __init__.py. + + init_file (unicode): Path to model's __init__.py, i.e. `__file__`. + RETURNS (Language): `Language` class with loaded model. + """ + model_path = Path(init_file).parent + return load_data_from_path(model_path, package=True) + + +def load_model_from_path(model_path): + """Import and load a model package from its file path. + + path (unicode or Path): Path to package directory. + RETURNS (Language): `Language` class with loaded model. + """ + model_path = ensure_path(model_path) + spec = importlib.util.spec_from_file_location('model', model_path) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + return module.load() + + +def load_model_from_pkg(name): + """Import and load a model package. + + name (unicode): Name of model package installed via pip. + RETURNS (Language): `Language` class with loaded model. 
+ """ + module = importlib.import_module(name) + return module.load() + + +def load_data_from_path(model_path, package=False): + """Initialie a `Language` class with a loaded model from a model data path. + + model_path (unicode or Path): Path to model data directory. + package (bool): Does the path point to the parent package directory? + RETURNS (Language): `Language` class with loaded model. + """ + model_path = ensure_path(model_path) + meta_path = model_path / 'meta.json' + if not meta_path.is_file(): + raise IOError("Could not read meta.json from %s" % location) + meta = read_json(location) + for setting in ['lang', 'name', 'version']: + if setting not in meta: + raise IOError('No %s setting found in model meta.json' % setting) + if package: + model_data_path = '%s_%s-%s' % (meta['lang'], meta['name'], meta['version']) + model_path = model_path / model_data_path + if not model_path.exists(): + raise ValueError("Can't find model directory: %s" % path2str(model_path)) + cls = get_lang_class(meta['lang']) + nlp = cls(pipeline=meta.get('pipeline', True)) + return nlp.from_disk(model_path) + + def is_package(name): """Check if string maps to a package installed via pip. @@ -112,36 +171,16 @@ def is_package(name): return False -def get_model_package_path(package_name): - """Get path to a model package installed via pip. +def get_package_path(name): + """Get the path to an installed package. - package_name (unicode): Name of installed package. - RETURNS (Path): Path to model data directory. + name (unicode): Package name. + RETURNS (Path): Path to installed package. """ # Here we're importing the module just to find it. This is worryingly # indirect, but it's otherwise very difficult to find the package. - # Python's installation and import rules are very complicated. pkg = importlib.import_module(package_name) - package_path = Path(pkg.__file__).parent.parent - meta = parse_package_meta(package_path / package_name) - model_name = '%s-%s' % (package_name, meta['version']) - return package_path / package_name / model_name - - -def parse_package_meta(package_path, require=True): - """Check if a meta.json exists in a package and return its contents. - - package_path (Path): Path to model package directory. - require (bool): If True, raise error if no meta.json is found. - RETURNS (dict or None): Model meta.json data or None. - """ - location = package_path / 'meta.json' - if location.is_file(): - return read_json(location) - elif require: - raise IOError("Could not read meta.json from %s" % location) - else: - return None + return Path(pkg.__file__).parent def is_in_jupyter(): diff --git a/website/docs/api/util.jade b/website/docs/api/util.jade index 717abf34a..3e132b7b4 100644 --- a/website/docs/api/util.jade +++ b/website/docs/api/util.jade @@ -1,12 +1,10 @@ -//- πŸ’« DOCS > API > ANNOTATION SPECS +//- πŸ’« DOCS > API > UTIL include ../../_includes/_mixins p | spaCy comes with a small collection of utility functions located in | #[+src(gh("spaCy", "spacy/util.py")) spacy/util.py]. - -+infobox("Important note") | Because utility functions are mostly intended for | #[strong internal use within spaCy], their behaviour may change with | future releases. The functions documented on this page should be safe @@ -74,15 +72,23 @@ p +cell #[code Language] +cell Language class. -+h(2, "resolve_model_path") util.resolve_model_path ++h(2, "load_model") util.load_model +tag function +tag-new(2) -p Resolve a model name or string to a model path. 
+p + | Load a model from a shortcut link, package or data path. If called with a + | shortcut link or package name, spaCy will assume the model is a Python + | package and import and call its #[code load()] method. If called with a + | path, spaCy will assume it's a data directory, read the language and + | pipeline settings from the meta.json and initialise a #[code Language] + | class. The model data will then be loaded in via + | #[+api("language#from_disk") #[code Language.from_disk()]]. +aside-code("Example"). - model_path = util.resolve_model_path('en') - model_path = util.resolve_model_path('/path/to/en') + nlp = util.load_model('en') + nlp = util.load_model('en_core_web_sm') + nlp = util.load_model('/path/to/data') +table(["Name", "Type", "Description"]) +row @@ -92,8 +98,33 @@ p Resolve a model name or string to a model path. +footrow +cell returns - +cell #[code Path] - +cell Path to model data directory. + +cell #[code Language] + +cell #[code Language] class with the loaded model. + ++h(2, "load_model_from_init_py") util.load_model_from_init_py + +tag function + +tag-new(2) + +p + | A helper function to use in the #[code load()] method of a model package's + | #[+src(gh("spacy-dev-resources", "templates/model/en_model_name/__init__.py")) __init__.py]. + ++aside-code("Example"). + from spacy.util import load_model_from_init_py + + def load(): + return load_model_from_init_py(__file__) + ++table(["Name", "Type", "Description"]) + +row + +cell #[code init_file] + +cell unicode + +cell Path to model's __init__.py, i.e. #[code __file__]. + + +footrow + +cell returns + +cell #[code Language] + +cell #[code Language] class with the loaded model. +h(2, "is_package") util.is_package +tag function @@ -117,16 +148,18 @@ p +cell #[code bool] +cell #[code True] if installed package, #[code False] if not. -+h(2, "get_model_package_path") util.get_model_package_path ++h(2, "get_package_path") util.get_package_path +tag function + +tag-new(2) p - | Get path to a #[+a("/docs/usage/models") model package] installed via pip. - | Currently imports the package to find it and parse its meta data. + | Get path to an installed package. Mainly used to resolve the location of + | #[+a("/docs/usage/models") model packages]. Currently imports the package + | to find its path. +aside-code("Example"). - util.get_model_package_path('en_core_web_sm') - # /usr/lib/python3.6/site-packages/en_core_web_sm/en_core_web_sm-1.2.0 + util.get_package_path('en_core_web_sm') + # /usr/lib/python3.6/site-packages/en_core_web_sm +table(["Name", "Type", "Description"]) +row @@ -137,37 +170,8 @@ p +footrow +cell returns +cell #[code Path] - +cell Path to model data directory. - -+h(2, "parse_package_meta") util.parse_package_meta - +tag function - -p - | Check if a #[code meta.json] exists in a model package and return its - | contents. - -+aside-code("Example"). - if util.is_package('en_core_web_sm'): - path = util.get_model_package_path('en_core_web_sm') - meta = util.parse_package_meta(path, require=True) - # {'name': 'core_web_sm', 'lang': 'en', ...} - -+table(["Name", "Type", "Description"]) - +row - +cell #[code package_path] - +cell #[code Path] +cell Path to model package directory. - +row - +cell #[code require] - +cell #[code bool] - +cell If #[code True], raise error if no #[code meta.json] is found. - - +footrow - +cell returns - +cell dict / #[code None] - +cell Model meta data or #[code None]. 
- +h(2, "is_in_jupyter") util.is_in_jupyter +tag function +tag-new(2) From eb703f7656a85fa3a7bf01877edd3b9bfd7f7e7d Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 28 May 2017 00:32:43 +0200 Subject: [PATCH 20/25] Update API docs --- website/docs/api/_data.json | 3 ++- website/docs/api/spacy.jade | 11 ++++++++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/website/docs/api/_data.json b/website/docs/api/_data.json index f6a6a7e31..2af9bca1b 100644 --- a/website/docs/api/_data.json +++ b/website/docs/api/_data.json @@ -158,7 +158,8 @@ "binder": { "title": "Binder", - "tag": "class" + "tag": "class", + "source": "spacy/tokens/binder.pyx" }, "annotation": { diff --git a/website/docs/api/spacy.jade b/website/docs/api/spacy.jade index f2fcfde2c..a45307378 100644 --- a/website/docs/api/spacy.jade +++ b/website/docs/api/spacy.jade @@ -11,8 +11,13 @@ p | the name of an installed | #[+a("/docs/usage/saving-loading#generating") model package], a unicode | path or a #[code Path]-like object. spaCy will try resolving the load - | argument in this order. The #[code Language] class to initialise will be - | determined based on the model's settings. + | argument in this order. If a model is loaded from a shortcut link or + | package name, spaCy will assume it's a Python package and import it and + | call the model's own #[code load()] method. If a model is loaded from a + | path, spaCy will assume it's a data directory, read the language and + | pipeline settings off the meta.json and initialise the #[code Language] + | class. The data will be loaded in via + | #[+api("language#from_disk") #[code Language.from_disk()]]. +aside-code("Example"). nlp = spacy.load('en') # shortcut link @@ -20,7 +25,7 @@ p nlp = spacy.load('/path/to/en') # unicode path nlp = spacy.load(Path('/path/to/en')) # pathlib Path - nlp = spacy.load('en', disable['parser', 'tagger']) + nlp = spacy.load('en', disable=['parser', 'tagger']) +table(["Name", "Type", "Description"]) +row From 01a7b10319cf8e73a0c88faf8de8f8ecb1426dfa Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 28 May 2017 00:32:54 +0200 Subject: [PATCH 21/25] Add fallback fonts to illustrations --- website/assets/img/docs/architecture.svg | 8 ++++---- website/assets/img/docs/language_data.svg | 6 +++--- website/assets/img/docs/pipeline.svg | 6 +++--- website/assets/img/docs/tokenization.svg | 4 ++-- website/assets/img/docs/vocab_stringstore.svg | 8 ++++---- 5 files changed, 16 insertions(+), 16 deletions(-) diff --git a/website/assets/img/docs/architecture.svg b/website/assets/img/docs/architecture.svg index f586b75eb..c1d12d79b 100644 --- a/website/assets/img/docs/architecture.svg +++ b/website/assets/img/docs/architecture.svg @@ -1,9 +1,9 @@ Language diff --git a/website/assets/img/docs/language_data.svg b/website/assets/img/docs/language_data.svg index b74fffba6..31e1a1b29 100644 --- a/website/assets/img/docs/language_data.svg +++ b/website/assets/img/docs/language_data.svg @@ -1,8 +1,8 @@ diff --git a/website/assets/img/docs/pipeline.svg b/website/assets/img/docs/pipeline.svg index 2ff00d787..8f9dc6dac 100644 --- a/website/assets/img/docs/pipeline.svg +++ b/website/assets/img/docs/pipeline.svg @@ -1,8 +1,8 @@ diff --git a/website/assets/img/docs/tokenization.svg b/website/assets/img/docs/tokenization.svg index cc185a3a7..f5b164725 100644 --- a/website/assets/img/docs/tokenization.svg +++ b/website/assets/img/docs/tokenization.svg @@ -1,7 +1,7 @@ diff --git a/website/assets/img/docs/vocab_stringstore.svg b/website/assets/img/docs/vocab_stringstore.svg 
index f660a8604..644453737 100644 --- a/website/assets/img/docs/vocab_stringstore.svg +++ b/website/assets/img/docs/vocab_stringstore.svg @@ -1,9 +1,9 @@ From 33e332e67ce7163982806dc5b45a97c6de697486 Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 28 May 2017 00:57:59 +0200 Subject: [PATCH 22/25] Remove unused export --- spacy/lang/en/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/lang/en/__init__.py b/spacy/lang/en/__init__.py index 7b7d4e1bb..7e1da789b 100644 --- a/spacy/lang/en/__init__.py +++ b/spacy/lang/en/__init__.py @@ -35,4 +35,4 @@ class English(Language): Defaults = EnglishDefaults -__all__ = ['English', 'EnglishDefaults'] +__all__ = ['English'] From 84189c1cab1f8534597cbdf740a8ba51ac1d086a Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 28 May 2017 00:58:59 +0200 Subject: [PATCH 23/25] Add 'xx' language ID for multi-language support Allows models to specify their language ID as 'xx'. --- spacy/lang/xx/__init__.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 spacy/lang/xx/__init__.py diff --git a/spacy/lang/xx/__init__.py b/spacy/lang/xx/__init__.py new file mode 100644 index 000000000..fef8c9d59 --- /dev/null +++ b/spacy/lang/xx/__init__.py @@ -0,0 +1,26 @@ +# coding: utf8 +from __future__ import unicode_literals + + +from ..tokenizer_exceptions import BASE_EXCEPTIONS +from ...language import Language +from ...attrs import LANG +from ...util import update_exc + + +class MultiLanguageDefaults(Language.Defaults): + lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters[LANG] = lambda text: 'xx' + + tokenizer_exceptions = update_exc(BASE_EXCEPTIONS) + + +class MultiLanguage(Language): + """Language class to be used for models that support multiple languages. + This module allows models to specify their language ID as 'xx'. + """ + lang = 'xx' + Defaults = MultiLanguageDefaults + + +__all__ = ['MultiLanguage'] From eb5a8be9ade339d7c0a9c01e8075c9ee6827f749 Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 28 May 2017 01:15:44 +0200 Subject: [PATCH 24/25] Update language overview and add section on 'xx' lang class --- website/docs/api/language-models.jade | 43 +++++++++++++++++++++++---- 1 file changed, 38 insertions(+), 5 deletions(-) diff --git a/website/docs/api/language-models.jade b/website/docs/api/language-models.jade index 0990de358..74007f228 100644 --- a/website/docs/api/language-models.jade +++ b/website/docs/api/language-models.jade @@ -2,7 +2,10 @@ include ../../_includes/_mixins -p spaCy currently supports the following languages and capabilities: +p + | spaCy currently provides models for the following languages and + | capabilities: + +aside-code("Download language models", "bash"). python -m spacy download en @@ -22,12 +25,16 @@ p spaCy currently supports the following languages and capabilities: +row +cell French #[code fr] - each icon in [ "pro", "pro", "con", "pro", "con", "pro", "pro", "con" ] + each icon in [ "pro", "con", "con", "pro", "con", "pro", "pro", "con" ] +cell.u-text-center #[+procon(icon)] -+h(2, "available") Available models + +row + +cell Spanish #[code es] + each icon in [ "pro", "pro", "con", "pro", "pro", "pro", "pro", "con" ] + +cell.u-text-center #[+procon(icon)] -include ../usage/_models-list +p + +button("/docs/usage/models", true, "primary") See available models +h(2, "alpha-support") Alpha tokenization support @@ -52,9 +59,35 @@ p | #[+a("https://github.com/mocobeta/janome") Janome]. 
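As an aside, not part of the patch: languages with alpha tokenization support can usually be tried out through their base Language subclasses, which provide tokenization only. The import path below follows the spacy/lang layout referenced in the table that follows; the class name and example sentence are assumptions, and Chinese and Japanese additionally need their external tokenizers (Jieba and Janome) installed.

    # Sketch: tokenization-only usage of an alpha-support language (assumed class name).
    from spacy.lang.sv import Swedish

    nlp = Swedish()                        # base Language subclass – no statistical models
    doc = nlp(u'Det hΓ€r Γ€r en mening.')    # placeholder Swedish sentence
    print([token.text for token in doc])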
+table([ "Language", "Code", "Source" ]) - each language, code in { es: "Spanish", it: "Italian", pt: "Portuguese", nl: "Dutch", sv: "Swedish", fi: "Finnish", nb: "Norwegian BokmΓ₯l", da: "Danish", hu: "Hungarian", pl: "Polish", bn: "Bengali", he: "Hebrew", zh: "Chinese", ja: "Japanese" } + each language, code in { it: "Italian", pt: "Portuguese", nl: "Dutch", sv: "Swedish", fi: "Finnish", nb: "Norwegian BokmΓ₯l", da: "Danish", hu: "Hungarian", pl: "Polish", bn: "Bengali", he: "Hebrew", zh: "Chinese", ja: "Japanese" } +row +cell #{language} +cell #[code=code] +cell +src(gh("spaCy", "spacy/lang/" + code)) lang/#{code} + ++h(2, "multi-language") Multi-language support + +tag-new(2) + +p + | As of v2.0, spaCy supports models trained on more than one language. This + | is especially useful for named entity recognition. The language ID used + | for multi-language or language-neutral models is #[code xx]. The + | language class, a generic subclass containing only the base language data, + | can be found in #[+src(gh("spaCy", "spacy/lang/xx")) lang/xx]. + +p + | To load your model with the neutral, multi-language class, simply set + | #[code "language": "xx"] in your + | #[+a("/docs/usage/saving-loading#models-generating") model package]'s + | meta.json. You can also import the class directly, or call + | #[+api("util#get_lang_class") #[code util.get_lang_class()]] for + | lazy-loading. + ++code("Standard import"). + from spacy.lang.xx import MultiLanguage + nlp = MultiLanguage() + ++code("With lazy-loading"). + from spacy.util import get_lang_class + nlp = get_lang_class('xx') From 10d05c2b9274073da0edac0379e3a42d97816992 Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 28 May 2017 01:30:12 +0200 Subject: [PATCH 25/25] Fix typos, wording and formatting --- .../docs/usage/_spacy-101/_similarity.jade | 2 +- .../usage/language-processing-pipeline.jade | 2 +- website/docs/usage/spacy-101.jade | 10 ++- website/docs/usage/v2.jade | 85 +++++++++---------- 4 files changed, 49 insertions(+), 50 deletions(-) diff --git a/website/docs/usage/_spacy-101/_similarity.jade b/website/docs/usage/_spacy-101/_similarity.jade index c99bc9658..6eed1eb7f 100644 --- a/website/docs/usage/_spacy-101/_similarity.jade +++ b/website/docs/usage/_spacy-101/_similarity.jade @@ -5,7 +5,7 @@ p | #[strong how similar they are]. Predicting similarity is useful for | building recommendation systems or flagging duplicates. For example, you | can suggest a user content that's similar to what they're currently - | looking at, or label a support ticket as a duplicate, if it's very + | looking at, or label a support ticket as a duplicate if it's very | similar to an already existing one. p diff --git a/website/docs/usage/language-processing-pipeline.jade b/website/docs/usage/language-processing-pipeline.jade index 1392fc2f8..ffad01ead 100644 --- a/website/docs/usage/language-processing-pipeline.jade +++ b/website/docs/usage/language-processing-pipeline.jade @@ -144,7 +144,7 @@ p +table(["Argument", "Type", "Description"]) +row +cell #[code vocab] - +cell #[coce Vocab] + +cell #[code Vocab] +cell | Shared data between components, including strings, morphology, | vectors etc. diff --git a/website/docs/usage/spacy-101.jade b/website/docs/usage/spacy-101.jade index 8b2d0c17e..6a1f780dc 100644 --- a/website/docs/usage/spacy-101.jade +++ b/website/docs/usage/spacy-101.jade @@ -65,7 +65,7 @@ p | spaCy provides a variety of linguistic annotations to give you insights | into a text's grammatical structure. This includes the word types, | i.e. 
the parts of speech, and how the words are related to each other.
-    | For example, if you're analysing text, it makes a #[em huge] difference
+    | For example, if you're analysing text, it makes a huge difference
     | whether a noun is the subject of a sentence, or the object – or whether
     | "google" is used as a verb, or refers to the website or company in a
     | specific context.
@@ -119,9 +119,11 @@ include _spacy-101/_named-entities
 
 +infobox
     | To learn more about entity recognition in spaCy, how to
-    | #[strong add your own entities] to a document and how to train and update
-    | the entity predictions of a model, see the usage guide on
-    | #[+a("/docs/usage/entity-recognition") named entity recognition].
+    | #[strong add your own entities] to a document and how to
+    | #[strong train and update] the entity predictions of a model, see the
+    | usage guides on
+    | #[+a("/docs/usage/entity-recognition") named entity recognition] and
+    | #[+a("/docs/usage/training-ner") training the named entity recognizer].
 
 +h(2, "vectors-similarity") Word vectors and similarity
     +tag-model("vectors")
diff --git a/website/docs/usage/v2.jade b/website/docs/usage/v2.jade
index 23b234c43..25aae8706 100644
--- a/website/docs/usage/v2.jade
+++ b/website/docs/usage/v2.jade
@@ -20,19 +20,18 @@ p
         nlp = Language(pipeline=['my_factory', mycomponent])
 
 p
-    | It's now much easier to customise the pipeline with your own components.
-    | Components are functions that receive a #[code Doc] object, modify and
-    | return it. If your component is stateful, you'll want to create a new one
-    | for each pipeline. You can do that by defining and registering a factory
-    | which receives the shared #[code Vocab] object and returns a component.
-
-p
-    | spaCy's default components – the vectorizer, tagger, parser and entity
-    | recognizer, can be added to your pipeline by using their string IDs.
-    | This way, you won't have to worry about finding and implementing them –
-    | to use the default tagger, simply add #[code "tagger"] to the pipeline,
+    | It's now much easier to #[strong customise the pipeline] with your own
+    | components, functions that receive a #[code Doc] object, modify and
+    | return it. If your component is stateful, you can define and register a
+    | factory which receives the shared #[code Vocab] object and returns a
+    | component. spaCy's default components can be added to your pipeline by
+    | using their string IDs. This way, you won't have to worry about finding
+    | and implementing them – simply add #[code "tagger"] to the pipeline,
     | and spaCy will know what to do.
 
++image
+    include ../../assets/img/docs/pipeline.svg
+
 +infobox
     | #[strong API:] #[+api("language") #[code Language]]
     | #[strong Usage:] #[+a("/docs/usage/language-processing-pipeline") Processing text]
@@ -96,11 +95,10 @@ p
     | #[code Language] class, or load a model that initialises one. This allows
     | languages to contain more custom data, e.g. lemmatizer lookup tables, or
     | complex regular expressions. The language data has also been tidied up
-    | and simplified. It's now also possible to overwrite the functions that
-    | compute lexical attributes like #[code like_num], and supply
-    | language-specific syntax iterators, e.g. to determine noun chunks. spaCy
-    | now also supports simple lookup-based lemmatization. The data is stored
-    | in a dictionary mapping a string to its lemma.
+    | and simplified. spaCy now also supports simple lookup-based lemmatization.
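To make the custom-pipeline paragraph above concrete, here is a minimal sketch – an editorial illustration, not part of the patch, with invented names – of the two kinds of extension it describes: a plain component that receives a Doc, modifies it and returns it, and a factory that receives the shared Vocab and returns a component.

    # Hypothetical names: 'print_length' and 'my_factory' are not part of spaCy.
    def print_length(doc):
        # A pipeline component: take the Doc, do some work, return the Doc.
        print("Doc has %d tokens" % len(doc))
        return doc

    def my_factory(vocab):
        # A factory: receive the shared Vocab and return a (possibly stateful) component.
        def component(doc):
            # state derived from vocab could be used here
            return doc
        return component

How components and factories are registered and added to a pipeline is covered in the processing-pipelines usage guide linked from the infobox above.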
+ ++image + include ../../assets/img/docs/language_data.svg +infobox | #[strong API:] #[+api("language") #[code Language]] @@ -111,13 +109,10 @@ p +aside-code("Example"). from spacy.matcher import Matcher - from spacy.attrs import LOWER, IS_PUNCT matcher = Matcher(nlp.vocab) - matcher.add('HelloWorld', None, - [{LOWER: 'hello'}, {IS_PUNCT: True}, {LOWER: 'world'}], - [{LOWER: 'hello'}, {LOWER: 'world'}]) + matcher.add('HEARTS', None, [{'ORTH': '❀️', 'OP': '+'}]) assert len(matcher) == 1 - assert 'HelloWorld' in matcher + assert 'HEARTS' in matcher p | Patterns can now be added to the matcher by calling @@ -157,28 +152,8 @@ p +cell #[+api("language#to_disk") #[code Language.to_disk]] +row - +cell #[code Tokenizer.load] - +cell - | #[+api("tokenizer#from_disk") #[code Tokenizer.from_disk]] - | #[+api("tokenizer#from_bytes") #[code Tokenizer.from_bytes]] - - +row - +cell #[code Tagger.load] - +cell - | #[+api("tagger#from_disk") #[code Tagger.from_disk]] - | #[+api("tagger#from_bytes") #[code Tagger.from_bytes]] - - +row - +cell #[code DependencyParser.load] - +cell - | #[+api("dependencyparser#from_disk") #[code DependencyParser.from_disk]] - | #[+api("dependencyparser#from_bytes") #[code DependencyParser.from_bytes]] - - +row - +cell #[code EntityRecognizer.load] - +cell - | #[+api("entityrecognizer#from_disk") #[code EntityRecognizer.from_disk]] - | #[+api("entityrecognizer#from_bytes") #[code EntityRecognizer.from_bytes]] + +cell #[code Language.create_make_doc] + +cell #[+api("language#attributes") #[code Language.tokenizer]] +row +cell @@ -212,6 +187,28 @@ p | #[+api("stringstore#to_disk") #[code StringStore.to_disk]] | #[+api("stringstore#to_bytes") #[code StringStore.to_bytes]] + +row + +cell #[code Tokenizer.load] + +cell - + + +row + +cell #[code Tagger.load] + +cell + | #[+api("tagger#from_disk") #[code Tagger.from_disk]] + | #[+api("tagger#from_bytes") #[code Tagger.from_bytes]] + + +row + +cell #[code DependencyParser.load] + +cell + | #[+api("dependencyparser#from_disk") #[code DependencyParser.from_disk]] + | #[+api("dependencyparser#from_bytes") #[code DependencyParser.from_bytes]] + + +row + +cell #[code EntityRecognizer.load] + +cell + | #[+api("entityrecognizer#from_disk") #[code EntityRecognizer.from_disk]] + | #[+api("entityrecognizer#from_bytes") #[code EntityRecognizer.from_bytes]] + +row +cell #[code Matcher.load] +cell - @@ -232,7 +229,7 @@ p +row +cell #[code Doc.read_bytes] - +cell + +cell #[+api("binder") #[code Binder]] +row +cell #[code Token.is_ancestor_of]