diff --git a/spacy/tests/serialize/test_serialization.py b/spacy/tests/serialize/test_serialization.py index 52c42b94d..036035095 100644 --- a/spacy/tests/serialize/test_serialization.py +++ b/spacy/tests/serialize/test_serialization.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals from ..util import get_doc, assert_docs_equal from ...tokens import Doc +from ...vocab import Vocab import pytest @@ -22,6 +23,15 @@ def test_serialize_empty_doc(en_vocab): for token1, token2 in zip(doc, doc2): assert token1.text == token2.text + +@pytest.mark.xfail +@pytest.mark.parametrize('text', ['rat']) +def test_serialize_vocab(en_vocab, text): + text_hash = en_vocab.strings.add(text) + vocab_bytes = en_vocab.to_bytes() + new_vocab = Vocab().from_bytes(vocab_bytes) + assert new_vocab.strings(text_hash) == text + # #@pytest.mark.parametrize('text', [TEXT]) #def test_serialize_tokens(en_vocab, text): diff --git a/spacy/tests/stringstore/test_stringstore.py b/spacy/tests/stringstore/test_stringstore.py index 228f69b53..65b994606 100644 --- a/spacy/tests/stringstore/test_stringstore.py +++ b/spacy/tests/stringstore/test_stringstore.py @@ -6,6 +6,25 @@ from ...strings import StringStore import pytest +def test_stringstore_from_api_docs(stringstore): + apple_hash = stringstore.add('apple') + assert apple_hash == 8566208034543834098 + assert stringstore[apple_hash] == u'apple' + + assert u'apple' in stringstore + assert u'cherry' not in stringstore + + orange_hash = stringstore.add('orange') + all_strings = [s for s in stringstore] + assert all_strings == [u'apple', u'orange'] + + banana_hash = stringstore.add('banana') + assert len(stringstore) == 3 + assert banana_hash == 2525716904149915114 + assert stringstore[banana_hash] == u'banana' + assert stringstore[u'banana'] == banana_hash + + @pytest.mark.parametrize('text1,text2,text3', [(b'Hello', b'goodbye', b'hello')]) def test_stringstore_save_bytes(stringstore, text1, text2, text3): key = stringstore.add(text1) diff --git 
a/spacy/tests/test_matcher.py b/spacy/tests/test_matcher.py index 9bbc9b24d..388aab03e 100644 --- a/spacy/tests/test_matcher.py +++ b/spacy/tests/test_matcher.py @@ -20,6 +20,41 @@ def matcher(en_vocab): return matcher +def test_matcher_from_api_docs(en_vocab): + matcher = Matcher(en_vocab) + pattern = [{'ORTH': 'test'}] + assert len(matcher) == 0 + matcher.add('Rule', None, pattern) + assert len(matcher) == 1 + matcher.remove('Rule') + assert 'Rule' not in matcher + matcher.add('Rule', None, pattern) + assert 'Rule' in matcher + on_match, patterns = matcher.get('Rule') + assert len(patterns[0]) + + +@pytest.mark.xfail +def test_matcher_from_usage_docs(en_vocab): + text = "Wow 😀 This is really cool! 😂 😂" + doc = get_doc(en_vocab, words=text.split(' ')) + pos_emoji = [u'😀', u'😃', u'😂', u'🤣', u'😊', u'😍'] + pos_patterns = [[{'ORTH': emoji}] for emoji in pos_emoji] + + def label_sentiment(matcher, doc, i, matches): + match_id, start, end = matches[i] + if doc.vocab.strings[match_id] == 'HAPPY': + doc.sentiment += 0.1 + span = doc[start : end] + token = span.merge(norm='happy emoji') + + matcher = Matcher(en_vocab) + matcher.add('HAPPY', label_sentiment, *pos_patterns) + matches = matcher(doc) + assert doc.sentiment != 0 + assert doc[1].norm_ == 'happy emoji' + + @pytest.mark.parametrize('words', [["Some", "words"]]) def test_matcher_init(en_vocab, words): matcher = Matcher(en_vocab) diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py index 2c0ff0520..0872a01b6 100644 --- a/spacy/tests/test_misc.py +++ b/spacy/tests/test_misc.py @@ -3,6 +3,10 @@ from __future__ import unicode_literals from ..util import ensure_path from ..util import model_to_bytes, model_from_bytes +from .. 
import util +from ..displacy import parse_deps, parse_ents +from ..tokens import Span +from .util import get_doc from pathlib import Path import pytest @@ -12,7 +16,7 @@ from thinc.api import chain @pytest.mark.parametrize('text', ['hello/world', 'hello world']) def test_util_ensure_path_succeeds(text): - path = ensure_path(text) + path = util.ensure_path(text) assert isinstance(path, Path) @@ -47,3 +51,43 @@ def test_multi_model_load_missing_dims(): model_from_bytes(model2, data) assert model2._layers[0].b[0, 0] == 1 assert model2._layers[1].b[0, 0] == 2 + +@pytest.mark.parametrize('package', ['thinc']) +def test_util_is_package(package): + """Test that an installed package via pip is recognised by util.is_package.""" + assert util.is_package(package) + + +@pytest.mark.parametrize('package', ['thinc']) +def test_util_get_package_path(package): + """Test that a Path object is returned for a package name.""" + path = util.get_package_path(package) + assert isinstance(path, Path) + + +def test_displacy_parse_ents(en_vocab): + """Test that named entities on a Doc are converted into displaCy's format.""" + doc = get_doc(en_vocab, words=["But", "Google", "is", "starting", "from", "behind"]) + doc.ents = [Span(doc, 1, 2, label=doc.vocab.strings[u'ORG'])] + ents = parse_ents(doc) + assert isinstance(ents, dict) + assert ents['text'] == 'But Google is starting from behind ' + assert ents['ents'] == [{'start': 4, 'end': 10, 'label': 'ORG'}] + + +def test_displacy_parse_deps(en_vocab): + """Test that deps and tags on a Doc are converted into displaCy's format.""" + words = ["This", "is", "a", "sentence"] + heads = [1, 0, 1, -2] + tags = ['DT', 'VBZ', 'DT', 'NN'] + deps = ['nsubj', 'ROOT', 'det', 'attr'] + doc = get_doc(en_vocab, words=words, heads=heads, tags=tags, deps=deps) + deps = parse_deps(doc) + assert isinstance(deps, dict) + assert deps['words'] == [{'text': 'This', 'tag': 'DT'}, + {'text': 'is', 'tag': 'VBZ'}, + {'text': 'a', 'tag': 'DT'}, + {'text': 'sentence', 
'tag': 'NN'}] + assert deps['arcs'] == [{'start': 0, 'end': 1, 'label': 'nsubj', 'dir': 'left'}, + {'start': 2, 'end': 3, 'label': 'det', 'dir': 'left'}, + {'start': 1, 'end': 3, 'label': 'attr', 'dir': 'right'}] diff --git a/spacy/util.py b/spacy/util.py index d93e6f1c5..ba7873640 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -183,7 +183,7 @@ def get_package_path(name): """ # Here we're importing the module just to find it. This is worryingly # indirect, but it's otherwise very difficult to find the package. - pkg = importlib.import_module(package_name) + pkg = importlib.import_module(name) return Path(pkg.__file__).parent diff --git a/website/_includes/_page-docs.jade b/website/_includes/_page-docs.jade index 26b82381f..d11e22502 100644 --- a/website/_includes/_page-docs.jade +++ b/website/_includes/_page-docs.jade @@ -19,9 +19,12 @@ main.o-main.o-main--sidebar.o-main--aside if ALPHA - +infobox("⚠️ You are viewing the spaCy v2.0 alpha docs") - | This page is part of the alpha documentation for spaCy v2.0 - | and does not reflect the state of the latest stable release. + +infobox("⚠️ You are viewing the spaCy v2.0.0 alpha docs") + strong This page is part of the alpha documentation for spaCy v2.0. + | It does not reflect the state of the latest stable release. + | Because v2.0 is still under development, the actual + | implementation may differ from the intended state described + | here. | #[+a("#") See here] for more information on how to install | and test the new version. To read the official docs for | v1.x, #[+a("https://spacy.io/docs") go here]. 
diff --git a/website/assets/js/prism.js b/website/assets/js/prism.js index 1bb2c4b85..85a241b51 100644 --- a/website/assets/js/prism.js +++ b/website/assets/js/prism.js @@ -16,7 +16,7 @@ Prism.languages.json={property:/".*?"(?=\s*:)/gi,string:/"(?!:)(\\?[^"])*?"(?!:) !function(a){var e=/\\([^a-z()[\]]|[a-z\*]+)/i,n={"equation-command":{pattern:e,alias:"regex"}};a.languages.latex={comment:/%.*/m,cdata:{pattern:/(\\begin\{((?:verbatim|lstlisting)\*?)\})([\w\W]*?)(?=\\end\{\2\})/,lookbehind:!0},equation:[{pattern:/\$(?:\\?[\w\W])*?\$|\\\((?:\\?[\w\W])*?\\\)|\\\[(?:\\?[\w\W])*?\\\]/,inside:n,alias:"string"},{pattern:/(\\begin\{((?:equation|math|eqnarray|align|multline|gather)\*?)\})([\w\W]*?)(?=\\end\{\2\})/,lookbehind:!0,inside:n,alias:"string"}],keyword:{pattern:/(\\(?:begin|end|ref|cite|label|usepackage|documentclass)(?:\[[^\]]+\])?\{)[^}]+(?=\})/,lookbehind:!0},url:{pattern:/(\\url\{)[^}]+(?=\})/,lookbehind:!0},headline:{pattern:/(\\(?:part|chapter|section|subsection|frametitle|subsubsection|paragraph|subparagraph|subsubparagraph|subsubsubparagraph)\*?(?:\[[^\]]+\])?\{)[^}]+(?=\}(?:\[[^\]]+\])?)/,lookbehind:!0,alias:"class-name"},"function":{pattern:e,alias:"selector"},punctuation:/[[\]{}&]/}}(Prism); Prism.languages.makefile={comment:{pattern:/(^|[^\\])#(?:\\(?:\r\n|[\s\S])|.)*/,lookbehind:!0},string:/(["'])(?:\\(?:\r\n|[\s\S])|(?!\1)[^\\\r\n])*\1/,builtin:/\.[A-Z][^:#=\s]+(?=\s*:(?!=))/,symbol:{pattern:/^[^:=\r\n]+(?=\s*:(?!=))/m,inside:{variable:/\$+(?:[^(){}:#=\s]+|(?=[({]))/}},variable:/\$+(?:[^(){}:#=\s]+|\([@*%<^+?][DF]\)|(?=[({]))/,keyword:[/-include\b|\b(?:define|else|endef|endif|export|ifn?def|ifn?eq|include|override|private|sinclude|undefine|unexport|vpath)\b/,{pattern:/(\()(?:addsuffix|abspath|and|basename|call|dir|error|eval|file|filter(?:-out)?|findstring|firstword|flavor|foreach|guile|if|info|join|lastword|load|notdir|or|origin|patsubst|realpath|shell|sort|strip|subst|suffix|value|warning|wildcard|word(?:s|list)?)(?=[ 
\t])/,lookbehind:!0}],operator:/(?:::|[?:+!])?=|[|@]/,punctuation:/[:;(){}]/}; Prism.languages.markdown=Prism.languages.extend("markup",{}),Prism.languages.insertBefore("markdown","prolog",{blockquote:{pattern:/^>(?:[\t ]*>)*/m,alias:"punctuation"},code:[{pattern:/^(?: {4}|\t).+/m,alias:"keyword"},{pattern:/``.+?``|`[^`\n]+`/,alias:"keyword"}],title:[{pattern:/\w+.*(?:\r?\n|\r)(?:==+|--+)/,alias:"important",inside:{punctuation:/==+$|--+$/}},{pattern:/(^\s*)#+.+/m,lookbehind:!0,alias:"important",inside:{punctuation:/^#+|#+$/}}],hr:{pattern:/(^\s*)([*-])([\t ]*\2){2,}(?=\s*$)/m,lookbehind:!0,alias:"punctuation"},list:{pattern:/(^\s*)(?:[*+-]|\d+\.)(?=[\t ].)/m,lookbehind:!0,alias:"punctuation"},"url-reference":{pattern:/!?\[[^\]]+\]:[\t ]+(?:\S+|<(?:\\.|[^>\\])+>)(?:[\t ]+(?:"(?:\\.|[^"\\])*"|'(?:\\.|[^'\\])*'|\((?:\\.|[^)\\])*\)))?/,inside:{variable:{pattern:/^(!?\[)[^\]]+/,lookbehind:!0},string:/(?:"(?:\\.|[^"\\])*"|'(?:\\.|[^'\\])*'|\((?:\\.|[^)\\])*\))$/,punctuation:/^[\[\]!:]|[<>]/},alias:"url"},bold:{pattern:/(^|[^\\])(\*\*|__)(?:(?:\r?\n|\r)(?!\r?\n|\r)|.)+?\2/,lookbehind:!0,inside:{punctuation:/^\*\*|^__|\*\*$|__$/}},italic:{pattern:/(^|[^\\])([*_])(?:(?:\r?\n|\r)(?!\r?\n|\r)|.)+?\2/,lookbehind:!0,inside:{punctuation:/^[*_]|[*_]$/}},url:{pattern:/!?\[[^\]]+\](?:\([^\s)]+(?:[\t ]+"(?:\\.|[^"\\])*")?\)| ?\[[^\]\n]*\])/,inside:{variable:{pattern:/(!?\[)[^\]]+(?=\]$)/,lookbehind:!0},string:{pattern:/"(?:\\.|[^"\\])*"(?=\)$)/}}}}),Prism.languages.markdown.bold.inside.url=Prism.util.clone(Prism.languages.markdown.url),Prism.languages.markdown.italic.inside.url=Prism.util.clone(Prism.languages.markdown.url),Prism.languages.markdown.bold.inside.italic=Prism.util.clone(Prism.languages.markdown.italic),Prism.languages.markdown.italic.inside.bold=Prism.util.clone(Prism.languages.markdown.bold); 
-Prism.languages.python={"triple-quoted-string":{pattern:/"""[\s\S]+?"""|'''[\s\S]+?'''/,alias:"string"},comment:{pattern:/(^|[^\\])#.*/,lookbehind:!0},string:/("|')(?:\\?.)*?\1/,"function":{pattern:/((?:^|\s)def[ \t]+)[a-zA-Z_][a-zA-Z0-9_]*(?=\()/g,lookbehind:!0},"class-name":{pattern:/(\bclass\s+)[a-z0-9_]+/i,lookbehind:!0},keyword:/\b(?:as|assert|async|await|break|class|continue|def|del|elif|else|except|exec|finally|for|from|global|if|import|in|is|lambda|pass|print|raise|return|try|while|with|yield)\b/,"boolean":/\b(?:True|False)\b/,number:/\b-?(?:0[bo])?(?:(?:\d|0x[\da-f])[\da-f]*\.?\d*|\.\d+)(?:e[+-]?\d+)?j?L?\b/i,operator:/[-+%=]=?|!=|\*\*?=?|\/\/?=?|<[<=>]?|>[=>]?|[&|^~]|\b(?:or|and|not)\b/,punctuation:/[{}[\];(),.:]/}; +Prism.languages.python={"triple-quoted-string":{pattern:/"""[\s\S]+?"""|'''[\s\S]+?'''/,alias:"string"},comment:{pattern:/(^|[^\\])#.*/,lookbehind:!0},string:/("|')(?:\\?.)*?\1/,"function":{pattern:/((?:^|\s)def[ \t]+)[a-zA-Z_][a-zA-Z0-9_]*(?=\()/g,lookbehind:!0},"class-name":{pattern:/(\bclass\s+)[a-z0-9_]+/i,lookbehind:!0},keyword:/\b(?:as|assert|async|await|break|class|continue|def|del|elif|else|except|exec|finally|for|from|global|if|import|in|is|lambda|pass|print|raise|return|try|while|with|yield)\b/,"boolean":/\b(?:True|False)\b/,number:/\b-?(?:0[bo])?(?:(?:\d|0x[\da-f])[\da-f]*\.?\d*|\.\d+)(?:e[+-]?\d+)?j?\b/i,operator:/[-+%=]=?|!=|\*\*?=?|\/\/?=?|<[<=>]?|>[=>]?|[&|^~]|\b(?:or|and|not)\b/,punctuation:/[{}[\];(),.:]/}; Prism.languages.rest={table:[{pattern:/(\s*)(?:\+[=-]+)+\+(?:\r?\n|\r)(?:\1(?:[+|].+)+[+|](?:\r?\n|\r))+\1(?:\+[=-]+)+\+/,lookbehind:!0,inside:{punctuation:/\||(?:\+[=-]+)+\+/}},{pattern:/(\s*)(?:=+ +)+=+((?:\r?\n|\r)\1.+)+(?:\r?\n|\r)\1(?:=+ +)+=+(?=(?:\r?\n|\r){2}|\s*$)/,lookbehind:!0,inside:{punctuation:/[=-]+/}}],"substitution-def":{pattern:/(^\s*\.\. 
)\|(?:[^|\s](?:[^|]*[^|\s])?)\| [^:]+::/m,lookbehind:!0,inside:{substitution:{pattern:/^\|(?:[^|\s]|[^|\s][^|]*[^|\s])\|/,alias:"attr-value",inside:{punctuation:/^\||\|$/}},directive:{pattern:/( +)[^:]+::/,lookbehind:!0,alias:"function",inside:{punctuation:/::$/}}}},"link-target":[{pattern:/(^\s*\.\. )\[[^\]]+\]/m,lookbehind:!0,alias:"string",inside:{punctuation:/^\[|\]$/}},{pattern:/(^\s*\.\. )_(?:`[^`]+`|(?:[^:\\]|\\.)+):/m,lookbehind:!0,alias:"string",inside:{punctuation:/^_|:$/}}],directive:{pattern:/(^\s*\.\. )[^:]+::/m,lookbehind:!0,alias:"function",inside:{punctuation:/::$/}},comment:{pattern:/(^\s*\.\.)(?:(?: .+)?(?:(?:\r?\n|\r).+)+| .+)(?=(?:\r?\n|\r){2}|$)/m,lookbehind:!0},title:[{pattern:/^(([!"#$%&'()*+,\-.\/:;<=>?@\[\\\]^_`{|}~])\2+)(?:\r?\n|\r).+(?:\r?\n|\r)\1$/m,inside:{punctuation:/^[!"#$%&'()*+,\-.\/:;<=>?@\[\\\]^_`{|}~]+|[!"#$%&'()*+,\-.\/:;<=>?@\[\\\]^_`{|}~]+$/,important:/.+/}},{pattern:/(^|(?:\r?\n|\r){2}).+(?:\r?\n|\r)([!"#$%&'()*+,\-.\/:;<=>?@\[\\\]^_`{|}~])\2+(?=\r?\n|\r|$)/,lookbehind:!0,inside:{punctuation:/[!"#$%&'()*+,\-.\/:;<=>?@\[\\\]^_`{|}~]+$/,important:/.+/}}],hr:{pattern:/((?:\r?\n|\r){2})([!"#$%&'()*+,\-.\/:;<=>?@\[\\\]^_`{|}~])\2{3,}(?=(?:\r?\n|\r){2})/,lookbehind:!0,alias:"punctuation"},field:{pattern:/(^\s*):[^:\r\n]+:(?= )/m,lookbehind:!0,alias:"attr-name"},"command-line-option":{pattern:/(^\s*)(?:[+-][a-z\d]|(?:\-\-|\/)[a-z\d-]+)(?:[ =](?:[a-z][a-z\d_-]*|<[^<>]+>))?(?:, (?:[+-][a-z\d]|(?:\-\-|\/)[a-z\d-]+)(?:[ =](?:[a-z][a-z\d_-]*|<[^<>]+>))?)*(?=(?:\r?\n|\r)? 
{2,}\S)/im,lookbehind:!0,alias:"symbol"},"literal-block":{pattern:/::(?:\r?\n|\r){2}([ \t]+).+(?:(?:\r?\n|\r)\1.+)*/,inside:{"literal-block-punctuation":{pattern:/^::/,alias:"punctuation"}}},"quoted-literal-block":{pattern:/::(?:\r?\n|\r){2}([!"#$%&'()*+,\-.\/:;<=>?@\[\\\]^_`{|}~]).*(?:(?:\r?\n|\r)\1.*)*/,inside:{"literal-block-punctuation":{pattern:/^(?:::|([!"#$%&'()*+,\-.\/:;<=>?@\[\\\]^_`{|}~])\1*)/m,alias:"punctuation"}}},"list-bullet":{pattern:/(^\s*)(?:[*+\-•‣⁃]|\(?(?:\d+|[a-z]|[ivxdclm]+)\)|(?:\d+|[a-z]|[ivxdclm]+)\.)(?= )/im,lookbehind:!0,alias:"punctuation"},"doctest-block":{pattern:/(^\s*)>>> .+(?:(?:\r?\n|\r).+)*/m,lookbehind:!0,inside:{punctuation:/^>>>/}},inline:[{pattern:/(^|[\s\-:\/'"<(\[{])(?::[^:]+:`.*?`|`.*?`:[^:]+:|(\*\*?|``?|\|)(?!\s).*?[^\s]\2(?=[\s\-.,:;!?\\\/'")\]}]|$))/m,lookbehind:!0,inside:{bold:{pattern:/(^\*\*).+(?=\*\*$)/,lookbehind:!0},italic:{pattern:/(^\*).+(?=\*$)/,lookbehind:!0},"inline-literal":{pattern:/(^``).+(?=``$)/,lookbehind:!0,alias:"symbol"},role:{pattern:/^:[^:]+:|:[^:]+:$/,alias:"function",inside:{punctuation:/^:|:$/}},"interpreted-text":{pattern:/(^`).+(?=`$)/,lookbehind:!0,alias:"attr-value"},substitution:{pattern:/(^\|).+(?=\|$)/,lookbehind:!0,alias:"attr-value"},punctuation:/\*\*?|``?|\|/}}],link:[{pattern:/\[[^\]]+\]_(?=[\s\-.,:;!?\\\/'")\]}]|$)/,alias:"string",inside:{punctuation:/^\[|\]_$/}},{pattern:/(?:\b[a-z\d](?:[_.:+]?[a-z\d]+)*_?_|`[^`]+`_?_|_`[^`]+`)(?=[\s\-.,:;!?\\\/'")\]}]|$)/i,alias:"string",inside:{punctuation:/^_?`|`$|`?_?_$/}}],punctuation:{pattern:/(^\s*)(?:\|(?= |$)|(?:---?|—|\.\.|__)(?= )|\.\.$)/m,lookbehind:!0}}; !function(e){e.languages.sass=e.languages.extend("css",{comment:{pattern:/^([ \t]*)\/[\/*].*(?:(?:\r?\n|\r)\1[ \t]+.+)*/m,lookbehind:!0}}),e.languages.insertBefore("sass","atrule",{"atrule-line":{pattern:/^(?:[ \t]*)[@+=].+/m,inside:{atrule:/(?:@[\w-]+|[+=])/m}}}),delete e.languages.sass.atrule;var 
a=/((\$[-_\w]+)|(#\{\$[-_\w]+\}))/i,t=[/[+*\/%]|[=!]=|<=?|>=?|\b(?:and|or|not)\b/,{pattern:/(\s+)-(?=\s)/,lookbehind:!0}];e.languages.insertBefore("sass","property",{"variable-line":{pattern:/^[ \t]*\$.+/m,inside:{punctuation:/:/,variable:a,operator:t}},"property-line":{pattern:/^[ \t]*(?:[^:\s]+ *:.*|:[^:\s]+.*)/m,inside:{property:[/[^:\s]+(?=\s*:)/,{pattern:/(:)[^:\s]+/,lookbehind:!0}],punctuation:/:/,variable:a,operator:t,important:e.languages.sass.important}}}),delete e.languages.sass.property,delete e.languages.sass.important,delete e.languages.sass.selector,e.languages.insertBefore("sass","punctuation",{selector:{pattern:/([ \t]*)\S(?:,?[^,\r\n]+)*(?:,(?:\r?\n|\r)\1[ \t]+\S(?:,?[^,\r\n]+)*)*/,lookbehind:!0}})}(Prism); Prism.languages.scss=Prism.languages.extend("css",{comment:{pattern:/(^|[^\\])(?:\/\*[\w\W]*?\*\/|\/\/.*)/,lookbehind:!0},atrule:{pattern:/@[\w-]+(?:\([^()]+\)|[^(])*?(?=\s+[{;])/,inside:{rule:/@[\w-]+/}},url:/(?:[-a-z]+-)*url(?=\()/i,selector:{pattern:/(?=\S)[^@;\{\}\(\)]?([^@;\{\}\(\)]|&|#\{\$[-_\w]+\})+(?=\s*\{(\}|\s|[^\}]+(:|\{)[^\}]+))/m,inside:{placeholder:/%[-_\w]+/}}}),Prism.languages.insertBefore("scss","atrule",{keyword:[/@(?:if|else(?: if)?|for|each|while|import|extend|debug|warn|mixin|include|function|return|content)/i,{pattern:/( +)(?:from|through)(?= )/,lookbehind:!0}]}),Prism.languages.insertBefore("scss","property",{variable:/\$[-_\w]+|#\{\$[-_\w]+\}/}),Prism.languages.insertBefore("scss","function",{placeholder:{pattern:/%[-_\w]+/,alias:"selector"},statement:/\B!(?:default|optional)\b/i,"boolean":/\b(?:true|false)\b/,"null":/\bnull\b/,operator:{pattern:/(\s)(?:[-+*\/%]|[=!]=|<=?|>=?|and|or|not)(?=\s)/,lookbehind:!0}}),Prism.languages.scss.atrule.inside.rest=Prism.util.clone(Prism.languages.scss); diff --git a/website/docs/api/matcher.jade b/website/docs/api/matcher.jade index c837fe434..e7c0aaaf2 100644 --- a/website/docs/api/matcher.jade +++ b/website/docs/api/matcher.jade @@ -5,14 +5,13 @@ include ../../_includes/_mixins p 
Match sequences of tokens, based on pattern rules. +infobox("⚠️ Deprecation note") - .o-block - | As of spaCy 2.0, #[code Matcher.add_pattern] and #[code Matcher.add_entity] - | are deprecated and have been replaced with a simpler - | #[+api("matcher#add") #[code Matcher.add]] that lets you add a list of - | patterns and a callback for a given match ID. #[code Matcher.get_entity] - | is now called #[+api("matcher#get") #[code matcher.get]]. - | #[code Matcher.load] (not useful, as it didn't allow specifying callbacks), - | and #[code Matcher.has_entity] (now redundant) have been removed. + | As of spaCy 2.0, #[code Matcher.add_pattern] and #[code Matcher.add_entity] + | are deprecated and have been replaced with a simpler + | #[+api("matcher#add") #[code Matcher.add]] that lets you add a list of + | patterns and a callback for a given match ID. #[code Matcher.get_entity] + | is now called #[+api("matcher#get") #[code matcher.get]]. + | #[code Matcher.load] (not useful, as it didn't allow specifying callbacks), + | and #[code Matcher.has_entity] (now redundant) have been removed. +h(2, "init") Matcher.__init__ +tag method @@ -146,9 +145,9 @@ p Check whether the matcher contains rules for a match ID. +aside-code("Example"). matcher = Matcher(nlp.vocab) - assert 'Rule' in matcher == False + assert 'Rule' not in matcher matcher.add('Rule', None, [{'ORTH': 'test'}]) - assert 'Rule' in matcher == True + assert 'Rule' in matcher +table(["Name", "Type", "Description"]) +row @@ -226,9 +225,9 @@ p +aside-code("Example"). matcher.add('Rule', None, [{'ORTH': 'test'}]) - assert 'Rule' in matcher == True + assert 'Rule' in matcher matcher.remove('Rule') - assert 'Rule' in matcher == False + assert 'Rule' not in matcher +table(["Name", "Type", "Description"]) +row @@ -248,8 +247,7 @@ p +aside-code("Example"). 
pattern = [{'ORTH': 'test'}] matcher.add('Rule', None, pattern) - (on_match, patterns) = matcher.get('Rule') - assert patterns = [pattern] + on_match, patterns = matcher.get('Rule') +table(["Name", "Type", "Description"]) +row diff --git a/website/docs/api/stringstore.jade b/website/docs/api/stringstore.jade index 969c8a6a5..c17fb1db9 100644 --- a/website/docs/api/stringstore.jade +++ b/website/docs/api/stringstore.jade @@ -51,7 +51,7 @@ p Retrieve a string from a given hash, or vice versa. +aside-code("Example"). stringstore = StringStore([u'apple', u'orange']) apple_hash = stringstore[u'apple'] - assert apple_hash == 8566208034543834098L + assert apple_hash == 8566208034543834098 assert stringstore[apple_hash] == u'apple' +table(["Name", "Type", "Description"]) @@ -72,8 +72,8 @@ p Check whether a string is in the store. +aside-code("Example"). stringstore = StringStore([u'apple', u'orange']) - assert u'apple' in stringstore == True - assert u'cherry' in stringstore == False + assert u'apple' in stringstore + assert u'cherry' not in stringstore +table(["Name", "Type", "Description"]) +row @@ -115,7 +115,7 @@ p Add a string to the #[code StringStore]. stringstore = StringStore([u'apple', u'orange']) banana_hash = stringstore.add(u'banana') assert len(stringstore) == 3 - assert banana_hash == 2525716904149915114L + assert banana_hash == 2525716904149915114 assert stringstore[banana_hash] == u'banana' assert stringstore[u'banana'] == banana_hash @@ -215,3 +215,25 @@ p Load state from a binary string. +cell returns +cell #[code StringStore] +cell The #[code StringStore] object. + ++h(2, "util") Utilities + ++h(3, "hash_string") strings.hash_string + +tag function + +p Get a 64-bit hash for a given string. + ++aside-code("Example"). + from spacy.strings import hash_string + assert hash_string(u'apple') == 8566208034543834098 + ++table(["Name", "Type", "Description"]) + +row + +cell #[code string] + +cell unicode + +cell The string to hash. 
+ + +footrow + +cell returns + +cell uint64 + +cell The hash. diff --git a/website/docs/api/vocab.jade b/website/docs/api/vocab.jade index ce62612d3..4d3e0828a 100644 --- a/website/docs/api/vocab.jade +++ b/website/docs/api/vocab.jade @@ -34,10 +34,10 @@ p Create the vocabulary. +row +cell #[code strings] - +cell #[code StringStore] + +cell #[code StringStore] or list +cell - | A #[code StringStore] that maps strings to hash values, and vice - | versa. + | A #[+api("stringstore") #[code StringStore]] that maps + | strings to hash values, and vice versa, or a list of strings. +footrow +cell returns diff --git a/website/docs/usage/_data.json b/website/docs/usage/_data.json index 79d0b28f1..3d344eb2a 100644 --- a/website/docs/usage/_data.json +++ b/website/docs/usage/_data.json @@ -42,7 +42,7 @@ }, "spacy-101": { - "title": "spaCy 101", + "title": "spaCy 101 – Everything you need to know", "next": "lightning-tour", "quickstart": true }, diff --git a/website/docs/usage/_spacy-101/_pipelines.jade b/website/docs/usage/_spacy-101/_pipelines.jade index 654ca86e4..c21c9f97c 100644 --- a/website/docs/usage/_spacy-101/_pipelines.jade +++ b/website/docs/usage/_spacy-101/_pipelines.jade @@ -63,3 +63,16 @@ p +code(false, "json"). "pipeline": ["tensorizer", "tagger", "parser", "ner"] + +p + | Although you can mix and match pipeline components, their + | #[strong order and combination] is usually important. Some components may + | require certain modifications on the #[code Doc] to process it. For + | example, the default pipeline first applies the tensorizer, which + | pre-processes the doc and encodes its internal + | #[strong meaning representations] as an array of floats, also called a + | #[strong tensor]. This includes the tokens and their context, which is + | required for the next component, the tagger, to make predictions of the + | part-of-speech tags. 
Because spaCy's models are neural network models, + | they only "speak" tensors and expect the input #[code Doc] to have + | a #[code tensor]. diff --git a/website/docs/usage/_spacy-101/_tokenization.jade b/website/docs/usage/_spacy-101/_tokenization.jade index 95a9cc520..c48a43e72 100644 --- a/website/docs/usage/_spacy-101/_tokenization.jade +++ b/website/docs/usage/_spacy-101/_tokenization.jade @@ -29,7 +29,7 @@ p | into two tokens, "do" and "n't", while "U.K." should always | remain one token. +item - | #[strong Can a prefix, suffix or infixes be split off?]. For example + | #[strong Can a prefix, suffix or infixes be split off?] For example | punctuation like commas, periods, hyphens or quotes. p diff --git a/website/docs/usage/_spacy-101/_vocab.jade b/website/docs/usage/_spacy-101/_vocab.jade index e59518a25..8e74cd2c9 100644 --- a/website/docs/usage/_spacy-101/_vocab.jade +++ b/website/docs/usage/_spacy-101/_vocab.jade @@ -5,7 +5,7 @@ p | #[+api("vocab") #[code Vocab]], that will be | #[strong shared by multiple documents]. To save memory, spaCy also | encodes all strings to #[strong hash values] – in this case for example, - | "coffee" has the hash #[code 3197928453018144401L]. Entity labels like + | "coffee" has the hash #[code 3197928453018144401]. Entity labels like | "ORG" and part-of-speech tags like "VERB" are also encoded. Internally, | spaCy only "speaks" in hash values. @@ -17,7 +17,7 @@ p | #[strong Doc]: A processed container of tokens in context.#[br] | #[strong Vocab]: The collection of lexemes.#[br] | #[strong StringStore]: The dictionary mapping hash values to strings, for - | example #[code 3197928453018144401L] → "coffee". + | example #[code 3197928453018144401] → "coffee". +image include ../../../assets/img/docs/vocab_stringstore.svg @@ -35,8 +35,13 @@ p +code. 
doc = nlp(u'I like coffee') - assert doc.vocab.strings[u'coffee'] == 3197928453018144401L - assert doc.vocab.strings[3197928453018144401L] == u'coffee' + assert doc.vocab.strings[u'coffee'] == 3197928453018144401 + assert doc.vocab.strings[3197928453018144401] == u'coffee' + ++aside("What does 'L' at the end of a hash mean?") + | If you return a hash value in the #[strong Python 2 interpreter], it'll + | show up as #[code 3197928453018144401L]. The #[code L] just means "long + | integer" – it's #[strong not] actually a part of the hash value. p | Now that all strings are encoded, the entries in the vocabulary @@ -65,9 +70,9 @@ p +table(["text", "orth", "shape", "prefix", "suffix", "is_alpha", "is_digit"]) - var style = [0, 1, 1, 0, 0, 1, 1] - +annotation-row(["I", "4690420944186131903L", "X", "I", "I", true, false], style) - +annotation-row(["love", "3702023516439754181L", "xxxx", "l", "ove", true, false], style) - +annotation-row(["coffee", "3197928453018144401L", "xxxx", "c", "ffe", true, false], style) + +annotation-row(["I", "4690420944186131903", "X", "I", "I", true, false], style) + +annotation-row(["love", "3702023516439754181", "xxxx", "l", "ove", true, false], style) + +annotation-row(["coffee", "3197928453018144401", "xxxx", "c", "ffe", true, false], style) p | The mapping of words to hashes doesn't depend on any state. To make sure @@ -79,7 +84,7 @@ p p | However, hashes #[strong cannot be reversed] and there's no way to - | resolve #[code 3197928453018144401L] back to "coffee". All spaCy can do + | resolve #[code 3197928453018144401] back to "coffee". All spaCy can do | is look it up in the vocabulary. That's why you always need to make | sure all objects you create have access to the same vocabulary. If they | don't, spaCy might not be able to find the strings it needs. 
@@ -89,17 +94,17 @@ p from spacy.vocab import Vocab doc = nlp(u'I like coffee') # original Doc - assert doc.vocab.strings[u'coffee'] == 3197928453018144401L # get hash - assert doc.vocab.strings[3197928453018144401L] == u'coffee' # 👍 + assert doc.vocab.strings[u'coffee'] == 3197928453018144401 # get hash + assert doc.vocab.strings[3197928453018144401] == u'coffee' # 👍 empty_doc = Doc(Vocab()) # new Doc with empty Vocab - # doc.vocab.strings[3197928453018144401L] will raise an error :( + # empty_doc.vocab.strings[3197928453018144401] will raise an error :( empty_doc.vocab.strings.add(u'coffee') # add "coffee" and generate hash - assert doc.vocab.strings[3197928453018144401L] == u'coffee' # 👍 + assert empty_doc.vocab.strings[3197928453018144401] == u'coffee' # 👍 new_doc = Doc(doc.vocab) # create new doc with first doc's vocab - assert doc.vocab.strings[3197928453018144401L] == u'coffee' # 👍 + assert new_doc.vocab.strings[3197928453018144401] == u'coffee' # 👍 p | If the vocabulary doesn't contain a hash for "coffee", spaCy will diff --git a/website/docs/usage/lightning-tour.jade b/website/docs/usage/lightning-tour.jade index f144b4f05..89dac830c 100644 --- a/website/docs/usage/lightning-tour.jade +++ b/website/docs/usage/lightning-tour.jade @@ -53,9 +53,9 @@ p +code. doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion') apple = doc[0] - assert [apple.pos_, apple.pos] == [u'PROPN', 17049293600679659579L] - assert [apple.tag_, apple.tag] == [u'NNP', 15794550382381185553L] - assert [apple.shape_, apple.shape] == [u'Xxxxx', 16072095006890171862L] + assert [apple.pos_, apple.pos] == [u'PROPN', 17049293600679659579] + assert [apple.tag_, apple.tag] == [u'NNP', 15794550382381185553] + assert [apple.shape_, apple.shape] == [u'Xxxxx', 16072095006890171862] assert apple.is_alpha == True assert apple.is_punct == False @@ -72,16 +72,16 @@ p +code. 
doc = nlp(u'I love coffee') - coffee_hash = nlp.vocab.strings[u'coffee'] # 3197928453018144401L + coffee_hash = nlp.vocab.strings[u'coffee'] # 3197928453018144401 coffee_text = nlp.vocab.strings[coffee_hash] # 'coffee' - assert doc[2].orth == coffee_hash == 3197928453018144401L + assert doc[2].orth == coffee_hash == 3197928453018144401 assert doc[2].text == coffee_text == u'coffee' - beer_hash = doc.vocab.strings.add(u'beer') # 3073001599257881079L + beer_hash = doc.vocab.strings.add(u'beer') # 3073001599257881079 beer_text = doc.vocab.strings[beer_hash] # 'beer' - unicorn_hash = doc.vocab.strings.add(u'🦄 ') # 18234233413267120783L + unicorn_hash = doc.vocab.strings.add(u'🦄 ') # 18234233413267120783 unicorn_text = doc.vocab.strings[unicorn_hash] # '🦄 ' +infobox diff --git a/website/docs/usage/rule-based-matching.jade b/website/docs/usage/rule-based-matching.jade index 9813abd2e..8588729b6 100644 --- a/website/docs/usage/rule-based-matching.jade +++ b/website/docs/usage/rule-based-matching.jade @@ -352,8 +352,7 @@ p p | By default, spaCy's tokenizer will split emoji into separate tokens. This - | means that you can create a pattern for one or more emoji tokens. In this - | case, a sequence of identical emoji should be treated as one instance. + | means that you can create a pattern for one or more emoji tokens. | Valid hashtags usually consist of a #[code #], plus a sequence of | ASCII characters with no whitespace, making them easy to match as well. 
@@ -368,8 +367,8 @@ p neg_emoji = [u'😞', u'😠', u'😩', u'😢', u'😭', u'😒'] # negative emoji # add patterns to match one or more emoji tokens - pos_patterns = [[{'ORTH': emoji, 'OP': '+'}] for emoji in pos_emoji] - neg_patterns = [[{'ORTH': emoji, 'OP': '+'}] for emoji in neg_emoji] + pos_patterns = [[{'ORTH': emoji}] for emoji in pos_emoji] + neg_patterns = [[{'ORTH': emoji}] for emoji in neg_emoji] matcher.add('HAPPY', label_sentiment, *pos_patterns) # add positive pattern matcher.add('SAD', label_sentiment, *neg_patterns) # add negative pattern @@ -397,9 +396,9 @@ p def label_sentiment(matcher, doc, i, matches): match_id, start, end = matches[i] - if match_id is 'HAPPY': + if doc.vocab.strings[match_id] == 'HAPPY': # don't forget to get string! doc.sentiment += 0.1 # add 0.1 for positive sentiment - elif match_id is 'SAD': + elif doc.vocab.strings[match_id] == 'SAD': doc.sentiment -= 0.1 # subtract 0.1 for negative sentiment span = doc[start : end] emoji = Emojipedia.search(span[0].text) # get data for emoji diff --git a/website/docs/usage/v2.jade b/website/docs/usage/v2.jade index 944ed56f5..2123a04af 100644 --- a/website/docs/usage/v2.jade +++ b/website/docs/usage/v2.jade @@ -54,8 +54,8 @@ p +aside-code("Example"). doc = nlp(u'I love coffee') - assert doc.vocab.strings[u'coffee'] == 3197928453018144401L - assert doc.vocab.strings[3197928453018144401L] == u'coffee' + assert doc.vocab.strings[u'coffee'] == 3197928453018144401 + assert doc.vocab.strings[3197928453018144401] == u'coffee' beer_hash = doc.vocab.strings.add(u'beer') assert doc.vocab.strings[u'beer'] == beer_hash @@ -343,8 +343,8 @@ p +code-new. nlp.vocab.strings.add(u'coffee') - nlp.vocab.strings[u'coffee'] # 3197928453018144401L - other_nlp.vocab.strings[u'coffee'] # 3197928453018144401L + nlp.vocab.strings[u'coffee'] # 3197928453018144401 + other_nlp.vocab.strings[u'coffee'] # 3197928453018144401 +code-old. nlp.vocab.strings[u'coffee'] # 3672