Merge changes to test_misc

Matthew Honnibal 2017-05-29 12:26:02 +02:00
commit f4aafca222
17 changed files with 207 additions and 59 deletions

View File

@ -3,6 +3,7 @@ from __future__ import unicode_literals
from ..util import get_doc, assert_docs_equal
from ...tokens import Doc
from ...vocab import Vocab
import pytest
@ -22,6 +23,15 @@ def test_serialize_empty_doc(en_vocab):
    for token1, token2 in zip(doc, doc2):
        assert token1.text == token2.text


@pytest.mark.xfail
@pytest.mark.parametrize('text', ['rat'])
def test_serialize_vocab(en_vocab, text):
    text_hash = en_vocab.strings.add(text)
    vocab_bytes = en_vocab.to_bytes()
    new_vocab = Vocab().from_bytes(vocab_bytes)
    assert new_vocab.strings(text_hash) == text

#
#@pytest.mark.parametrize('text', [TEXT])
#def test_serialize_tokens(en_vocab, text):

View File

@ -6,6 +6,25 @@ from ...strings import StringStore
import pytest


def test_stringstore_from_api_docs(stringstore):
    apple_hash = stringstore.add('apple')
    assert apple_hash == 8566208034543834098
    assert stringstore[apple_hash] == u'apple'
    assert u'apple' in stringstore
    assert u'cherry' not in stringstore
    orange_hash = stringstore.add('orange')
    all_strings = [s for s in stringstore]
    assert all_strings == [u'apple', u'orange']
    banana_hash = stringstore.add('banana')
    assert len(stringstore) == 3
    assert banana_hash == 2525716904149915114
    assert stringstore[banana_hash] == u'banana'
    assert stringstore[u'banana'] == banana_hash


@pytest.mark.parametrize('text1,text2,text3', [(b'Hello', b'goodbye', b'hello')])
def test_stringstore_save_bytes(stringstore, text1, text2, text3):
    key = stringstore.add(text1)

View File

@ -20,6 +20,41 @@ def matcher(en_vocab):
    return matcher


def test_matcher_from_api_docs(en_vocab):
    matcher = Matcher(en_vocab)
    pattern = [{'ORTH': 'test'}]
    assert len(matcher) == 0
    matcher.add('Rule', None, pattern)
    assert len(matcher) == 1
    matcher.remove('Rule')
    assert 'Rule' not in matcher
    matcher.add('Rule', None, pattern)
    assert 'Rule' in matcher
    on_match, patterns = matcher.get('Rule')
    assert len(patterns[0])


@pytest.mark.xfail
def test_matcher_from_usage_docs(en_vocab):
    text = "Wow 😀 This is really cool! 😂 😂"
    doc = get_doc(en_vocab, words=text.split(' '))
    pos_emoji = [u'😀', u'😃', u'😂', u'🤣', u'😊', u'😍']
    pos_patterns = [[{'ORTH': emoji}] for emoji in pos_emoji]

    def label_sentiment(matcher, doc, i, matches):
        match_id, start, end = matches[i]
        if doc.vocab.strings[match_id] == 'HAPPY':
            doc.sentiment += 0.1
        span = doc[start : end]
        token = span.merge(norm='happy emoji')

    matcher = Matcher(en_vocab)
    matcher.add('HAPPY', label_sentiment, *pos_patterns)
    matches = matcher(doc)
    assert doc.sentiment != 0
    assert doc[1].norm_ == 'happy emoji'


@pytest.mark.parametrize('words', [["Some", "words"]])
def test_matcher_init(en_vocab, words):
    matcher = Matcher(en_vocab)

View File

@ -3,6 +3,10 @@ from __future__ import unicode_literals
from ..util import ensure_path
from ..util import model_to_bytes, model_from_bytes
from .. import util
from ..displacy import parse_deps, parse_ents
from ..tokens import Span
from .util import get_doc
from pathlib import Path
import pytest
@ -12,7 +16,7 @@ from thinc.api import chain
@pytest.mark.parametrize('text', ['hello/world', 'hello world'])
def test_util_ensure_path_succeeds(text):
path = ensure_path(text) path = util.ensure_path(text)
assert isinstance(path, Path)
@ -47,3 +51,43 @@ def test_multi_model_load_missing_dims():
    model_from_bytes(model2, data)
    assert model2._layers[0].b[0, 0] == 1
    assert model2._layers[1].b[0, 0] == 2
@pytest.mark.parametrize('package', ['thinc'])
def test_util_is_package(package):
    """Test that a package installed via pip is recognised by util.is_package."""
    assert util.is_package(package)


@pytest.mark.parametrize('package', ['thinc'])
def test_util_get_package_path(package):
    """Test that a Path object is returned for a package name."""
    path = util.get_package_path(package)
    assert isinstance(path, Path)


def test_displacy_parse_ents(en_vocab):
    """Test that named entities on a Doc are converted into displaCy's format."""
    doc = get_doc(en_vocab, words=["But", "Google", "is", "starting", "from", "behind"])
    doc.ents = [Span(doc, 1, 2, label=doc.vocab.strings[u'ORG'])]
    ents = parse_ents(doc)
    assert isinstance(ents, dict)
    assert ents['text'] == 'But Google is starting from behind '
    assert ents['ents'] == [{'start': 4, 'end': 10, 'label': 'ORG'}]


def test_displacy_parse_deps(en_vocab):
    """Test that deps and tags on a Doc are converted into displaCy's format."""
    words = ["This", "is", "a", "sentence"]
    heads = [1, 0, 1, -2]
    tags = ['DT', 'VBZ', 'DT', 'NN']
    deps = ['nsubj', 'ROOT', 'det', 'attr']
    doc = get_doc(en_vocab, words=words, heads=heads, tags=tags, deps=deps)
    deps = parse_deps(doc)
    assert isinstance(deps, dict)
    assert deps['words'] == [{'text': 'This', 'tag': 'DT'},
                             {'text': 'is', 'tag': 'VBZ'},
                             {'text': 'a', 'tag': 'DT'},
                             {'text': 'sentence', 'tag': 'NN'}]
    assert deps['arcs'] == [{'start': 0, 'end': 1, 'label': 'nsubj', 'dir': 'left'},
                            {'start': 2, 'end': 3, 'label': 'det', 'dir': 'left'},
                            {'start': 1, 'end': 3, 'label': 'attr', 'dir': 'right'}]

View File

@ -183,7 +183,7 @@ def get_package_path(name):
""" """
# Here we're importing the module just to find it. This is worryingly # Here we're importing the module just to find it. This is worryingly
# indirect, but it's otherwise very difficult to find the package. # indirect, but it's otherwise very difficult to find the package.
pkg = importlib.import_module(package_name) pkg = importlib.import_module(name)
return Path(pkg.__file__).parent

View File

@ -19,9 +19,12 @@ main.o-main.o-main--sidebar.o-main--aside
if ALPHA
+infobox("⚠️ You are viewing the spaCy v2.0 alpha docs") +infobox("⚠️ You are viewing the spaCy v2.0.0 alpha docs")
| This page is part of the alpha documentation for spaCy v2.0 strong This page is part of the alpha documentation for spaCy v2.0.
| and does not reflect the state of the latest stable release. | It does not reflect the state of the latest stable release.
| Because v2.0 is still under development, the actual
| implementation may differ from the intended state described
| here.
| #[+a("#") See here] for more information on how to install | #[+a("#") See here] for more information on how to install
| and test the new version. To read the official docs for | and test the new version. To read the official docs for
| v1.x, #[+a("https://spacy.io/docs") go here]. | v1.x, #[+a("https://spacy.io/docs") go here].

View File

@ -16,7 +16,7 @@ Prism.languages.json={property:/".*?"(?=\s*:)/gi,string:/"(?!:)(\\?[^"])*?"(?!:)
[Minified Prism.js syntax-highlighting definitions (LaTeX, Makefile, Markdown, Python, reStructuredText, Sass, SCSS) omitted; the flattened side-by-side rendering left them unreadable. The only change in this hunk removes the optional L? suffix (Python 2 long integers) from the number pattern in Prism.languages.python.]

View File

@ -5,7 +5,6 @@ include ../../_includes/_mixins
p Match sequences of tokens, based on pattern rules.
+infobox("⚠️ Deprecation note")
.o-block
| As of spaCy 2.0, #[code Matcher.add_pattern] and #[code Matcher.add_entity]
| are deprecated and have been replaced with a simpler
| #[+api("matcher#add") #[code Matcher.add]] that lets you add a list of
@ -146,9 +145,9 @@ p Check whether the matcher contains rules for a match ID.
+aside-code("Example").
matcher = Matcher(nlp.vocab)
assert 'Rule' in matcher == False assert 'Rule' not in matcher
matcher.add('Rule', None, [{'ORTH': 'test'}])
assert 'Rule' in matcher == True assert 'Rule' in matcher
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+row +row
@ -226,9 +225,9 @@ p
+aside-code("Example").
matcher.add('Rule', None, [{'ORTH': 'test'}])
assert 'Rule' in matcher == True assert 'Rule' in matcher
matcher.remove('Rule')
assert 'Rule' in matcher == False assert 'Rule' not in matcher
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+row +row
@ -248,8 +247,7 @@ p
+aside-code("Example").
pattern = [{'ORTH': 'test'}]
matcher.add('Rule', None, pattern)
(on_match, patterns) = matcher.get('Rule') on_match, patterns = matcher.get('Rule')
assert patterns = [pattern]
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+row +row

View File

@ -51,7 +51,7 @@ p Retrieve a string from a given hash, or vice versa.
+aside-code("Example").
stringstore = StringStore([u'apple', u'orange'])
apple_hash = stringstore[u'apple']
assert apple_hash == 8566208034543834098L assert apple_hash == 8566208034543834098
assert stringstore[apple_hash] == u'apple'
+table(["Name", "Type", "Description"])
@ -72,8 +72,8 @@ p Check whether a string is in the store.
+aside-code("Example").
stringstore = StringStore([u'apple', u'orange'])
assert u'apple' in stringstore == True assert u'apple' in stringstore
assert u'cherry' in stringstore == False assert not u'cherry' in stringstore
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+row +row
@ -115,7 +115,7 @@ p Add a string to the #[code StringStore].
stringstore = StringStore([u'apple', u'orange'])
banana_hash = stringstore.add(u'banana')
assert len(stringstore) == 3
assert banana_hash == 2525716904149915114L assert banana_hash == 2525716904149915114
assert stringstore[banana_hash] == u'banana'
assert stringstore[u'banana'] == banana_hash
@ -215,3 +215,25 @@ p Load state from a binary string.
+cell returns
+cell #[code StringStore]
+cell The #[code StringStore] object.
+h(2, "util") Utilities
+h(3, "hash_string") strings.hash_string
+tag function
p Get a 64-bit hash for a given string.
+aside-code("Example").
from spacy.strings import hash_string
assert hash_string(u'apple') == 8566208034543834098
+table(["Name", "Type", "Description"])
+row
+cell #[code string]
+cell unicode
+cell The string to hash.
+footrow
+cell returns
+cell uint64
+cell The hash.

View File

@ -34,10 +34,10 @@ p Create the vocabulary.
+row
+cell #[code strings]
+cell #[code StringStore] +cell #[code StringStore] or list
+cell
| A #[code StringStore] that maps strings to hash values, and vice | A #[+api("stringstore") #[code StringStore]] that maps
| versa. | strings to hash values, and vice versa, or a list of strings.
+footrow
+cell returns

View File

@ -42,7 +42,7 @@
},
"spacy-101": {
"title": "spaCy 101", "title": "spaCy 101 Everything you need to know",
"next": "lightning-tour", "next": "lightning-tour",
"quickstart": true "quickstart": true
}, },

View File

@ -63,3 +63,16 @@ p
+code(false, "json"). +code(false, "json").
"pipeline": ["tensorizer", "tagger", "parser", "ner"] "pipeline": ["tensorizer", "tagger", "parser", "ner"]
p
| Although you can mix and match pipeline components, their
| #[strong order and combination] is usually important. Some components may
| require certain modifications on the #[code Doc] to process it. For
| example, the default pipeline first applies the tensorizer, which
| pre-processes the doc and encodes its internal
| #[strong meaning representations] as an array of floats, also called a
| #[strong tensor]. This includes the tokens and their context, which is
| required for the next component, the tagger, to make predictions of the
| part-of-speech tags. Because spaCy's models are neural network models,
| they only "speak" tensors and expect the input #[code Doc] to have
| a #[code tensor].
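To illustrate the ordering constraint described above, here is a minimal sketch. It is an editor's example, not part of the commit, and it assumes a loaded v2.0 alpha English model exposed as nlp, with the tensorizer storing its encoding on doc.tensor.

# Editor's sketch: assumes a v2.0 alpha pipeline in which the tensorizer runs
# before the tagger and writes its encoding to doc.tensor.
import spacy

nlp = spacy.load('en')              # hypothetical model name
doc = nlp(u'This is a sentence')
# the tensorizer has already encoded each token in context as a row of floats,
# which downstream components such as the tagger consume
assert doc.tensor.shape[0] == len(doc)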

View File

@ -29,7 +29,7 @@ p
| into two tokens, "do" and "n't", while "U.K." should always
| remain one token.
+item
| #[strong Can a prefix, suffix or infixes be split off?]. For example | #[strong Can a prefix, suffix or infixes be split off?] For example
| punctuation like commas, periods, hyphens or quotes.
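A quick illustration of those splitting rules (an editor's sketch, not part of the commit; it assumes an English pipeline is loaded as nlp):

# Editor's sketch: assumes an English pipeline loaded as `nlp`.
doc = nlp(u"We don't live in the U.K., unfortunately.")
# "don't" is split into "do" and "n't" by an exception rule, "U.K." stays a
# single token, and the comma and final period are split off as suffixes.
print([token.text for token in doc])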
p

View File

@ -5,7 +5,7 @@ p
| #[+api("vocab") #[code Vocab]], that will be | #[+api("vocab") #[code Vocab]], that will be
| #[strong shared by multiple documents]. To save memory, spaCy also | #[strong shared by multiple documents]. To save memory, spaCy also
| encodes all strings to #[strong hash values] &ndash; in this case, for example,
| "coffee" has the hash #[code 3197928453018144401L]. Entity labels like | "coffee" has the hash #[code 3197928453018144401]. Entity labels like
| "ORG" and part-of-speech tags like "VERB" are also encoded. Internally, | "ORG" and part-of-speech tags like "VERB" are also encoded. Internally,
| spaCy only "speaks" in hash values. | spaCy only "speaks" in hash values.
@ -17,7 +17,7 @@ p
| #[strong Doc]: A processed container of tokens in context.#[br]
| #[strong Vocab]: The collection of lexemes.#[br]
| #[strong StringStore]: The dictionary mapping hash values to strings, for
| example #[code 3197928453018144401L] &rarr; "coffee". | example #[code 3197928453018144401] &rarr; "coffee".
+image
include ../../../assets/img/docs/vocab_stringstore.svg
@ -35,8 +35,13 @@ p
+code.
doc = nlp(u'I like coffee')
assert doc.vocab.strings[u'coffee'] == 3197928453018144401L assert doc.vocab.strings[u'coffee'] == 3197928453018144401
assert doc.vocab.strings[3197928453018144401L] == u'coffee' assert doc.vocab.strings[3197928453018144401] == u'coffee'
+aside("What does 'L' at the end of a hash mean?")
| If you return a hash value in the #[strong Python 2 interpreter], it'll
| show up as #[code 3197928453018144401L]. The #[code L] just means "long
| integer" it's #[strong not] actually a part of the hash value.
p
| Now that all strings are encoded, the entries in the vocabulary
@ -65,9 +70,9 @@ p
+table(["text", "orth", "shape", "prefix", "suffix", "is_alpha", "is_digit"]) +table(["text", "orth", "shape", "prefix", "suffix", "is_alpha", "is_digit"])
- var style = [0, 1, 1, 0, 0, 1, 1] - var style = [0, 1, 1, 0, 0, 1, 1]
+annotation-row(["I", "4690420944186131903L", "X", "I", "I", true, false], style) +annotation-row(["I", "4690420944186131903", "X", "I", "I", true, false], style)
+annotation-row(["love", "3702023516439754181L", "xxxx", "l", "ove", true, false], style) +annotation-row(["love", "3702023516439754181", "xxxx", "l", "ove", true, false], style)
+annotation-row(["coffee", "3197928453018144401L", "xxxx", "c", "ffe", true, false], style) +annotation-row(["coffee", "3197928453018144401", "xxxx", "c", "ffe", true, false], style)
p
| The mapping of words to hashes doesn't depend on any state. To make sure
@ -79,7 +84,7 @@ p
p
| However, hashes #[strong cannot be reversed] and there's no way to
| resolve #[code 3197928453018144401L] back to "coffee". All spaCy can do | resolve #[code 3197928453018144401] back to "coffee". All spaCy can do
| is look it up in the vocabulary. That's why you always need to make
| sure all objects you create have access to the same vocabulary. If they
| don't, spaCy might not be able to find the strings it needs.
@ -89,17 +94,17 @@ p
from spacy.vocab import Vocab
doc = nlp(u'I like coffee') # original Doc
assert doc.vocab.strings[u'coffee'] == 3197928453018144401L # get hash assert doc.vocab.strings[u'coffee'] == 3197928453018144401 # get hash
assert doc.vocab.strings[3197928453018144401L] == u'coffee' # 👍 assert doc.vocab.strings[3197928453018144401] == u'coffee' # 👍
empty_doc = Doc(Vocab()) # new Doc with empty Vocab
# doc.vocab.strings[3197928453018144401L] will raise an error :( # doc.vocab.strings[3197928453018144401] will raise an error :(
empty_doc.vocab.strings.add(u'coffee') # add "coffee" and generate hash
assert doc.vocab.strings[3197928453018144401L] == u'coffee' # 👍 assert doc.vocab.strings[3197928453018144401] == u'coffee' # 👍
new_doc = Doc(doc.vocab) # create new doc with first doc's vocab
assert doc.vocab.strings[3197928453018144401L] == u'coffee' # 👍 assert doc.vocab.strings[3197928453018144401] == u'coffee' # 👍
p
| If the vocabulary doesn't contain a hash for "coffee", spaCy will

View File

@ -53,9 +53,9 @@ p
+code.
doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')
apple = doc[0]
assert [apple.pos_, apple.pos] == [u'PROPN', 17049293600679659579L] assert [apple.pos_, apple.pos] == [u'PROPN', 17049293600679659579]
assert [apple.tag_, apple.tag] == [u'NNP', 15794550382381185553L] assert [apple.tag_, apple.tag] == [u'NNP', 15794550382381185553]
assert [apple.shape_, apple.shape] == [u'Xxxxx', 16072095006890171862L] assert [apple.shape_, apple.shape] == [u'Xxxxx', 16072095006890171862]
assert apple.is_alpha == True
assert apple.is_punct == False
@ -72,16 +72,16 @@ p
+code.
doc = nlp(u'I love coffee')
coffee_hash = nlp.vocab.strings[u'coffee'] # 3197928453018144401L coffee_hash = nlp.vocab.strings[u'coffee'] # 3197928453018144401
coffee_text = nlp.vocab.strings[coffee_hash] # 'coffee'
assert doc[2].orth == coffee_hash == 3197928453018144401L assert doc[2].orth == coffee_hash == 3197928453018144401
assert doc[2].text == coffee_text == u'coffee'
beer_hash = doc.vocab.strings.add(u'beer') # 3073001599257881079L beer_hash = doc.vocab.strings.add(u'beer') # 3073001599257881079
beer_text = doc.vocab.strings[beer_hash] # 'beer'
unicorn_hash = doc.vocab.strings.add(u'🦄 ') # 18234233413267120783L unicorn_hash = doc.vocab.strings.add(u'🦄 ') # 18234233413267120783
unicorn_text = doc.vocab.strings[unicorn_hash] # '🦄 '
+infobox

View File

@ -352,8 +352,7 @@ p
p
| By default, spaCy's tokenizer will split emoji into separate tokens. This
| means that you can create a pattern for one or more emoji tokens. In this | means that you can create a pattern for one or more emoji tokens.
| case, a sequence of identical emoji should be treated as one instance.
| Valid hashtags usually consist of a #[code #], plus a sequence of
| ASCII characters with no whitespace, making them easy to match as well.
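Since the hunk below only shows the emoji patterns, here is a hedged sketch of the hashtag case. It is an editor's example, not part of the commit; it assumes the matcher accepts the IS_ASCII token attribute and that the tokenizer splits the leading # off as its own token.

# Editor's sketch: match a '#' token followed by an ASCII token, assuming
# IS_ASCII is available as a matcher attribute and `nlp` is a loaded pipeline.
from spacy.matcher import Matcher

matcher = Matcher(nlp.vocab)
matcher.add('HASHTAG', None, [{'ORTH': '#'}, {'IS_ASCII': True}])
doc = nlp(u'Happy holidays! #happy #holidays')
hashtags = [doc[start:end] for match_id, start, end in matcher(doc)]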
@ -368,8 +367,8 @@ p
neg_emoji = [u'😞', u'😠', u'😩', u'😢', u'😭', u'😒'] # negative emoji
# add patterns to match one or more emoji tokens
pos_patterns = [[{'ORTH': emoji, 'OP': '+'}] for emoji in pos_emoji] pos_patterns = [[{'ORTH': emoji}] for emoji in pos_emoji]
neg_patterns = [[{'ORTH': emoji, 'OP': '+'}] for emoji in neg_emoji] neg_patterns = [[{'ORTH': emoji}] for emoji in neg_emoji]
matcher.add('HAPPY', label_sentiment, *pos_patterns) # add positive pattern
matcher.add('SAD', label_sentiment, *neg_patterns) # add negative pattern
@ -397,9 +396,9 @@ p
def label_sentiment(matcher, doc, i, matches):
match_id, start, end = matches[i]
if match_id is 'HAPPY': if doc.vocab.strings[match_id] == 'HAPPY': # don't forget to get string!
doc.sentiment += 0.1 # add 0.1 for positive sentiment
elif match_id is 'SAD': elif doc.vocab.strings[match_id] == 'SAD':
doc.sentiment -= 0.1 # subtract 0.1 for negative sentiment
span = doc[start : end]
emoji = Emojipedia.search(span[0].text) # get data for emoji

View File

@ -54,8 +54,8 @@ p
+aside-code("Example").
doc = nlp(u'I love coffee')
assert doc.vocab.strings[u'coffee'] == 3197928453018144401L assert doc.vocab.strings[u'coffee'] == 3197928453018144401
assert doc.vocab.strings[3197928453018144401L] == u'coffee' assert doc.vocab.strings[3197928453018144401] == u'coffee'
beer_hash = doc.vocab.strings.add(u'beer')
assert doc.vocab.strings[u'beer'] == beer_hash
@ -343,8 +343,8 @@ p
+code-new.
nlp.vocab.strings.add(u'coffee')
nlp.vocab.strings[u'coffee'] # 3197928453018144401L nlp.vocab.strings[u'coffee'] # 3197928453018144401
other_nlp.vocab.strings[u'coffee'] # 3197928453018144401L other_nlp.vocab.strings[u'coffee'] # 3197928453018144401
+code-old.
nlp.vocab.strings[u'coffee'] # 3672