Repository: https://github.com/explosion/spaCy.git

commit 00b2094dc3
parent 804dbb8d25

    Fix typos, long integers and tests
@@ -20,6 +20,40 @@ def matcher(en_vocab):
     return matcher


+def test_matcher_from_api_docs(en_vocab):
+    matcher = Matcher(en_vocab)
+    pattern = [{'ORTH': 'test'}]
+    assert len(matcher) == 0
+    matcher.add('Rule', None, pattern)
+    assert len(matcher) == 1
+    matcher.remove('Rule')
+    assert 'Rule' not in matcher
+    matcher.add('Rule', None, pattern)
+    assert 'Rule' in matcher
+    on_match, patterns = matcher.get('Rule')
+    assert len(patterns[0])
+
+
+def test_matcher_from_usage_docs(en_vocab):
+    text = "Wow 😀 This is really cool! 😂 😂"
+    doc = get_doc(en_vocab, words=text.split(' '))
+    pos_emoji = [u'😀', u'😃', u'😂', u'🤣', u'😊', u'😍']
+    pos_patterns = [[{'ORTH': emoji}] for emoji in pos_emoji]
+
+    def label_sentiment(matcher, doc, i, matches):
+        match_id, start, end = matches[i]
+        if doc.vocab.strings[match_id] == 'HAPPY':
+            doc.sentiment += 0.1
+        span = doc[start : end]
+        token = span.merge(norm='happy emoji')
+
+    matcher = Matcher(en_vocab)
+    matcher.add('HAPPY', label_sentiment, *pos_patterns)
+    matches = matcher(doc)
+    assert doc.sentiment != 0
+    assert doc[1].norm_ == 'happy emoji'
+
+
 @pytest.mark.parametrize('words', [["Some", "words"]])
 def test_matcher_init(en_vocab, words):
     matcher = Matcher(en_vocab)
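The two tests added above mirror the API and usage docs. For orientation, a minimal sketch of the Matcher workflow they exercise; the `spacy.load('en')` call is an assumption (any pipeline exposing a vocab works), and `on_match` is a hypothetical callback name:

    import spacy
    from spacy.matcher import Matcher

    nlp = spacy.load('en')
    matcher = Matcher(nlp.vocab)

    def on_match(matcher, doc, i, matches):
        # Each match is a (match_id, start, end) triple; match_id is the
        # hash of the rule's string ID in the shared StringStore.
        match_id, start, end = matches[i]
        print(doc.vocab.strings[match_id], doc[start:end].text)

    matcher.add('Rule', on_match, [{'ORTH': 'test'}])
    matches = matcher(nlp(u'this is a test'))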
@@ -5,14 +5,13 @@ include ../../_includes/_mixins
 p Match sequences of tokens, based on pattern rules.

 +infobox("⚠️ Deprecation note")
-    .o-block
     | As of spaCy 2.0, #[code Matcher.add_pattern] and #[code Matcher.add_entity]
     | are deprecated and have been replaced with a simpler
     | #[+api("matcher#add") #[code Matcher.add]] that lets you add a list of
     | patterns and a callback for a given match ID. #[code Matcher.get_entity]
     | is now called #[+api("matcher#get") #[code matcher.get]].
     | #[code Matcher.load] (not useful, as it didn't allow specifying callbacks),
     | and #[code Matcher.has_entity] (now redundant) have been removed.

 +h(2, "init") Matcher.__init__
 +tag method
@@ -146,9 +145,9 @@ p Check whether the matcher contains rules for a match ID.

 +aside-code("Example").
     matcher = Matcher(nlp.vocab)
-    assert 'Rule' in matcher == False
+    assert 'Rule' not in matcher
     matcher.add('Rule', None, [{'ORTH': 'test'}])
-    assert 'Rule' in matcher == True
+    assert 'Rule' in matcher

 +table(["Name", "Type", "Description"])
     +row
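The fixed assertions here are more than style: Python chains comparisons, so `'Rule' in matcher == True` parses as `('Rule' in matcher) and (matcher == True)`, which is falsy even when the rule exists. A self-contained demonstration with a plain dict:

    d = {'Rule': 1}
    # Chained form: ('Rule' in d) and (d == True) -> False, despite the key existing
    assert not ('Rule' in d == True)
    # The idiomatic spellings the docs now use:
    assert 'Rule' in d
    assert 'Missing' not in d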
@@ -226,9 +225,9 @@ p

 +aside-code("Example").
     matcher.add('Rule', None, [{'ORTH': 'test'}])
-    assert 'Rule' in matcher == True
+    assert 'Rule' in matcher
     matcher.remove('Rule')
-    assert 'Rule' in matcher == False
+    assert 'Rule' not in matcher

 +table(["Name", "Type", "Description"])
     +row
@@ -248,8 +247,7 @@ p
 +aside-code("Example").
     pattern = [{'ORTH': 'test'}]
     matcher.add('Rule', None, pattern)
-    (on_match, patterns) = matcher.get('Rule')
-    assert patterns = [pattern]
+    on_match, patterns = matcher.get('Rule')

 +table(["Name", "Type", "Description"])
     +row
@@ -51,7 +51,7 @@ p Retrieve a string from a given hash, or vice versa.
 +aside-code("Example").
     stringstore = StringStore([u'apple', u'orange'])
     apple_hash = stringstore[u'apple']
-    assert apple_hash == 8566208034543834098L
+    assert apple_hash == 8566208034543834098
     assert stringstore[apple_hash] == u'apple'

 +table(["Name", "Type", "Description"])
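Dropping the `L` suffix here (and throughout the commit) is a Python 3 fix, not cosmetics: the Python 2 long-literal suffix is a syntax error under Python 3, while the bare literal is valid in both and still fits the range spaCy's hashes use:

    # 8566208034543834098L  -> SyntaxError under Python 3 (Python 2 long literal)
    n = 8566208034543834098  # valid in Python 2 and 3 alike
    assert n < 2**64         # within the uint64 range used for hashes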
@@ -72,8 +72,8 @@ p Check whether a string is in the store.

 +aside-code("Example").
     stringstore = StringStore([u'apple', u'orange'])
-    assert u'apple' in stringstore == True
-    assert u'cherry' in stringstore == False
+    assert u'apple' in stringstore
+    assert not u'cherry' in stringstore

 +table(["Name", "Type", "Description"])
     +row
@@ -115,7 +115,7 @@ p Add a string to the #[code StringStore].
     stringstore = StringStore([u'apple', u'orange'])
     banana_hash = stringstore.add(u'banana')
     assert len(stringstore) == 3
-    assert banana_hash == 2525716904149915114L
+    assert banana_hash == 2525716904149915114
     assert stringstore[banana_hash] == u'banana'
     assert stringstore[u'banana'] == banana_hash

@@ -215,3 +215,25 @@ p Load state from a binary string.
         +cell returns
         +cell #[code StringStore]
         +cell The #[code StringStore] object.
+
++h(2, "util") Utilities
+
++h(3, "hash_string") strings.hash_string
++tag function
+
+p Get a 64-bit hash for a given string.
+
++aside-code("Example").
+    from spacy.strings import hash_string
+    assert hash_string(u'apple') == 8566208034543834098
+
++table(["Name", "Type", "Description"])
+    +row
+        +cell #[code string]
+        +cell unicode
+        +cell The string to hash.
+
+    +footrow
+        +cell returns
+        +cell uint64
+        +cell The hash.
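The newly documented `strings.hash_string` exposes the same 64-bit hash the `StringStore` uses for its keys, so the two should agree; a short sketch (the equality with the store lookup is inferred from the matching example values above):

    from spacy.strings import StringStore, hash_string

    stringstore = StringStore([u'apple'])
    # Hashing directly matches the key the store assigns the string:
    assert hash_string(u'apple') == stringstore[u'apple'] == 8566208034543834098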
@@ -34,10 +34,10 @@ p Create the vocabulary.

     +row
         +cell #[code strings]
-        +cell #[code StringStore]
+        +cell #[code StringStore] or list
         +cell
-            | A #[code StringStore] that maps strings to hash values, and vice
-            | versa.
+            | A #[+api("stringstore") #[code StringStore]] that maps
+            | strings to hash values, and vice versa, or a list of strings.

     +footrow
         +cell returns
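The widened cell reflects that `Vocab.__init__` accepts either form for `strings`; a hedged sketch of both, following the row as amended:

    from spacy.vocab import Vocab
    from spacy.strings import StringStore

    vocab_from_list = Vocab(strings=[u'apple', u'orange'])
    vocab_from_store = Vocab(strings=StringStore([u'apple', u'orange']))
    # Either way, the strings end up in the vocab's StringStore:
    assert u'apple' in vocab_from_list.strings
    assert u'apple' in vocab_from_store.strings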
@@ -5,7 +5,7 @@ p
     | #[+api("vocab") #[code Vocab]], that will be
     | #[strong shared by multiple documents]. To save memory, spaCy also
     | encodes all strings to #[strong hash values] – in this case for example,
-    | "coffee" has the hash #[code 3197928453018144401L]. Entity labels like
+    | "coffee" has the hash #[code 3197928453018144401]. Entity labels like
     | "ORG" and part-of-speech tags like "VERB" are also encoded. Internally,
     | spaCy only "speaks" in hash values.

@@ -17,7 +17,7 @@ p
     | #[strong Doc]: A processed container of tokens in context.#[br]
     | #[strong Vocab]: The collection of lexemes.#[br]
     | #[strong StringStore]: The dictionary mapping hash values to strings, for
-    | example #[code 3197928453018144401L] → "coffee".
+    | example #[code 3197928453018144401] → "coffee".

 +image
     include ../../../assets/img/docs/vocab_stringstore.svg
@@ -35,8 +35,8 @@ p

 +code.
     doc = nlp(u'I like coffee')
-    assert doc.vocab.strings[u'coffee'] == 3197928453018144401L
-    assert doc.vocab.strings[3197928453018144401L] == u'coffee'
+    assert doc.vocab.strings[u'coffee'] == 3197928453018144401
+    assert doc.vocab.strings[3197928453018144401] == u'coffee'

 p
     | Now that all strings are encoded, the entries in the vocabulary
@@ -65,9 +65,9 @@ p

 +table(["text", "orth", "shape", "prefix", "suffix", "is_alpha", "is_digit"])
     - var style = [0, 1, 1, 0, 0, 1, 1]
-    +annotation-row(["I", "4690420944186131903L", "X", "I", "I", true, false], style)
-    +annotation-row(["love", "3702023516439754181L", "xxxx", "l", "ove", true, false], style)
-    +annotation-row(["coffee", "3197928453018144401L", "xxxx", "c", "ffe", true, false], style)
+    +annotation-row(["I", "4690420944186131903", "X", "I", "I", true, false], style)
+    +annotation-row(["love", "3702023516439754181", "xxxx", "l", "ove", true, false], style)
+    +annotation-row(["coffee", "3197928453018144401", "xxxx", "c", "ffe", true, false], style)

 p
     | The mapping of words to hashes doesn't depend on any state. To make sure
@@ -79,7 +79,7 @@ p

 p
     | However, hashes #[strong cannot be reversed] and there's no way to
-    | resolve #[code 3197928453018144401L] back to "coffee". All spaCy can do
+    | resolve #[code 3197928453018144401] back to "coffee". All spaCy can do
     | is look it up in the vocabulary. That's why you always need to make
     | sure all objects you create have access to the same vocabulary. If they
     | don't, spaCy might not be able to find the strings it needs.
@@ -89,17 +89,17 @@ p
     from spacy.vocab import Vocab

     doc = nlp(u'I like coffee') # original Doc
-    assert doc.vocab.strings[u'coffee'] == 3197928453018144401L # get hash
-    assert doc.vocab.strings[3197928453018144401L] == u'coffee' # 👍
+    assert doc.vocab.strings[u'coffee'] == 3197928453018144401 # get hash
+    assert doc.vocab.strings[3197928453018144401] == u'coffee' # 👍

     empty_doc = Doc(Vocab()) # new Doc with empty Vocab
-    # doc.vocab.strings[3197928453018144401L] will raise an error :(
+    # doc.vocab.strings[3197928453018144401] will raise an error :(

     empty_doc.vocab.strings.add(u'coffee') # add "coffee" and generate hash
-    assert doc.vocab.strings[3197928453018144401L] == u'coffee' # 👍
+    assert doc.vocab.strings[3197928453018144401] == u'coffee' # 👍

     new_doc = Doc(doc.vocab) # create new doc with first doc's vocab
-    assert doc.vocab.strings[3197928453018144401L] == u'coffee' # 👍
+    assert doc.vocab.strings[3197928453018144401] == u'coffee' # 👍

 p
     | If the vocabulary doesn't contain a hash for "coffee", spaCy will
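The corrected example above is the crux of the 101: a hash is only reversible through a vocabulary that has seen the string. A minimal, model-free sketch of that rule; treating the KeyError as the expected spaCy 2.x failure mode is an assumption:

    from spacy.tokens import Doc
    from spacy.vocab import Vocab

    doc = Doc(Vocab(), words=[u'I', u'like', u'coffee'])
    coffee_hash = doc.vocab.strings[u'coffee']  # string -> hash always works
    assert doc.vocab.strings[coffee_hash] == u'coffee'  # hash -> string: known here

    other_vocab = Vocab()  # has never seen "coffee"
    try:
        other_vocab.strings[coffee_hash]
    except KeyError:
        pass  # the hash cannot be reversed without the shared vocab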
@@ -53,9 +53,9 @@ p
 +code.
     doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')
     apple = doc[0]
-    assert [apple.pos_, apple.pos] == [u'PROPN', 17049293600679659579L]
-    assert [apple.tag_, apple.tag] == [u'NNP', 15794550382381185553L]
-    assert [apple.shape_, apple.shape] == [u'Xxxxx', 16072095006890171862L]
+    assert [apple.pos_, apple.pos] == [u'PROPN', 17049293600679659579]
+    assert [apple.tag_, apple.tag] == [u'NNP', 15794550382381185553]
+    assert [apple.shape_, apple.shape] == [u'Xxxxx', 16072095006890171862]
     assert apple.is_alpha == True
     assert apple.is_punct == False

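The paired attributes in this hunk follow spaCy's naming convention: `token.pos` is the uint64 hash, `token.pos_` the human-readable string, and the two round-trip through the shared StringStore. A sketch, again assuming an installed 'en' model:

    import spacy

    nlp = spacy.load('en')
    doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')
    apple = doc[0]
    # The underscored attribute is just the string behind the hashed one:
    assert apple.pos_ == doc.vocab.strings[apple.pos]
    assert apple.shape_ == doc.vocab.strings[apple.shape]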
@@ -72,16 +72,16 @@ p

 +code.
     doc = nlp(u'I love coffee')
-    coffee_hash = nlp.vocab.strings[u'coffee'] # 3197928453018144401L
+    coffee_hash = nlp.vocab.strings[u'coffee'] # 3197928453018144401
     coffee_text = nlp.vocab.strings[coffee_hash] # 'coffee'

-    assert doc[2].orth == coffee_hash == 3197928453018144401L
+    assert doc[2].orth == coffee_hash == 3197928453018144401
     assert doc[2].text == coffee_text == u'coffee'

-    beer_hash = doc.vocab.strings.add(u'beer') # 3073001599257881079L
+    beer_hash = doc.vocab.strings.add(u'beer') # 3073001599257881079
     beer_text = doc.vocab.strings[beer_hash] # 'beer'

-    unicorn_hash = doc.vocab.strings.add(u'🦄 ') # 18234233413267120783L
+    unicorn_hash = doc.vocab.strings.add(u'🦄 ') # 18234233413267120783
     unicorn_text = doc.vocab.strings[unicorn_hash] # '🦄 '

 +infobox