diff --git a/spacy/tests/test_matcher.py b/spacy/tests/test_matcher.py
index 9bbc9b24d..645618013 100644
--- a/spacy/tests/test_matcher.py
+++ b/spacy/tests/test_matcher.py
@@ -20,6 +20,40 @@ def matcher(en_vocab):
     return matcher
 
 
+def test_matcher_from_api_docs(en_vocab):
+    matcher = Matcher(en_vocab)
+    pattern = [{'ORTH': 'test'}]
+    assert len(matcher) == 0
+    matcher.add('Rule', None, pattern)
+    assert len(matcher) == 1
+    matcher.remove('Rule')
+    assert 'Rule' not in matcher
+    matcher.add('Rule', None, pattern)
+    assert 'Rule' in matcher
+    on_match, patterns = matcher.get('Rule')
+    assert patterns[0] == pattern
+
+
+def test_matcher_from_usage_docs(en_vocab):
+    text = "Wow 😀 This is really cool! 😂 😂"
+    doc = get_doc(en_vocab, words=text.split(' '))
+    pos_emoji = [u'😀', u'😃', u'😂', u'🤣', u'😊', u'😍']
+    pos_patterns = [[{'ORTH': emoji}] for emoji in pos_emoji]
+
+    def label_sentiment(matcher, doc, i, matches):
+        match_id, start, end = matches[i]
+        if doc.vocab.strings[match_id] == 'HAPPY':
+            doc.sentiment += 0.1
+        span = doc[start:end]
+        span.merge(norm='happy emoji')
+
+    matcher = Matcher(en_vocab)
+    matcher.add('HAPPY', label_sentiment, *pos_patterns)
+    matches = matcher(doc)
+    assert doc.sentiment != 0
+    assert doc[1].norm_ == 'happy emoji'
+
+
 @pytest.mark.parametrize('words', [["Some", "words"]])
 def test_matcher_init(en_vocab, words):
     matcher = Matcher(en_vocab)
diff --git a/website/docs/api/matcher.jade b/website/docs/api/matcher.jade
index c837fe434..e7c0aaaf2 100644
--- a/website/docs/api/matcher.jade
+++ b/website/docs/api/matcher.jade
@@ -5,14 +5,13 @@ include ../../_includes/_mixins
 p Match sequences of tokens, based on pattern rules.
 
 +infobox("⚠️ Deprecation note")
-    .o-block
-        | As of spaCy 2.0, #[code Matcher.add_pattern] and #[code Matcher.add_entity]
-        | are deprecated and have been replaced with a simpler
-        | #[+api("matcher#add") #[code Matcher.add]] that lets you add a list of
-        | patterns and a callback for a given match ID. #[code Matcher.get_entity]
-        | is now called #[+api("matcher#get") #[code matcher.get]].
-        | #[code Matcher.load] (not useful, as it didn't allow specifying callbacks),
-        | and #[code Matcher.has_entity] (now redundant) have been removed.
+    | As of spaCy 2.0, #[code Matcher.add_pattern] and #[code Matcher.add_entity]
+    | are deprecated and have been replaced with a simpler
+    | #[+api("matcher#add") #[code Matcher.add]] that lets you add a list of
+    | patterns and a callback for a given match ID. #[code Matcher.get_entity]
+    | is now called #[+api("matcher#get") #[code Matcher.get]].
+    | #[code Matcher.load] (not useful, as it didn't allow specifying callbacks)
+    | and #[code Matcher.has_entity] (now redundant) have been removed.
 
 +h(2, "init") Matcher.__init__
     +tag method
@@ -146,9 +145,9 @@ p Check whether the matcher contains rules for a match ID.
 
 +aside-code("Example").
     matcher = Matcher(nlp.vocab)
-    assert 'Rule' in matcher == False
+    assert 'Rule' not in matcher
     matcher.add('Rule', None, [{'ORTH': 'test'}])
-    assert 'Rule' in matcher == True
+    assert 'Rule' in matcher
 
 +table(["Name", "Type", "Description"])
     +row
@@ -226,9 +225,9 @@ p
 
 +aside-code("Example").
     matcher.add('Rule', None, [{'ORTH': 'test'}])
-    assert 'Rule' in matcher == True
+    assert 'Rule' in matcher
     matcher.remove('Rule')
-    assert 'Rule' in matcher == False
+    assert 'Rule' not in matcher
 
 +table(["Name", "Type", "Description"])
     +row
@@ -248,8 +247,7 @@ p
 +aside-code("Example").
     pattern = [{'ORTH': 'test'}]
     matcher.add('Rule', None, pattern)
-    (on_match, patterns) = matcher.get('Rule')
-    assert patterns = [pattern]
+    on_match, patterns = matcher.get('Rule')
 
 +table(["Name", "Type", "Description"])
     +row
diff --git a/website/docs/api/stringstore.jade b/website/docs/api/stringstore.jade
index 969c8a6a5..c17fb1db9 100644
--- a/website/docs/api/stringstore.jade
+++ b/website/docs/api/stringstore.jade
@@ -51,7 +51,7 @@ p Retrieve a string from a given hash, or vice versa.
 +aside-code("Example").
     stringstore = StringStore([u'apple', u'orange'])
     apple_hash = stringstore[u'apple']
-    assert apple_hash == 8566208034543834098L
+    assert apple_hash == 8566208034543834098
     assert stringstore[apple_hash] == u'apple'
 
 +table(["Name", "Type", "Description"])
@@ -72,8 +72,8 @@ p Check whether a string is in the store.
 
 +aside-code("Example").
     stringstore = StringStore([u'apple', u'orange'])
-    assert u'apple' in stringstore == True
-    assert u'cherry' in stringstore == False
+    assert u'apple' in stringstore
+    assert u'cherry' not in stringstore
 
 +table(["Name", "Type", "Description"])
     +row
@@ -115,7 +115,7 @@ p Add a string to the #[code StringStore].
     stringstore = StringStore([u'apple', u'orange'])
     banana_hash = stringstore.add(u'banana')
     assert len(stringstore) == 3
-    assert banana_hash == 2525716904149915114L
+    assert banana_hash == 2525716904149915114
     assert stringstore[banana_hash] == u'banana'
     assert stringstore[u'banana'] == banana_hash
 
@@ -215,3 +215,25 @@ p Load state from a binary string.
         +cell returns
         +cell #[code StringStore]
         +cell The #[code StringStore] object.
+
++h(2, "util") Utilities
+
++h(3, "hash_string") strings.hash_string
+    +tag function
+
+p Get a 64-bit hash for a given string.
+
++aside-code("Example").
+    from spacy.strings import hash_string
+    assert hash_string(u'apple') == 8566208034543834098
+
++table(["Name", "Type", "Description"])
+    +row
+        +cell #[code string]
+        +cell unicode
+        +cell The string to hash.
+
+    +footrow
+        +cell returns
+        +cell uint64
+        +cell The hash.
diff --git a/website/docs/api/vocab.jade b/website/docs/api/vocab.jade
index ce62612d3..4d3e0828a 100644
--- a/website/docs/api/vocab.jade
+++ b/website/docs/api/vocab.jade
@@ -34,10 +34,10 @@ p Create the vocabulary.
 
     +row
         +cell #[code strings]
-        +cell #[code StringStore]
+        +cell #[code StringStore] or list
         +cell
-            | A #[code StringStore] that maps strings to hash values, and vice
-            | versa.
+            | A #[+api("stringstore") #[code StringStore]] that maps
+            | strings to hash values, and vice versa, or a list of strings.
 
     +footrow
         +cell returns
diff --git a/website/docs/usage/_spacy-101/_vocab.jade b/website/docs/usage/_spacy-101/_vocab.jade
index e59518a25..f4cc426c2 100644
--- a/website/docs/usage/_spacy-101/_vocab.jade
+++ b/website/docs/usage/_spacy-101/_vocab.jade
@@ -5,7 +5,7 @@ p
     | #[+api("vocab") #[code Vocab]], that will be
     | #[strong shared by multiple documents]. To save memory, spaCy also
     | encodes all strings to #[strong hash values] – in this case for example,
-    | "coffee" has the hash #[code 3197928453018144401L]. Entity labels like
+    | "coffee" has the hash #[code 3197928453018144401]. Entity labels like
     | "ORG" and part-of-speech tags like "VERB" are also encoded. Internally,
     | spaCy only "speaks" in hash values.
 
@@ -17,7 +17,7 @@ p
     | #[strong Doc]: A processed container of tokens in context.#[br]
     | #[strong Vocab]: The collection of lexemes.#[br]
     | #[strong StringStore]: The dictionary mapping hash values to strings, for
-    | example #[code 3197928453018144401L] → "coffee".
+    | example #[code 3197928453018144401] → "coffee".
 
 +image
     include ../../../assets/img/docs/vocab_stringstore.svg
 
@@ -35,8 +35,8 @@ p
 
 +code.
     doc = nlp(u'I like coffee')
-    assert doc.vocab.strings[u'coffee'] == 3197928453018144401L
-    assert doc.vocab.strings[3197928453018144401L] == u'coffee'
+    assert doc.vocab.strings[u'coffee'] == 3197928453018144401
+    assert doc.vocab.strings[3197928453018144401] == u'coffee'
 
 p
     | Now that all strings are encoded, the entries in the vocabulary
@@ -65,9 +65,9 @@ p
 
 +table(["text", "orth", "shape", "prefix", "suffix", "is_alpha", "is_digit"])
     - var style = [0, 1, 1, 0, 0, 1, 1]
-    +annotation-row(["I", "4690420944186131903L", "X", "I", "I", true, false], style)
-    +annotation-row(["love", "3702023516439754181L", "xxxx", "l", "ove", true, false], style)
-    +annotation-row(["coffee", "3197928453018144401L", "xxxx", "c", "ffe", true, false], style)
+    +annotation-row(["I", "4690420944186131903", "X", "I", "I", true, false], style)
+    +annotation-row(["love", "3702023516439754181", "xxxx", "l", "ove", true, false], style)
+    +annotation-row(["coffee", "3197928453018144401", "xxxx", "c", "ffe", true, false], style)
 
 p
     | The mapping of words to hashes doesn't depend on any state. To make sure
@@ -79,7 +79,7 @@ p
 
 p
     | However, hashes #[strong cannot be reversed] and there's no way to
-    | resolve #[code 3197928453018144401L] back to "coffee". All spaCy can do
+    | resolve #[code 3197928453018144401] back to "coffee". All spaCy can do
     | is look it up in the vocabulary. That's why you always need to make
     | sure all objects you create have access to the same vocabulary. If they
     | don't, spaCy might not be able to find the strings it needs.
@@ -89,17 +89,17 @@ p
     from spacy.vocab import Vocab
 
     doc = nlp(u'I like coffee') # original Doc
-    assert doc.vocab.strings[u'coffee'] == 3197928453018144401L # get hash
-    assert doc.vocab.strings[3197928453018144401L] == u'coffee' # 👍
+    assert doc.vocab.strings[u'coffee'] == 3197928453018144401 # get hash
+    assert doc.vocab.strings[3197928453018144401] == u'coffee' # 👍
 
     empty_doc = Doc(Vocab()) # new Doc with empty Vocab
-    # doc.vocab.strings[3197928453018144401L] will raise an error :(
+    # empty_doc.vocab.strings[3197928453018144401] will raise an error :(
 
     empty_doc.vocab.strings.add(u'coffee') # add "coffee" and generate hash
-    assert doc.vocab.strings[3197928453018144401L] == u'coffee' # 👍
+    assert empty_doc.vocab.strings[3197928453018144401] == u'coffee' # 👍
 
     new_doc = Doc(doc.vocab) # create new doc with first doc's vocab
-    assert doc.vocab.strings[3197928453018144401L] == u'coffee' # 👍
+    assert new_doc.vocab.strings[3197928453018144401] == u'coffee' # 👍
 
 p
     | If the vocabulary doesn't contain a hash for "coffee", spaCy will
diff --git a/website/docs/usage/lightning-tour.jade b/website/docs/usage/lightning-tour.jade
index f144b4f05..89dac830c 100644
--- a/website/docs/usage/lightning-tour.jade
+++ b/website/docs/usage/lightning-tour.jade
@@ -53,9 +53,9 @@ p
 
 +code.
     doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')
     apple = doc[0]
-    assert [apple.pos_, apple.pos] == [u'PROPN', 17049293600679659579L]
-    assert [apple.tag_, apple.tag] == [u'NNP', 15794550382381185553L]
-    assert [apple.shape_, apple.shape] == [u'Xxxxx', 16072095006890171862L]
+    assert [apple.pos_, apple.pos] == [u'PROPN', 17049293600679659579]
+    assert [apple.tag_, apple.tag] == [u'NNP', 15794550382381185553]
+    assert [apple.shape_, apple.shape] == [u'Xxxxx', 16072095006890171862]
     assert apple.is_alpha == True
     assert apple.is_punct == False
 
@@ -72,16 +72,16 @@ p
 
 +code.
     doc = nlp(u'I love coffee')
-    coffee_hash = nlp.vocab.strings[u'coffee'] # 3197928453018144401L
+    coffee_hash = nlp.vocab.strings[u'coffee'] # 3197928453018144401
     coffee_text = nlp.vocab.strings[coffee_hash] # 'coffee'
-    assert doc[2].orth == coffee_hash == 3197928453018144401L
+    assert doc[2].orth == coffee_hash == 3197928453018144401
     assert doc[2].text == coffee_text == u'coffee'
 
-    beer_hash = doc.vocab.strings.add(u'beer') # 3073001599257881079L
+    beer_hash = doc.vocab.strings.add(u'beer') # 3073001599257881079
     beer_text = doc.vocab.strings[beer_hash] # 'beer'
 
-    unicorn_hash = doc.vocab.strings.add(u'🦄 ') # 18234233413267120783L
+    unicorn_hash = doc.vocab.strings.add(u'🦄 ') # 18234233413267120783
     unicorn_text = doc.vocab.strings[unicorn_hash] # '🦄 '
 
 +infobox
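
Reviewer note, not part of the patch: a minimal sketch that sanity-checks the 2.0 API behaviour these docs describe. It assumes spaCy 2.0+ and builds a bare Vocab, so no model download is needed; the 'Rule' key and the pattern are illustrative, and the hash values are the ones documented above.

    # Sketch only -- exercises the documented Matcher/StringStore behaviour.
    from spacy.matcher import Matcher
    from spacy.strings import hash_string
    from spacy.tokens import Doc
    from spacy.vocab import Vocab

    vocab = Vocab(strings=[u'apple'])   # Vocab now also accepts a list of strings
    matcher = Matcher(vocab)
    pattern = [{'ORTH': 'test'}]

    matcher.add('Rule', None, pattern)  # add(key, on_match, *patterns)
    assert 'Rule' in matcher            # __contains__ returns a plain bool
    on_match, patterns = matcher.get('Rule')
    assert on_match is None and patterns[0] == pattern

    doc = Doc(vocab, words=[u'a', u'test'])
    matches = matcher(doc)              # list of (match_id, start, end) tuples
    match_id, start, end = matches[0]
    assert doc[start:end].text == u'test'

    matcher.remove('Rule')
    assert 'Rule' not in matcher

    # Hashes are plain ints on both Python 2 and 3 -- no trailing `L`.
    assert hash_string(u'apple') == vocab.strings[u'apple'] == 8566208034543834098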