Mirror of https://github.com/explosion/spaCy.git
Remove outdated examples
This commit is contained in:
parent daed7ff8fe
commit db843735d3
@@ -1,5 +0,0 @@
An example of inventory counting using the spaCy NLP library. Meant to show how to instantiate spaCy's English class, and to allow reusability by reloading the main module.

In the future, a better implementation of this library would apply machine learning to each query, learning what to classify as the quantitative statement ("55 kgs of") versus the actual item being counted (for example, how likely a prepositional object is to be the counted item when certain qualifiers appear in the statement).
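For reference, the same idea can be sketched against a current spaCy release. This is an illustrative sketch only, not part of the removed example; it assumes spaCy v2+ with the en_core_web_sm model installed, and the function name is hypothetical.

# Illustrative sketch only (not part of the removed files).
# Assumes: pip install spacy && python -m spacy download en_core_web_sm
import spacy

nlp = spacy.load("en_core_web_sm")

def extract_count(text):
    # Return an (amount, item) pair for simple queries like '6 lobster cakes'.
    doc = nlp(text)
    amount, item = None, None
    for token in doc:
        # A numeric modifier attached to a noun gives us both pieces at once.
        if token.dep_ == "nummod" and token.head.pos_ == "NOUN":
            amount, item = token.text, token.head.text
    return amount, item

print(extract_count("i got 65 kgs of carrots"))  # ('65', 'kgs') with this simple rule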
@@ -1,35 +0,0 @@
class Inventory:
    """
    Inventory class - a struct{}-like feature to house inventory counts
    across modules.
    """
    originalQuery = None
    item = ""
    unit = ""
    amount = ""

    def __init__(self, statement):
        """
        Constructor - only takes in the original query/statement
        :return: new Inventory object
        """
        self.originalQuery = statement

    def __str__(self):
        return str(self.amount) + ' ' + str(self.unit) + ' ' + str(self.item)

    def printInfo(self):
        print '-------------Inventory Count------------'
        print 'Original Query: ' + str(self.originalQuery)
        print 'Amount: ' + str(self.amount)
        print 'Unit: ' + str(self.unit)
        print 'Item: ' + str(self.item)
        print '----------------------------------------'

    def isValid(self):
        if not self.item or not self.unit or not self.amount or not self.originalQuery:
            return False
        else:
            return True
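A quick illustrative use of the class above, with hypothetical values and the same Python 2 style as the example (not part of the removed files):

# Illustrative usage only (hypothetical values).
inv = Inventory(u'6 lobster cakes')
inv.amount = '6'
inv.unit = 'cakes'
inv.item = 'lobster cakes'
print str(inv)        # -> 6 cakes lobster cakes
print inv.isValid()   # -> True, since all fields are populated
inv.printInfo()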
@@ -1,92 +0,0 @@
from inventory import Inventory


def runTest(nlp):
    testset = []
    testset += [nlp(u'6 lobster cakes')]
    testset += [nlp(u'6 avocados')]
    testset += [nlp(u'fifty five carrots')]
    testset += [nlp(u'i have 55 carrots')]
    testset += [nlp(u'i got me some 9 cabbages')]
    testset += [nlp(u'i got 65 kgs of carrots')]

    result = []
    for doc in testset:
        c = decodeInventoryEntry_level1(doc)
        if not c.isValid():
            c = decodeInventoryEntry_level2(doc)
        result.append(c)

    for i in result:
        i.printInfo()


def decodeInventoryEntry_level1(document):
    """
    Decodes a basic entry such as: '6 lobster cakes' or '6 cakes'.
    :param document: NLP Doc object
    :return: Inventory object (check isValid() to see whether decoding succeeded)
    """
    count = Inventory(str(document))
    for token in document:
        if token.pos_ in (u'NOUN', u'NNS', u'NN'):
            item = str(token)

            for child in token.children:
                if child.dep_ == u'compound' or child.dep_ == u'ad':
                    item = str(child) + str(item)
                elif child.dep_ == u'nummod':
                    count.amount = str(child).strip()
                    for numerical_child in child.children:
                        # this isn't arithmetic; the number is just treated as a string and prepended
                        count.amount = str(numerical_child) + str(count.amount).strip()
                else:
                    print "WARNING: unknown child: " + str(child) + ':' + str(child.dep_)

            count.item = item
            count.unit = item

    return count


def decodeInventoryEntry_level2(document):
    """
    Level 2 entry, a more complicated parsing scheme that covers examples such as
    'i have 80 boxes of freshly baked pies'.

    :param document: NLP Doc object
    :return: Inventory object (check isValid() to see whether decoding succeeded)
    """
    count = Inventory(str(document))

    for token in document:
        # Look for a prepositional object that is a noun (this is the item we are counting).
        # If found, look at its dependency: if it is a preposition that is not indicative of
        # inventory location, the dependency of the preposition must be a noun.
        if token.dep_ in (u'pobj', u'meta') and token.pos_ in (u'NOUN', u'NNS', u'NN'):
            item = ''

            # Go through all the token's children; these are possible adjectives and other add-ons.
            # This deals with cases such as 'hollow rounded waffle pancakes'.
            for i in token.children:
                item += ' ' + str(i)

            item += ' ' + str(token)
            count.item = item

            # Get the head of the item:
            if token.head.dep_ != u'prep':
                # Break out of the loop, this is a confusing entry
                break
            else:
                amountUnit = token.head.head
                count.unit = str(amountUnit)

                for inner in amountUnit.children:
                    if inner.pos_ == u'NUM':
                        count.amount += str(inner)
    return count
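To see why the level-2 decoder walks from the prepositional object up through its head, it can help to print the dependency parse it relies on. An illustrative snippet only, using the same old spacy.en API and Python 2 style as the example above:

# Illustrative only: inspect the parse that decodeInventoryEntry_level2 relies on.
from spacy.en import English

nlp = English()
doc = nlp(u'i got 65 kgs of carrots')
for token in doc:
    print str(token), token.dep_, str(token.head), token.pos_
# 'carrots' is expected to be a pobj whose head 'of' is a prep attached to 'kgs',
# and '65' a nummod child of 'kgs' -- the path the decoder follows.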
@@ -1,30 +0,0 @@
import inventoryCount as mainModule
import os
from spacy.en import English

if __name__ == '__main__':
    """
    Main module for this example - loads the English main NLP class,
    and keeps it in RAM while waiting for the user to re-run it. Allows the
    developer to re-edit their module under testing without having
    to wait as long to load the English class.
    """

    # Set the NLP object here for the parameters you want to see,
    # or just leave it blank and get all the opts
    print "Loading English module... this will take a while."
    nlp = English()
    print "Done loading English module."
    while True:
        try:
            reload(mainModule)
            mainModule.runTest(nlp)
            raw_input('================ To reload main module, press Enter ================')
        except Exception, e:
            print "Unexpected error: " + str(e)
            continue
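On Python 3, reload and raw_input are no longer builtins; the same keep-the-model-in-RAM workflow would look roughly like this. Illustrative only, assuming a modern spaCy install and the same (hypothetical, relative to current spaCy) inventoryCount module:

# Illustrative Python 3 equivalent (not part of the removed files).
import importlib
import spacy

import inventoryCount as mainModule  # hypothetical module, as in the example above

if __name__ == '__main__':
    print("Loading the English pipeline... this can take a while.")
    nlp = spacy.load("en_core_web_sm")   # assumes the model is installed
    print("Done.")
    while True:
        try:
            importlib.reload(mainModule)
            mainModule.runTest(nlp)
            input('=== To reload the main module, press Enter ===')
        except Exception as e:
            print("Unexpected error:", e)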
@@ -1,161 +0,0 @@
from __future__ import unicode_literals, print_function

import spacy.en
import spacy.matcher
from spacy.attrs import ORTH, TAG, LOWER, IS_ALPHA, FLAG63

import plac


def main():
    nlp = spacy.en.English()
    example = u"I prefer Siri to Google Now. I'll google now to find out how the google now service works."
    before = nlp(example)
    print("Before")
    for ent in before.ents:
        print(ent.text, ent.label_, [w.tag_ for w in ent])
    # Output:
    # Google ORG [u'NNP']
    # google ORG [u'VB']
    # google ORG [u'NNP']
    nlp.matcher.add(
        "GoogleNow",  # Entity ID: Not really used at the moment.
        "PRODUCT",    # Entity type: should be one of the types in the NER data
        {"wiki_en": "Google_Now"},  # Arbitrary attributes. Currently unused.
        [  # List of patterns that can be Surface Forms of the entity

            # This Surface Form matches "Google Now", verbatim
            [  # Each Surface Form is a list of Token Specifiers.
                {  # This Token Specifier matches tokens whose orth field is "Google"
                    ORTH: "Google"
                },
                {  # This Token Specifier matches tokens whose orth field is "Now"
                    ORTH: "Now"
                }
            ],
            [  # This Surface Form matches "google now", verbatim, and requires
               # "google" to have the NNP tag. This helps prevent the pattern from
               # matching cases like "I will google now to look up the time"
                {
                    ORTH: "google",
                    TAG: "NNP"
                },
                {
                    ORTH: "now"
                }
            ]
        ]
    )
    after = nlp(example)
    print("After")
    for ent in after.ents:
        print(ent.text, ent.label_, [w.tag_ for w in ent])
    # Output
    # Google Now PRODUCT [u'NNP', u'RB']
    # google ORG [u'VB']
    # google now PRODUCT [u'NNP', u'RB']
    #
    # You can customize attribute values in the lexicon, and then refer to the
    # new attributes in your Token Specifiers.
    # This is particularly good for word-set membership.
    #
    australian_capitals = ['Brisbane', 'Sydney', 'Canberra', 'Melbourne', 'Hobart',
                           'Darwin', 'Adelaide', 'Perth']
    # Internally, the tokenizer immediately maps each token to a pointer to a
    # LexemeC struct. These structs hold various features, e.g. the integer IDs
    # of the normalized string forms.
    # For our purposes, the key attribute is a 64-bit integer, used as a bit field.
    # spaCy currently only uses 12 of the bits for its built-in features, so
    # the others are available for use. It's best to use the higher bits, as
    # future versions of spaCy may add more flags. For instance, we might add
    # a built-in IS_MONTH flag, taking up FLAG13. So, we bind our user-field to
    # FLAG63 here.
    is_australian_capital = FLAG63
    # Now we need to set the flag value. It's False on all tokens by default,
    # so we just need to set it to True for the tokens we want.
    # Here we iterate over the strings, and set it on only the literal matches.
    for string in australian_capitals:
        lexeme = nlp.vocab[string]
        lexeme.set_flag(is_australian_capital, True)
    print('Sydney', nlp.vocab[u'Sydney'].check_flag(is_australian_capital))
    print('sydney', nlp.vocab[u'sydney'].check_flag(is_australian_capital))
    # If we want case-insensitive matching, we have to be a little bit more
    # round-about, as there's no case-insensitive index to the vocabulary. So
    # we have to iterate over the vocabulary.
    # We'll be looking up attribute IDs in this set a lot, so it's good to pre-build it
    target_ids = {nlp.vocab.strings[s.lower()] for s in australian_capitals}
    for lexeme in nlp.vocab:
        if lexeme.lower in target_ids:
            lexeme.set_flag(is_australian_capital, True)
    print('Sydney', nlp.vocab[u'Sydney'].check_flag(is_australian_capital))
    print('sydney', nlp.vocab[u'sydney'].check_flag(is_australian_capital))
    print('SYDNEY', nlp.vocab[u'SYDNEY'].check_flag(is_australian_capital))
    # Output
    # Sydney True
    # sydney False
    # Sydney True
    # sydney True
    # SYDNEY True
    #
    # The key thing to note here is that we're setting these attributes once,
    # over the vocabulary --- and then reusing them at run-time. This means the
    # amortized complexity of anything we do this way is going to be O(1). You
    # can match over expressions that need to have sets with tens of thousands
    # of values, e.g. "all the street names in Germany", and you'll still have
    # O(1) complexity. Most regular expression algorithms don't scale well to
    # this sort of problem.
    #
    # Now, let's use this in a pattern
    nlp.matcher.add("AuCitySportsTeam", "ORG", {},
        [
            [
                {LOWER: "the"},
                {is_australian_capital: True},
                {TAG: "NNS"}
            ],
            [
                {LOWER: "the"},
                {is_australian_capital: True},
                {TAG: "NNPS"}
            ],
            [
                {LOWER: "the"},
                {IS_ALPHA: True},  # Allow a word in between, e.g. The Western Sydney
                {is_australian_capital: True},
                {TAG: "NNS"}
            ],
            [
                {LOWER: "the"},
                {IS_ALPHA: True},  # Allow a word in between, e.g. The Western Sydney
                {is_australian_capital: True},
                {TAG: "NNPS"}
            ]
        ])
    doc = nlp(u'The pattern should match the Brisbane Broncos and the South Darwin Spiders, but not the Colorado Boulders')
    for ent in doc.ents:
        print(ent.text, ent.label_)
    # Output
    # the Brisbane Broncos ORG
    # the South Darwin Spiders ORG


# Output
# Before
# Google ORG [u'NNP']
# google ORG [u'VB']
# google ORG [u'NNP']
# After
# Google Now PRODUCT [u'NNP', u'RB']
# google ORG [u'VB']
# google now PRODUCT [u'NNP', u'RB']
# Sydney True
# sydney False
# Sydney True
# sydney True
# SYDNEY True
# the Brisbane Broncos ORG
# the South Darwin Spiders ORG

if __name__ == '__main__':
    main()
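In current spaCy releases, patterns like the verbatim "Google Now" match above are typically written against spacy.matcher.Matcher rather than an nlp.matcher attribute. A minimal, illustrative sketch (not part of the removed file; assumes spaCy v3 and the en_core_web_sm model):

# Illustrative modern equivalent of the "Google Now" patterns above.
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

# Two surface forms, analogous to the patterns above: verbatim "Google Now",
# and lower-case "google now" where "google" must carry the NNP tag.
matcher.add("GoogleNow", [
    [{"ORTH": "Google"}, {"ORTH": "Now"}],
    [{"ORTH": "google", "TAG": "NNP"}, {"ORTH": "now"}],
])

doc = nlp(u"I prefer Siri to Google Now.")
for match_id, start, end in matcher(doc):
    print(doc.vocab.strings[match_id], doc[start:end].text)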
@@ -1,36 +0,0 @@
# encoding: utf8
from __future__ import unicode_literals, print_function
import plac
import codecs
import pathlib
import random

import twython
import spacy.en

import _handler


class Connection(twython.TwythonStreamer):
    def __init__(self, keys_dir, nlp, query):
        keys_dir = pathlib.Path(keys_dir)
        read = lambda fn: (keys_dir / (fn + '.txt')).open().read().strip()
        api_key = map(read, ['key', 'secret', 'token', 'token_secret'])
        twython.TwythonStreamer.__init__(self, *api_key)
        self.nlp = nlp
        self.query = query

    def on_success(self, data):
        _handler.handle_tweet(self.nlp, data, self.query)
        if random.random() >= 0.1:
            reload(_handler)


def main(keys_dir, term):
    nlp = spacy.en.English()
    twitter = Connection(keys_dir, nlp, term)
    twitter.statuses.filter(track=term, language='en')


if __name__ == '__main__':
    plac.call(main)
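Since main is dispatched through plac.call, the script takes the keys directory and the search term as positional command-line arguments, for example (the script name here is hypothetical):

python twitter_filter.py ~/.twitter_keys spacy

It expects key.txt, secret.txt, token.txt and token_secret.txt inside the keys directory, and a sibling _handler module exposing handle_tweet(nlp, data, query).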