Remove outdated examples

ines 2017-10-26 18:46:25 +02:00
parent daed7ff8fe
commit db843735d3
6 changed files with 0 additions and 359 deletions

View File

@ -1,5 +0,0 @@
An example of inventory counting using the spaCy NLP library. Meant to show how to instantiate spaCy's English class, and to allow quick iteration by reloading the main module.
In the future, a better implementation of this example would apply machine learning to each query, learning to distinguish the quantitative part of the statement ('55 kgs of') from the actual item being counted (e.g. how likely a prepositional object is to be the counted item, given the qualifiers that appear in the statement).
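
For orientation, here is a minimal sketch of how the removed example is meant to be driven end to end, assuming the spacy.en English class and the inventoryCount module shown in this commit (the query string is taken from the example's own test set):

from spacy.en import English
import inventoryCount

# Loading the English class is slow, so do it once and keep the object around.
nlp = English()

# Parse a single query and try the simple decoder first,
# falling back to the more elaborate one if the result comes back incomplete.
doc = nlp(u'i got 65 kgs of carrots')
count = inventoryCount.decodeInventoryEntry_level1(doc)
if not count.isValid():
    count = inventoryCount.decodeInventoryEntry_level2(doc)
count.printInfo()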

View File

@ -1,35 +0,0 @@
class Inventory:
"""
    Inventory class - a struct-like container to hold inventory counts
    across modules.
"""
originalQuery = None
item = ""
unit = ""
amount = ""
def __init__(self, statement):
"""
Constructor - only takes in the original query/statement
:return: new Inventory object
"""
self.originalQuery = statement
pass
def __str__(self):
return str(self.amount) + ' ' + str(self.unit) + ' ' + str(self.item)
def printInfo(self):
print '-------------Inventory Count------------'
print "Original Query: " + str(self.originalQuery)
print 'Amount: ' + str(self.amount)
print 'Unit: ' + str(self.unit)
print 'Item: ' + str(self.item)
print '----------------------------------------'
def isValid(self):
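        """Return True only when the original query, item, unit and amount have all been set."""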
if not self.item or not self.unit or not self.amount or not self.originalQuery:
return False
else:
return True

View File

@ -1,92 +0,0 @@
from inventory import Inventory
def runTest(nlp):
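    """Run the level 1 decoder over a small set of example queries, falling back to level 2, and print the results."""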
testset = []
testset += [nlp(u'6 lobster cakes')]
    testset += [nlp(u'6 avocados')]
testset += [nlp(u'fifty five carrots')]
testset += [nlp(u'i have 55 carrots')]
testset += [nlp(u'i got me some 9 cabbages')]
testset += [nlp(u'i got 65 kgs of carrots')]
result = []
for doc in testset:
c = decodeInventoryEntry_level1(doc)
if not c.isValid():
c = decodeInventoryEntry_level2(doc)
result.append(c)
for i in result:
i.printInfo()
def decodeInventoryEntry_level1(document):
"""
    Decodes a basic entry such as '6 lobster cakes' or '6 cakes'.
    :param document: spaCy Doc object
    :return: Inventory object (call isValid() to check whether decoding succeeded)
"""
count = Inventory(str(document))
for token in document:
        if token.pos_ in (u'NOUN', u'NNS', u'NN'):
item = str(token)
for child in token.children:
                if child.dep_ == u'compound' or child.dep_ == u'amod':
item = str(child) + str(item)
elif child.dep_ == u'nummod':
count.amount = str(child).strip()
for numerical_child in child.children:
                        # This isn't arithmetic; the number tokens are simply concatenated as strings.
count.amount = str(numerical_child) + str(count.amount).strip()
                else:
                    print "WARNING: unknown child: " + str(child) + ': ' + str(child.dep_)
            # At this basic level there is no separate unit of measure, so reuse the item.
            count.item = item
            count.unit = item
return count
def decodeInventoryEntry_level2(document):
"""
    Level 2 entry: a more involved parsing scheme that covers examples such as
    'i have 80 boxes of freshly baked pies'.
    :param document: spaCy Doc object
    :return: Inventory object (call isValid() to check whether decoding succeeded)
"""
count = Inventory(str(document))
for token in document:
        # Look for a prepositional object that is a noun (this is the item we are counting).
        # If found, check its head: the head should be a preposition, and that preposition's
        # head is treated as the unit, with its numeric children giving the amount.
        if token.dep_ in (u'pobj', u'meta') and token.pos_ in (u'NOUN', u'NNS', u'NN'):
item = ''
            # Go through all of the token's children; these are possible adjectives and other add-ons.
            # This deals with cases such as 'hollow rounded waffle pancakes'.
for i in token.children:
item += ' ' + str(i)
item += ' ' + str(token)
            count.item = item.strip()
# Get the head of the item:
if token.head.dep_ != u'prep':
# Break out of the loop, this is a confusing entry
break
else:
amountUnit = token.head.head
count.unit = str(amountUnit)
for inner in amountUnit.children:
if inner.pos_ == u'NUM':
count.amount += str(inner)
return count

View File

@ -1,30 +0,0 @@
import inventoryCount as mainModule
import os
from spacy.en import English
if __name__ == '__main__':
"""
    Main module for this example - loads the English main NLP class,
    and keeps it in RAM while waiting for the user to re-run the test module.
    This lets the developer keep editing the module under test without
    having to wait for the English class to load again each time.
"""
    # Construct the English object here with only the components you need,
    # or just leave the arguments empty to load everything.
print "Loading English module... this will take a while."
nlp = English()
print "Done loading English module."
while True:
try:
reload(mainModule)
mainModule.runTest(nlp)
raw_input('================ To reload main module, press Enter ================')
except Exception, e:
print "Unexpected error: " + str(e)
continue

View File

@ -1,161 +0,0 @@
from __future__ import unicode_literals, print_function
import spacy.en
import spacy.matcher
from spacy.attrs import ORTH, TAG, LOWER, IS_ALPHA, FLAG63
import plac
def main():
nlp = spacy.en.English()
example = u"I prefer Siri to Google Now. I'll google now to find out how the google now service works."
before = nlp(example)
print("Before")
for ent in before.ents:
print(ent.text, ent.label_, [w.tag_ for w in ent])
# Output:
# Google ORG [u'NNP']
# google ORG [u'VB']
# google ORG [u'NNP']
nlp.matcher.add(
"GoogleNow", # Entity ID: Not really used at the moment.
"PRODUCT", # Entity type: should be one of the types in the NER data
{"wiki_en": "Google_Now"}, # Arbitrary attributes. Currently unused.
[ # List of patterns that can be Surface Forms of the entity
# This Surface Form matches "Google Now", verbatim
[ # Each Surface Form is a list of Token Specifiers.
{ # This Token Specifier matches tokens whose orth field is "Google"
ORTH: "Google"
},
{ # This Token Specifier matches tokens whose orth field is "Now"
ORTH: "Now"
}
],
[ # This Surface Form matches "google now", verbatim, and requires
# "google" to have the NNP tag. This helps prevent the pattern from
# matching cases like "I will google now to look up the time"
{
ORTH: "google",
TAG: "NNP"
},
{
ORTH: "now"
}
]
]
)
after = nlp(example)
print("After")
for ent in after.ents:
print(ent.text, ent.label_, [w.tag_ for w in ent])
# Output
# Google Now PRODUCT [u'NNP', u'RB']
# google ORG [u'VB']
# google now PRODUCT [u'NNP', u'RB']
#
# You can customize attribute values in the lexicon, and then refer to the
# new attributes in your Token Specifiers.
# This is particularly good for word-set membership.
#
australian_capitals = ['Brisbane', 'Sydney', 'Canberra', 'Melbourne', 'Hobart',
'Darwin', 'Adelaide', 'Perth']
# Internally, the tokenizer immediately maps each token to a pointer to a
# LexemeC struct. These structs hold various features, e.g. the integer IDs
# of the normalized string forms.
# For our purposes, the key attribute is a 64-bit integer, used as a bit field.
# spaCy currently only uses 12 of the bits for its built-in features, so
# the others are available for use. It's best to use the higher bits, as
# future versions of spaCy may add more flags. For instance, we might add
# a built-in IS_MONTH flag, taking up FLAG13. So, we bind our user-field to
# FLAG63 here.
is_australian_capital = FLAG63
# Now we need to set the flag value. It's False on all tokens by default,
# so we just need to set it to True for the tokens we want.
# Here we iterate over the strings, and set it on only the literal matches.
for string in australian_capitals:
lexeme = nlp.vocab[string]
lexeme.set_flag(is_australian_capital, True)
print('Sydney', nlp.vocab[u'Sydney'].check_flag(is_australian_capital))
print('sydney', nlp.vocab[u'sydney'].check_flag(is_australian_capital))
    # If we want case-insensitive matching, we have to be a little more
    # roundabout, as there's no case-insensitive index into the vocabulary. So
    # we have to iterate over the vocabulary.
# We'll be looking up attribute IDs in this set a lot, so it's good to pre-build it
target_ids = {nlp.vocab.strings[s.lower()] for s in australian_capitals}
for lexeme in nlp.vocab:
if lexeme.lower in target_ids:
lexeme.set_flag(is_australian_capital, True)
print('Sydney', nlp.vocab[u'Sydney'].check_flag(is_australian_capital))
print('sydney', nlp.vocab[u'sydney'].check_flag(is_australian_capital))
print('SYDNEY', nlp.vocab[u'SYDNEY'].check_flag(is_australian_capital))
# Output
# Sydney True
# sydney False
# Sydney True
# sydney True
# SYDNEY True
#
# The key thing to note here is that we're setting these attributes once,
# over the vocabulary --- and then reusing them at run-time. This means the
# amortized complexity of anything we do this way is going to be O(1). You
# can match over expressions that need to have sets with tens of thousands
# of values, e.g. "all the street names in Germany", and you'll still have
# O(1) complexity. Most regular expression algorithms don't scale well to
# this sort of problem.
#
# Now, let's use this in a pattern
nlp.matcher.add("AuCitySportsTeam", "ORG", {},
[
[
{LOWER: "the"},
{is_australian_capital: True},
{TAG: "NNS"}
],
[
{LOWER: "the"},
{is_australian_capital: True},
{TAG: "NNPS"}
],
[
{LOWER: "the"},
{IS_ALPHA: True}, # Allow a word in between, e.g. The Western Sydney
{is_australian_capital: True},
{TAG: "NNS"}
],
[
{LOWER: "the"},
{IS_ALPHA: True}, # Allow a word in between, e.g. The Western Sydney
{is_australian_capital: True},
{TAG: "NNPS"}
]
])
doc = nlp(u'The pattern should match the Brisbane Broncos and the South Darwin Spiders, but not the Colorado Boulders')
for ent in doc.ents:
print(ent.text, ent.label_)
# Output
# the Brisbane Broncos ORG
# the South Darwin Spiders ORG
# Output
# Before
# Google ORG [u'NNP']
# google ORG [u'VB']
# google ORG [u'NNP']
# After
# Google Now PRODUCT [u'NNP', u'RB']
# google ORG [u'VB']
# google now PRODUCT [u'NNP', u'RB']
# Sydney True
# sydney False
# Sydney True
# sydney True
# SYDNEY True
# the Brisbane Broncos ORG
# the South Darwin Spiders ORG
if __name__ == '__main__':
main()

View File

@ -1,36 +0,0 @@
# encoding: utf8
from __future__ import unicode_literals, print_function
import plac
import codecs
import pathlib
import random
import twython
import spacy.en
import _handler
class Connection(twython.TwythonStreamer):
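    """Streams tweets matching the query and passes each one to _handler.handle_tweet."""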
def __init__(self, keys_dir, nlp, query):
keys_dir = pathlib.Path(keys_dir)
read = lambda fn: (keys_dir / (fn + '.txt')).open().read().strip()
api_key = map(read, ['key', 'secret', 'token', 'token_secret'])
twython.TwythonStreamer.__init__(self, *api_key)
self.nlp = nlp
self.query = query
def on_success(self, data):
_handler.handle_tweet(self.nlp, data, self.query)
        # Reload the handler frequently, so that _handler.py can be edited while
        # the stream is running and changes take effect without restarting.
        if random.random() >= 0.1:
            reload(_handler)
def main(keys_dir, term):
nlp = spacy.en.English()
twitter = Connection(keys_dir, nlp, term)
twitter.statuses.filter(track=term, language='en')
if __name__ == '__main__':
plac.call(main)