mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 09:57:26 +03:00 
			
		
		
		
	
		
			
				
	
	
		
			93 lines
		
	
	
		
			3.1 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			93 lines
		
	
	
		
			3.1 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
from inventory import Inventory
 | 
						|
 | 
						|
 | 
						|
def runTest(nlp):
    """
    Run both inventory decoders over a fixed set of sample phrases and
    print what was decoded for each one.

    Every phrase is first passed through ``nlp``; the resulting document is
    handed to decodeInventoryEntry_level1 and, if that result reports itself
    as not valid, to decodeInventoryEntry_level2 instead.

    @param nlp : callable that turns a unicode phrase into a parsed document
                 (presumably a loaded spaCy pipeline -- confirm with caller)
    :return: None; results are reported through each entry's printInfo()
    """
    phrases = (
        u'6 lobster cakes',
        u'6 avacados',
        u'fifty five carrots',
        u'i have 55 carrots',
        u'i got me some 9 cabbages',
        u'i got 65 kgs of carrots',
    )

    # Parse everything up front, in order, before any decoding happens.
    docs = [nlp(phrase) for phrase in phrases]

    decoded = []
    for parsed in docs:
        entry = decodeInventoryEntry_level1(parsed)
        if not entry.isValid():
            # The simple scheme failed; fall back to the more involved one.
            entry = decodeInventoryEntry_level2(parsed)
        decoded.append(entry)

    for entry in decoded:
        entry.printInfo()
 | 
						|
 | 
						|
 | 
						|
def decodeInventoryEntry_level1(document):
    """
    Decodes a basic entry such as: '6 lobster cakes' or '6 cakes'.

    For every noun token in the document the item name is the noun itself,
    prefixed by any children attached with a 'compound' or 'ad' dependency.
    A child attached as 'nummod' supplies the amount; that child's own
    children (e.g. the 'fifty' in 'fifty five') are prepended to it.  All
    pieces are concatenated as plain strings, without a separator.  When the
    document contains several nouns, later ones overwrite item and unit.

    @param document : NLP Doc object
    :return: Inventory object created from the document text; item, unit and
             amount are only assigned when a matching noun token is found
    """
    count = Inventory(str(document))

    # The original test was `token.pos_ == (u'NOUN' or u'NNS' or u'NN')`,
    # which Python evaluates as `token.pos_ == u'NOUN'` because `or` returns
    # its first truthy operand.  A membership test compares against all three.
    # NOTE(review): 'NNS'/'NN' look like fine-grained tags (token.tag_)
    # rather than coarse pos_ values -- confirm which attribute was intended.
    noun_labels = (u'NOUN', u'NNS', u'NN')

    for token in document:
        if token.pos_ not in noun_labels:
            continue

        item = str(token)

        for child in token.children:
            if child.dep_ in (u'compound', u'ad'):
                # Modifier of the noun: glue it in front of the item name.
                item = str(child) + str(item)
            elif child.dep_ == u'nummod':
                count.amount = str(child).strip()
                for numerical_child in child.children:
                    # String concatenation, not arithmetic: the parts of a
                    # multi-word number are simply joined together.
                    count.amount = str(numerical_child) + str(count.amount).strip()
            else:
                # Single-argument call form prints identically on Python 2
                # and is valid syntax on Python 3 (the bare print statement
                # is not).
                print("WARNING: unknown child: " + str(child) + ':' + str(child.dep_))

        count.item = item
        count.unit = item

    return count
 | 
						|
 | 
						|
 | 
						|
def decodeInventoryEntry_level2(document):
    """
    Entry level 2, a more complicated parsing scheme that covers examples such as
    'i have 80 boxes of freshly baked pies'

    Looks for a noun that is the object of a preposition (the item being
    counted).  The item name is that noun preceded by all of its children.
    If the noun hangs off a preposition, the preposition's head is taken as
    the unit and every numeric child of that head is appended to the amount.
    If the noun's head is not a preposition the search stops.

    @param document : NLP Doc object
    :return: Inventory object created from the document text; item, unit and
             amount are only assigned when a matching token is found
    """

    count = Inventory(str(document))

    # The original conditions were written as `x == (a or b [or c])`, which
    # Python evaluates as `x == a` because `or` returns its first truthy
    # operand.  Membership tests compare against every listed alternative.
    # NOTE(review): 'NNS'/'NN' look like fine-grained tags (token.tag_)
    # rather than coarse pos_ values -- confirm which attribute was intended.
    object_deps = (u'pobj', u'meta')
    noun_labels = (u'NOUN', u'NNS', u'NN')

    for token in document:
        #  Look for a preposition object that is a noun (this is the item we are counting).
        #  If found, look at its dependency (if a preposition that is not indicative of
        #  inventory location, the dependency of the preposition must be a noun)
        if token.dep_ not in object_deps or token.pos_ not in noun_labels:
            continue

        #  Go through all the token's children, these are possible adjectives and other add-ons;
        #  this deals with cases such as 'hollow rounded waffle pancakes'.
        parts = [str(child) for child in token.children]
        parts.append(str(token))
        #  Every part is prefixed with a space, so the item keeps the leading
        #  space the original string building produced.
        count.item = ''.join(' ' + part for part in parts)

        # Get the head of the item:
        if token.head.dep_ != u'prep':
            #  Break out of the loop, this is a confusing entry
            break

        #  The preposition's own head is the unit, e.g. 'boxes' in 'boxes of pies'.
        amountUnit = token.head.head
        count.unit = str(amountUnit)

        for inner in amountUnit.children:
            if inner.pos_ == u'NUM':
                #  Appends to whatever amount the Inventory object already holds
                #  (assumes Inventory.amount starts out as a string -- TODO confirm).
                count.amount += str(inner)

    return count
 | 
						|
 | 
						|
 |