# spaCy/examples/inventory_count/inventoryCount.py

from inventory import Inventory


def runTest(nlp):
    """
    Runs the decoders over a fixed set of sample inventory phrases.

    Every phrase is parsed with ``nlp`` first; each parsed document is then
    decoded with the level 1 decoder, falling back to the level 2 decoder
    when the level 1 result reports itself as not valid. Finally the info of
    every decoded entry is printed.

    @param nlp : callable that turns a unicode string into an NLP Doc object
    """
    phrases = (
        u'6 lobster cakes',
        u'6 avacados',
        u'fifty five carrots',
        u'i have 55 carrots',
        u'i got me some 9 cabbages',
        u'i got 65 kgs of carrots',
    )

    # Parse everything up front, before any decoding starts.
    documents = [nlp(phrase) for phrase in phrases]

    decoded = []
    for document in documents:
        entry = decodeInventoryEntry_level1(document)
        if not entry.isValid():
            # The simple scheme failed; try the more elaborate one.
            entry = decodeInventoryEntry_level2(document)
        decoded.append(entry)

    for entry in decoded:
        entry.printInfo()


def decodeInventoryEntry_level1(document):
    """
    Decodes a basic entry such as: '6 lobster cakes' or '6 cakes'.

    For every noun token, the item name is built from the token text with its
    'compound' / 'ad' children prepended, and the amount is taken from its
    'nummod' child (with that child's own children prepended as text). When
    the document holds several noun tokens, the last one determines the
    item and unit.

    @param document : NLP Doc object
    :return: Inventory object created from the document text; item, unit and
             amount are only assigned when matching tokens are found
    """
    # NOTE(review): 'NNS' and 'NN' look like fine-grained tags, which spaCy
    # normally exposes on token.tag_ rather than token.pos_ -- confirm
    # whether tag_ should be consulted as well.
    noun_labels = (u'NOUN', u'NNS', u'NN')

    count = Inventory(str(document))
    for token in document:
        # Membership test on purpose: `x == (a or b or c)` only ever compares
        # against `a`, because `or` returns its first truthy operand.
        if token.pos_ not in noun_labels:
            continue

        item = str(token)
        for child in token.children:
            if child.dep_ == u'compound' or child.dep_ == u'ad':
                # Modifiers are prepended to the item name.
                item = str(child) + item
            elif child.dep_ == u'nummod':
                count.amount = str(child).strip()
                for numerical_child in child.children:
                    # String concatenation, not arithmetic: the modifier text
                    # is prepended to the amount text.
                    count.amount = str(numerical_child) + str(count.amount).strip()
            else:
                # Single parenthesised argument prints identically on
                # Python 2 and Python 3.
                print("WARNING: unknown child: " + str(child) + ':' + str(child.dep_))

        count.item = item
        count.unit = item

    return count


def decodeInventoryEntry_level2(document):
    """
    Entry level 2, a more complicated parsing scheme that covers examples such as
    'i have 80 boxes of freshly baked pies'

    Looks for a noun that is the object of a preposition (the item being
    counted). The head of that preposition is taken as the unit, and the
    numeric children of the unit are appended to the amount.

    @param document : NLP Doc object
    :return: Inventory object created from the document text; item, unit and
             amount are only assigned when matching tokens are found
    """
    object_deps = (u'pobj', u'meta')
    # NOTE(review): 'NNS' and 'NN' look like fine-grained tags, which spaCy
    # normally exposes on token.tag_ rather than token.pos_ -- confirm
    # whether tag_ should be consulted as well.
    noun_labels = (u'NOUN', u'NNS', u'NN')

    count = Inventory(str(document))

    for token in document:
        #  Look for a preposition object that is a noun (this is the item we
        #  are counting). Membership tests on purpose: `x == (a or b)` only
        #  ever compares against `a`, because `or` returns its first truthy
        #  operand.
        if token.dep_ not in object_deps or token.pos_ not in noun_labels:
            continue

        #  Go through all the token's children, these are possible adjectives
        #  and other add-ons; this deals with cases such as
        #  'hollow rounded waffle pancakes'. Every part, including the token
        #  itself, is preceded by a single space.
        item = ''.join(' ' + str(child) for child in token.children)
        item += ' ' + str(token)
        count.item = item

        #  The item's head must be a preposition; its head in turn is the
        #  unit that carries the amount.
        if token.head.dep_ != u'prep':
            #  Break out of the loop, this is a confusing entry
            break

        amountUnit = token.head.head
        count.unit = str(amountUnit)

        for inner in amountUnit.children:
            if inner.pos_ == u'NUM':
                # NOTE(review): relies on Inventory initialising `amount` to
                # something that supports `+=` with a string -- confirm.
                count.amount += str(inner)
    return count