Merge branch 'master' of ssh://github.com/honnibal/spaCy

Matthew Honnibal 2016-03-29 14:31:52 +11:00
commit 26622f0ffc
19 changed files with 485 additions and 114 deletions

.gitignore vendored

@@ -96,5 +96,8 @@ setup.py
# Windows local helper files
*.bat
# Mac OS X
*.DS_Store
# Komodo project files
*.komodoproject
*.komodoproject


@@ -0,0 +1,5 @@
An example of inventory counting using the spaCy NLP library. It is meant to show how to instantiate spaCy's English class, and to allow reusability by reloading the main module.
In the future, a better implementation of this example would apply machine learning to each query and learn what to classify as the quantitative statement ('55 kgs OF'), versus the actual item being counted (e.g. how likely a prepositional object is to be the counted item if certain qualifications appear in the statement).
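For concreteness, the decomposition the decoder aims for looks roughly like this (a sketch using the Inventory fields defined in inventory.py below; the exact values depend on the parse):

from inventory import Inventory

# e.g. for the query 'i got 65 kgs of carrots' the decoder should fill in:
#   amount = '65'      (the quantitative part)
#   unit   = 'kgs'     (the head of the prepositional phrase)
#   item   = 'carrots' (the prepositional object being counted)
inv = Inventory(u'i got 65 kgs of carrots')
inv.amount, inv.unit, inv.item = u'65', u'kgs', u'carrots'
print(inv)  # -> 65 kgs carrots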

inventory.py

@@ -0,0 +1,35 @@
class Inventory:
"""
Inventory class - a struct-like container to hold inventory counts
across modules.
"""
originalQuery = None
item = ""
unit = ""
amount = ""
def __init__(self, statement):
"""
Constructor - only takes in the original query/statement
:return: new Inventory object
"""
self.originalQuery = statement
def __str__(self):
return str(self.amount) + ' ' + str(self.unit) + ' ' + str(self.item)
def printInfo(self):
print '-------------Inventory Count------------'
print "Original Query: " + str(self.originalQuery)
print 'Amount: ' + str(self.amount)
print 'Unit: ' + str(self.unit)
print 'Item: ' + str(self.item)
print '----------------------------------------'
def isValid(self):
if not self.item or not self.unit or not self.amount or not self.originalQuery:
return False
else:
return True

View File

@@ -0,0 +1,92 @@
from inventory import Inventory
def runTest(nlp):
testset = []
testset += [nlp(u'6 lobster cakes')]
testset += [nlp(u'6 avacados')]
testset += [nlp(u'fifty five carrots')]
testset += [nlp(u'i have 55 carrots')]
testset += [nlp(u'i got me some 9 cabbages')]
testset += [nlp(u'i got 65 kgs of carrots')]
result = []
for doc in testset:
c = decodeInventoryEntry_level1(doc)
if not c.isValid():
c = decodeInventoryEntry_level2(doc)
result.append(c)
for i in result:
i.printInfo()
def decodeInventoryEntry_level1(document):
"""
Decodes a basic entry such as '6 lobster cakes' or '6 cakes'.
@param document : NLP Doc object
:return: Inventory object (use isValid() to check whether decoding succeeded)
"""
count = Inventory(str(document))
for token in document:
if token.pos_ in (u'NOUN', u'NNS', u'NN'):  # the token must carry one of the noun tags
item = str(token)
for child in token.children:
if child.dep_ == u'compound' or child.dep_ == u'ad':
item = str(child) + str(item)
elif child.dep_ == u'nummod':
count.amount = str(child).strip()
for numerical_child in child.children:
# this is not arithmetic; the numeric child is treated as a string and prepended
count.amount = str(numerical_child) + str(count.amount).strip()
else:
print "WARNING: unknown child: " + str(child) + ':'+str(child.dep_)
count.item = item
count.unit = item
return count
def decodeInventoryEntry_level2(document):
"""
Entry level 2: a more complicated parsing scheme that covers examples such as
'i have 80 boxes of freshly baked pies'
@param document : NLP Doc object
:return: Inventory object (use isValid() to check whether decoding succeeded)
"""
count = Inventory(str(document))
for token in document:
# Look for a prepositional object that is a noun (this is the item we are counting).
# If found, look at its dependencies: provided the preposition does not indicate an
# inventory location, the preposition's head should lead to the unit and the amount.
if token.dep_ in (u'pobj', u'meta') and token.pos_ in (u'NOUN', u'NNS', u'NN'):
item = ''
# Go through all of the token's children; these are possible adjectives and other modifiers.
# This deals with cases such as 'hollow rounded waffle pancakes'.
for i in token.children:
item += ' ' + str(i)
item += ' ' + str(token)
count.item = item
# Get the head of the item:
if token.head.dep_ != u'prep':
# Break out of the loop, this is a confusing entry
break
else:
amountUnit = token.head.head
count.unit = str(amountUnit)
for inner in amountUnit.children:
if inner.pos_ == u'NUM':
count.amount += str(inner)
return count
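Both decoders walk the dependency parse rather than the raw text. A small debugging helper (hypothetical, not part of the example) makes the arcs that decodeInventoryEntry_level2 relies on visible:

def show_parse(doc):
    # Hypothetical helper: print each token with its POS tag, dependency label
    # and syntactic head - the structure the decoders above traverse.
    for token in doc:
        print('%s\t%s\t%s\t%s' % (token.orth_, token.pos_, token.dep_, token.head.orth_))

# For nlp(u'i got 65 kgs of carrots') one would expect 'carrots' to appear as a
# pobj whose head chain ('of' -> 'kgs') carries the unit, with '65' attached as
# a nummod - exactly the path decodeInventoryEntry_level2 follows.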

main.py

@@ -0,0 +1,31 @@
import inventoryCount as mainModule
import os
from spacy.en import English, LOCAL_DATA_DIR
data_dir = os.environ.get('SPACY_DATA', LOCAL_DATA_DIR)
if __name__ == '__main__':
"""
Main module for this example - loads the English NLP class and keeps it
in RAM while waiting for the user to re-run the test module. This lets the
developer edit the module under test without having to wait for the
English class to load again.
"""
# Construct the NLP object here with only the components you need,
# or just leave the arguments blank to load everything.
print "Loading English module... this will take a while."
nlp = English()
print "Done loading English module."
while True:
try:
reload(mainModule)
mainModule.runTest(nlp)
raw_input('================ To reload main module, press Enter ================')
except Exception, e:
print "Unexpected error: " + str(e)
continue

spacy/__init__.py

@@ -1,8 +1,15 @@
from . import util
from .util import set_lang_class, get_lang_class, get_package, get_package_by_name
from .en import English
from .de import German
set_lang_class(English.lang, English)
set_lang_class(German.lang, German)
def load(name, vectors=None, via=None):
return English(
package=util.get_package_by_name(name, via=via),
vectors_package=util.get_package_by_name(vectors, via=via))
package = get_package_by_name(name, via=via)
vectors_package = get_package_by_name(vectors, via=via)
cls = get_lang_class(name)
return cls(package=package, vectors_package=vectors_package)
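With the language registry in place, load() resolves both the model package and the Language class from the same name; a minimal usage sketch (assuming the 'de' model has already been downloaded):

import spacy

# 'de' is looked up in the registry populated by set_lang_class() above, so the
# same entry point now builds a German pipeline instead of always English.
nlp = spacy.load('de')
doc = nlp(u'Das ist ein Satz.')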

spacy/about.py

@@ -10,4 +10,8 @@ __uri__ = 'https://spacy.io'
__author__ = 'Matthew Honnibal'
__email__ = 'matt@spacy.io'
__license__ = 'MIT'
__default_model__ = 'en>=1.0.0,<1.1.0'
__models__ = {
'en': 'en>=1.0.0,<1.1.0',
'de': 'de>=1.0.0,<1.1.0',
}
__default_lang__ = 'en'

spacy/de/download.py Normal file

@@ -0,0 +1,13 @@
import plac
from ..download import download
@plac.annotations(
force=("Force overwrite", "flag", "f", bool),
)
def main(data_size='all', force=False):
download('de', force)
if __name__ == '__main__':
plac.call(main)

spacy/download.py Normal file

@@ -0,0 +1,33 @@
from __future__ import print_function
import sys
import sputnik
from sputnik.package_list import (PackageNotFoundException,
CompatiblePackageNotFoundException)
from . import about
def download(lang, force=False):
if force:
sputnik.purge(about.__title__, about.__version__)
try:
sputnik.package(about.__title__, about.__version__, about.__models__[lang])
print("Model already installed. Please run 'python -m "
"spacy.%s.download --force' to reinstall." % lang, file=sys.stderr)
sys.exit(1)
except (PackageNotFoundException, CompatiblePackageNotFoundException):
pass
package = sputnik.install(about.__title__, about.__version__, about.__models__[lang])
try:
sputnik.package(about.__title__, about.__version__, about.__models__[lang])
except (PackageNotFoundException, CompatiblePackageNotFoundException):
print("Model failed to install. Please run 'python -m "
"spacy.%s.download --force'." % lang, file=sys.stderr)
sys.exit(1)
print("Model successfully installed.", file=sys.stderr)

spacy/en/download.py

@@ -1,57 +1,12 @@
from __future__ import print_function
import sys
import os
import shutil
import plac
import sputnik
from sputnik.package_list import (PackageNotFoundException,
CompatiblePackageNotFoundException)
from .. import about
def migrate(path):
data_path = os.path.join(path, 'data')
if os.path.isdir(data_path):
if os.path.islink(data_path):
os.unlink(data_path)
else:
shutil.rmtree(data_path)
for filename in os.listdir(path):
if filename.endswith('.tgz'):
os.unlink(os.path.join(path, filename))
from ..download import download
@plac.annotations(
force=("Force overwrite", "flag", "f", bool),
)
def main(data_size='all', force=False):
if force:
sputnik.purge(about.__title__, about.__version__)
try:
sputnik.package(about.__title__, about.__version__, about.__default_model__)
print("Model already installed. Please run 'python -m "
"spacy.en.download --force' to reinstall.", file=sys.stderr)
sys.exit(1)
except (PackageNotFoundException, CompatiblePackageNotFoundException):
pass
package = sputnik.install(about.__title__, about.__version__, about.__default_model__)
try:
sputnik.package(about.__title__, about.__version__, about.__default_model__)
except (PackageNotFoundException, CompatiblePackageNotFoundException):
print("Model failed to install. Please run 'python -m "
"spacy.en.download --force'.", file=sys.stderr)
sys.exit(1)
# FIXME clean up old-style packages
migrate(os.path.dirname(os.path.abspath(__file__)))
print("Model successfully installed.", file=sys.stderr)
download('en', force)
if __name__ == '__main__':

spacy/strings.pxd

@@ -24,3 +24,4 @@ cdef class StringStore:
cdef int64_t _resize_at
cdef const Utf8Str* intern(self, unicode py_string) except NULL
cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length) except NULL

spacy/strings.pyx

@@ -1,30 +1,25 @@
from __future__ import unicode_literals
import codecs
from __future__ import unicode_literals, absolute_import
cimport cython
from libc.string cimport memcpy
from libc.stdint cimport uint64_t
from murmurhash.mrmr cimport hash64
from preshed.maps cimport map_iter, key_t
from cpython cimport PyUnicode_AS_DATA
from cpython cimport PyUnicode_GET_DATA_SIZE
from libc.stdint cimport int64_t
from .typedefs cimport hash_t, attr_t
try:
import codecs as io
except ImportError:
import io
from .typedefs cimport hash_t
import ujson as json
cpdef hash_t hash_string(unicode string) except 0:
chars = string.encode('utf8')
return hash64(<char*>chars, len(chars), 1)
return _hash_utf8(chars, len(chars))
cdef hash_t _hash_utf8(char* utf8_string, int length):
return hash64(utf8_string, length, 1)
cdef unicode _decode(const Utf8Str* string):
@@ -92,45 +87,45 @@ cdef class StringStore:
def __getitem__(self, object string_or_id):
cdef bytes byte_string
cdef unicode py_string
cdef const Utf8Str* utf8str
cdef unsigned int int_id
cdef int id_
if isinstance(string_or_id, int) or isinstance(string_or_id, long):
if string_or_id == 0:
return u''
elif string_or_id < 1 or string_or_id >= self.size:
if isinstance(string_or_id, (int, long)):
try:
int_id = string_or_id
except OverflowError:
raise IndexError(string_or_id)
utf8str = &self.c[<int>string_or_id]
if int_id == 0:
return u''
elif int_id >= <uint64_t>self.size:
raise IndexError(string_or_id)
utf8str = &self.c[int_id]
return _decode(utf8str)
elif isinstance(string_or_id, bytes):
if len(string_or_id) == 0:
byte_string = <bytes>string_or_id
if len(byte_string) == 0:
return 0
py_string = string_or_id.decode('utf8')
utf8str = self.intern(py_string)
utf8str = self._intern_utf8(byte_string, len(byte_string))
return utf8str - self.c
elif isinstance(string_or_id, unicode):
if len(string_or_id) == 0:
if len(<unicode>string_or_id) == 0:
return 0
py_string = string_or_id
utf8str = self.intern(py_string)
byte_string = (<unicode>string_or_id).encode('utf8')
utf8str = self._intern_utf8(byte_string, len(byte_string))
return utf8str - self.c
else:
raise TypeError(type(string_or_id))
def __contains__(self, unicode string):
def __contains__(self, unicode string not None):
if len(string) == 0:
return True
cdef hash_t key = hash_string(string)
value = <Utf8Str*>self._map.get(key)
return True if value is not NULL else False
return self._map.get(key) is not NULL
def __iter__(self):
cdef int i
for i in range(self.size):
if i == 0:
yield u''
else:
utf8str = &self.c[i]
yield _decode(utf8str)
yield _decode(&self.c[i]) if i > 0 else u''
def __reduce__(self):
strings = [""]
@@ -142,21 +137,26 @@ cdef class StringStore:
cdef const Utf8Str* intern(self, unicode py_string) except NULL:
# 0 means missing, but we don't bother offsetting the index.
cdef hash_t key = hash_string(py_string)
cdef bytes byte_string = py_string.encode('utf8')
return self._intern_utf8(byte_string, len(byte_string))
@cython.final
cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length) except NULL:
# 0 means missing, but we don't bother offsetting the index.
cdef hash_t key = _hash_utf8(utf8_string, length)
value = <Utf8Str*>self._map.get(key)
if value != NULL:
if value is not NULL:
return value
if self.size == self._resize_at:
self._realloc()
cdef bytes byte_string = py_string.encode('utf8')
self.c[self.size] = _allocate(self.mem, <unsigned char*>byte_string, len(byte_string))
self.c[self.size] = _allocate(self.mem, <unsigned char*>utf8_string, length)
self._map.set(key, <void*>&self.c[self.size])
self.size += 1
return &self.c[self.size-1]
def dump(self, file_):
string_data = json.dumps([s for s in self])
string_data = json.dumps(list(self))
if not isinstance(string_data, unicode):
string_data = string_data.decode('utf8')
file_.write(string_data)
@@ -166,8 +166,10 @@ cdef class StringStore:
if strings == ['']:
return None
cdef unicode string
for string in strings:
if string:
for string in strings:
# explicit None/len check instead of simple truth testing
# (bug in Cython <= 0.23.4)
if string is not None and len(string):
self.intern(string)
def _realloc(self):
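The reworked __getitem__ gives StringStore a symmetric string-to-id mapping; a sketch of the behaviour (assuming the standalone StringStore() constructor, as in spaCy at the time):

from spacy.strings import StringStore

strings = StringStore()
apple_id = strings[u'apple']           # interning a unicode (or utf8 bytes) string returns its id
assert strings[apple_id] == u'apple'   # looking the id back up decodes the stored UTF-8
assert strings[u''] == 0               # the empty string maps to id 0 in both directions
assert strings[0] == u''
assert u'apple' in strings             # __contains__ hashes the string and probes the map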

spacy/syntax/nonproj.py

@@ -118,15 +118,18 @@ class PseudoProjectivity:
# reattach arcs with decorated labels (following HEAD scheme)
# for each decorated arc X||Y, search top-down, left-to-right,
# breadth-first until hitting a Y then make this the new head
parse = tokens.to_array([HEAD, DEP])
labels = [ tokens.vocab.strings[int(p[1])] for p in parse ]
#parse = tokens.to_array([HEAD, DEP])
for token in tokens:
if cls.is_decorated(token.dep_):
newlabel,headlabel = cls.decompose(token.dep_)
newhead = cls._find_new_head(token,headlabel)
parse[token.i,1] = tokens.vocab.strings[newlabel]
parse[token.i,0] = newhead.i - token.i
tokens.from_array([HEAD, DEP],parse)
token.head = newhead
token.dep_ = newlabel
# tokens.attach(token,newhead,newlabel)
#parse[token.i,1] = tokens.vocab.strings[newlabel]
#parse[token.i,0] = newhead.i - token.i
#tokens.from_array([HEAD, DEP],parse)
@classmethod
@@ -168,7 +171,7 @@ class PseudoProjectivity:
@classmethod
def _find_new_head(cls, token, headlabel):
# search through the tree starting from root
# search through the tree starting from the head of the given token
# returns the id of the first descendant with the given label
# if there is none, return the current head (no change)
queue = [token.head]
@@ -176,8 +179,8 @@ class PseudoProjectivity:
next_queue = []
for qtoken in queue:
for child in qtoken.children:
if child == token:
continue
if child.is_space: continue
if child == token: continue
if child.dep_ == headlabel:
return child
next_queue.append(child)
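The decorated labels referred to above have the form 'X||Y'; a sketch of what deprojectivization does with one (label and delimiter here are illustrative, the actual splitting is handled by cls.decompose):

# An arc labelled u'amod||dobj' means: the true label is 'amod', and the token
# should be re-attached to the first descendant, found breadth-first, whose
# dependency label is 'dobj'; the new setters then apply both changes directly.
newlabel, headlabel = u'amod||dobj'.split(u'||')
assert (newlabel, headlabel) == (u'amod', u'dobj')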


@@ -62,3 +62,67 @@ def test_vectors(EN):
assert sum(apples.vector) != sum(oranges.vector)
assert apples.vector_norm != oranges.vector_norm
@pytest.mark.models
def test_ancestors(EN):
# the structure of this sentence depends on the English annotation scheme
tokens = EN(u'Yesterday I saw a dog that barked loudly.')
ancestors = [ t.orth_ for t in tokens[6].ancestors ]
assert ancestors == ['dog','saw']
ancestors = [ t.orth_ for t in tokens[1].ancestors ]
assert ancestors == ['saw']
ancestors = [ t.orth_ for t in tokens[2].ancestors ]
assert ancestors == []
assert tokens[2].is_ancestor_of(tokens[7])
assert not tokens[6].is_ancestor_of(tokens[2])
@pytest.mark.models
def test_head_setter(EN):
# the structure of this sentence depends on the English annotation scheme
yesterday, i, saw, a, dog, that, barked, loudly, dot = EN(u'Yesterday I saw a dog that barked loudly.')
assert barked.n_lefts == 1
assert barked.n_rights == 1
assert barked.left_edge == that
assert barked.right_edge == loudly
assert dog.n_lefts == 1
assert dog.n_rights == 1
assert dog.left_edge == a
assert dog.right_edge == loudly
assert a.n_lefts == 0
assert a.n_rights == 0
assert a.left_edge == a
assert a.right_edge == a
assert saw.left_edge == yesterday
assert saw.right_edge == dot
barked.head = a
assert barked.n_lefts == 1
assert barked.n_rights == 1
assert barked.left_edge == that
assert barked.right_edge == loudly
assert a.n_lefts == 0
assert a.n_rights == 1
assert a.left_edge == a
assert a.right_edge == loudly
assert dog.n_lefts == 1
assert dog.n_rights == 0
assert dog.left_edge == a
assert dog.right_edge == loudly
assert saw.left_edge == yesterday
assert saw.right_edge == dot
yesterday.head = that
assert that.left_edge == yesterday
assert barked.left_edge == yesterday
assert a.left_edge == yesterday
assert dog.left_edge == yesterday
assert saw.left_edge == yesterday

spacy/tokenizer.pyx

@@ -16,8 +16,7 @@ cimport cython
from . import util
from .tokens.doc cimport Doc
from .util import read_lang_data
from .util import get_package
from .util import read_lang_data, get_package
cdef class Tokenizer:

spacy/tokens/token.pxd

@@ -6,7 +6,7 @@ from .doc cimport Doc
cdef class Token:
cdef Vocab vocab
cdef const TokenC* c
cdef TokenC* c
cdef readonly int i
cdef int array_len
cdef readonly Doc doc

spacy/tokens/token.pyx

@@ -142,6 +142,8 @@ cdef class Token:
property dep:
def __get__(self):
return self.c.dep
def __set__(self, int label):
self.c.dep = label
property has_vector:
def __get__(self):
@@ -250,10 +252,113 @@ cdef class Token:
def __get__(self):
return self.doc[self.c.r_edge]
property ancestors:
def __get__(self):
cdef const TokenC* head_ptr = self.c
# guard against infinite loop, no token can have
# more ancestors than tokens in the tree
cdef int i = 0
while head_ptr.head != 0 and i < self.doc.length:
head_ptr += head_ptr.head
yield self.doc[head_ptr - (self.c - self.i)]
i += 1
def is_ancestor_of(self, descendant):
return any( ancestor.i == self.i for ancestor in descendant.ancestors )
property head:
def __get__(self):
"""The token predicted by the parser to be the head of the current token."""
return self.doc[self.i + self.c.head]
def __set__(self, Token new_head):
# this function sets the head of self to new_head
# and updates the counters for left/right dependents
# and left/right corner for the new and the old head
# do nothing if old head is new head
if self.i + self.c.head == new_head.i:
return
cdef Token old_head = self.head
cdef int rel_newhead_i = new_head.i - self.i
# is the new head a descendant of the old head
cdef bint is_desc = old_head.is_ancestor_of(new_head)
cdef int new_edge
cdef Token anc, child
# update number of deps of old head
if self.c.head > 0: # left dependent
old_head.c.l_kids -= 1
if self.c.l_edge == old_head.c.l_edge:
# the token dominates the left edge so the left edge of the head
# may change when the token is reattached
# it may not change if the new head is a descendant of the current head
new_edge = self.c.l_edge
# the new l_edge is the left-most l_edge on any of the other dependents
# where the l_edge is left of the head, otherwise it is the head
if not is_desc:
new_edge = old_head.i
for child in old_head.children:
if child == self:
continue
if child.c.l_edge < new_edge:
new_edge = child.c.l_edge
old_head.c.l_edge = new_edge
# walk up the tree from old_head and assign new l_edge to ancestors
# until an ancestor already has an l_edge that's further left
for anc in old_head.ancestors:
if anc.c.l_edge <= new_edge:
break
anc.c.l_edge = new_edge
elif self.c.head < 0: # right dependent
old_head.c.r_kids -= 1
# do the same thing as for l_edge
if self.c.r_edge == old_head.c.r_edge:
new_edge = self.c.r_edge
if not is_desc:
new_edge = old_head.i
for child in old_head.children:
if child == self:
continue
if child.c.r_edge > new_edge:
new_edge = child.c.r_edge
old_head.c.r_edge = new_edge
for anc in old_head.ancestors:
if anc.c.r_edge >= new_edge:
break
anc.c.r_edge = new_edge
# update number of deps of new head
if rel_newhead_i > 0: # left dependent
new_head.c.l_kids += 1
# walk up the tree from new head and set l_edge to self.l_edge
# until you hit a token with an l_edge further to the left
if self.c.l_edge < new_head.c.l_edge:
new_head.c.l_edge = self.c.l_edge
for anc in new_head.ancestors:
if anc.c.l_edge <= self.c.l_edge:
break
anc.c.l_edge = self.c.l_edge
elif rel_newhead_i < 0: # right dependent
new_head.c.r_kids += 1
# do the same as for l_edge
if self.c.r_edge > new_head.c.r_edge:
new_head.c.r_edge = self.c.r_edge
for anc in new_head.ancestors:
if anc.c.r_edge >= self.c.r_edge:
break
anc.c.r_edge = self.c.r_edge
# set new head
self.c.head = rel_newhead_i
property conjuncts:
def __get__(self):
@@ -325,6 +430,8 @@ cdef class Token:
property dep_:
def __get__(self):
return self.vocab.strings[self.c.dep]
def __set__(self, unicode label):
self.c.dep = self.vocab.strings[label]
property is_oov:
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_OOV)
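The new ancestors generator walks the relative head offsets at the C level; in pure Python the same walk looks roughly like this (a sketch relying only on the public head attribute and the root-is-its-own-head convention):

def ancestors_py(token):
    # Follow head links upward. A root token is its own head, and the guard
    # mirrors the C version: no token can have more ancestors than the
    # document has tokens.
    head = token
    steps = 0
    while head.head.i != head.i and steps < len(token.doc):
        head = head.head
        yield head
        steps += 1

# e.g. for doc = nlp(u'Yesterday I saw a dog that barked loudly.'):
#   [t.orth_ for t in ancestors_py(doc[6])]  ->  ['dog', 'saw']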

spacy/util.py

@@ -14,6 +14,21 @@ from . import about
from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
LANGUAGES = {}
def set_lang_class(name, cls):
global LANGUAGES
LANGUAGES[name] = cls
def get_lang_class(name):
lang = re.split('[^a-zA-Z0-9_]', name, 1)[0]
if lang not in LANGUAGES:
raise RuntimeError('Language not supported: %s' % lang)
return LANGUAGES[lang]
def get_package(data_dir):
if not isinstance(data_dir, six.string_types):
raise RuntimeError('data_dir must be a string')
@@ -21,17 +36,20 @@ def get_package(data_dir):
def get_package_by_name(name=None, via=None):
package_name = name or about.__models__[about.__default_lang__]
lang = get_lang_class(package_name)
try:
return sputnik.package(about.__title__, about.__version__,
name or about.__default_model__, data_path=via)
package_name, data_path=via)
except PackageNotFoundException as e:
raise RuntimeError("Model %s not installed. Please run 'python -m "
"spacy.en.download' to install latest compatible "
"model." % name)
raise RuntimeError("Model '%s' not installed. Please run 'python -m "
"%s.download' to install latest compatible "
"model." % (name, lang.__module__))
except CompatiblePackageNotFoundException as e:
raise RuntimeError("Installed model is not compatible with spaCy "
"version. Please run 'python -m spacy.en.download "
"--force' to install latest compatible model.")
"version. Please run 'python -m %s.download "
"--force' to install latest compatible model." %
(lang.__module__))
def normalize_slice(length, start, stop, step=None):
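get_lang_class() keys the registry on the leading language code of a model name, so pinned package specs still resolve to the right Language subclass; a small sketch (classes as registered in spacy/__init__.py above):

from spacy.util import get_lang_class

# the name is split on the first non-alphanumeric character, so a versioned
# model spec still maps to its language entry in LANGUAGES
get_lang_class('en')                  # -> spacy.en.English
get_lang_class('de>=1.0.0,<1.1.0')    # -> spacy.de.German
# get_lang_class('fr') would raise RuntimeError('Language not supported: fr')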

spacy/vocab.pyx

@@ -25,7 +25,6 @@ from . import attrs
from . import symbols
from cymem.cymem cimport Address
from . import util
from .serialize.packer cimport Packer
from .attrs cimport PROB