Merge branch 'develop' of https://github.com/explosion/spaCy into develop

Matthew Honnibal, 2017-08-19 09:03:15 -05:00
commit 7c47e38c12
24 changed files with 392 additions and 28 deletions

spacy/lang/da/examples.py (new file)

@@ -0,0 +1,18 @@
# coding: utf8
from __future__ import unicode_literals
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.da.examples import sentences
>>> docs = nlp.pipe(sentences)
"""
sentences = [
"Apple overvejer at købe et britisk statup for 1 milliard dollar",
"Selvkørende biler flytter forsikringsansvaret over på producenterne",
"San Francisco overvejer at forbyde leverandørrobotter på fortov",
"London er en stor by i Storbritannien"
]

spacy/lang/de/examples.py (new file)

@@ -0,0 +1,22 @@
# coding: utf8
from __future__ import unicode_literals
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.de.examples import sentences
>>> docs = nlp.pipe(sentences)
"""
sentences = [
"Die ganze Stadt ist ein Startup: Shenzhen ist das Silicon Valley für Hardware-Firmen",
"Wie deutsche Startups die Technologie vorantreiben wollen: Künstliche Intelligenz",
"Trend zum Urlaub in Deutschland beschert Gastwirten mehr Umsatz",
"Bundesanwaltschaft erhebt Anklage gegen mutmaßlichen Schweizer Spion",
"San Francisco erwägt Verbot von Lieferrobotern",
"Autonome Fahrzeuge verlagern Haftpflicht auf Hersteller",
"Wo bist du?",
"Was ist die Hauptstadt von Deutschland?"
]

spacy/lang/en/examples.py (new file)

@@ -0,0 +1,22 @@
# coding: utf8
from __future__ import unicode_literals
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.en.examples import sentences
>>> docs = nlp.pipe(sentences)
"""
sentences = [
"Apple is looking at buying U.K. startup for $1 billion",
"Autonomous cars shift insurance liability toward manufacturers",
"San Francisco considers banning sidewalk delivery robots",
"London is a big city in the United Kingdom.",
"Where are you?",
"Who is the president of France?",
"What is the capital of the United States?",
"When was Barack Obama born?"
]

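The docstrings above sketch the intended use. As a fuller smoke test, something like the following should work, assuming a spaCy 2.x English model such as en_core_web_sm is installed (the model name is an example, not part of this commit):

import spacy
from spacy.lang.en.examples import sentences

nlp = spacy.load('en_core_web_sm')
for doc in nlp.pipe(sentences):
    # Each Doc comes back fully processed, in the input order.
    print([(ent.text, ent.label_) for ent in doc.ents])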
spacy/lang/es/examples.py (new file)

@@ -0,0 +1,22 @@
# coding: utf8
from __future__ import unicode_literals
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.es.examples import sentences
>>> docs = nlp.pipe(sentences)
"""
sentences = [
"Apple está buscando comprar una startup del Reino Unido por mil millones de dólares",
"Los coches autónomos delegan la responsabilidad del seguro en sus fabricantes",
"San Francisco analiza prohibir los robots delivery",
"Londres es una gran ciudad del Reino Unido",
"El gato come pescado",
"Veo al hombre con el telescopio",
"La araña come moscas",
"El pingüino incuba en su nido"
]

spacy/lang/fr/examples.py (new file)

@@ -0,0 +1,26 @@
# coding: utf8
from __future__ import unicode_literals
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.fr.examples import sentences
>>> docs = nlp.pipe(sentences)
"""
sentences = [
"Apple cherche a acheter une startup anglaise pour 1 milliard de dollard",
"Les voitures autonomes voient leur assurances décalées vers les constructeurs",
"San Francisco envisage d'interdire les robots coursiers",
"Londres est une grande ville du Royaume-Uni",
"LItalie choisit ArcelorMittal pour reprendre la plus grande aciérie dEurope",
"Apple lance HomePod parce qu'il se sent menacé par l'Echo d'Amazon",
"La France ne devrait pas manquer d'électricité cet été, même en cas de canicule",
"Nouvelles attaques de Trump contre le maire de Londres",
"Où es-tu ?",
"Qui est le président de la France ?",
"Où est la capitale des Etats-Unis ?",
"Quand est né Barack Obama ?"
]

spacy/lang/he/examples.py (new file)

@@ -0,0 +1,28 @@
# coding: utf8
from __future__ import unicode_literals
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.he.examples import sentences
>>> docs = nlp.pipe(sentences)
"""
sentences = [
'סין מקימה קרן של 440 מיליון דולר להשקעה בהייטק בישראל',
'רה"מ הודיע כי יחרים טקס בחסותו',
'הכנסת צפויה לאשר איכון אוטומטי של שיחות למוקד 100',
'תוכנית לאומית תהפוך את ישראל למעצמה דיגיטלית',
'סע לשלום, המפתחות בפנים.',
'מלצר, פעמיים טורקי!',
'ואהבת לרעך כמוך.',
'היום נעשה משהו בלתי נשכח.',
'איפה הילד?',
'מיהו נשיא צרפת?',
'מהי בירת ארצות הברית?',
"איך קוראים בעברית לצ'ופצ'יק של הקומקום?",
'מה הייתה הדקה?',
'מי אומר שלום ראשון, זה שעולה או זה שיורד?'
]

spacy/lang/it/examples.py (new file)

@@ -0,0 +1,18 @@
# coding: utf8
from __future__ import unicode_literals
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.it.examples import sentences
>>> docs = nlp.pipe(sentences)
"""
sentences = [
"Apple vuole comprare una startup del Regno Unito per un miliardo di dollari",
"Le automobili a guida autonoma spostano la responsabilità assicurativa verso i produttori",
"San Francisco prevede di bandire i robot di consegna porta a porta",
"Londra è una grande città del Regno Unito."
]

spacy/lang/nb/examples.py (new file)

@@ -0,0 +1,18 @@
# coding: utf8
from __future__ import unicode_literals
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.nb.examples import sentences
>>> docs = nlp.pipe(sentences)
"""
sentences = [
"Apple vurderer å kjøpe britisk oppstartfirma for en milliard dollar",
"Selvkjørende biler flytter forsikringsansvaret over på produsentene ",
"San Francisco vurderer å forby robotbud på fortauene",
"London er en stor by i Storbritannia."
]

spacy/lang/pl/examples.py (new file)

@@ -0,0 +1,20 @@
# coding: utf8
from __future__ import unicode_literals
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.pl.examples import sentences
>>> docs = nlp.pipe(sentences)
"""
sentences = [
"Poczuł przyjemną woń mocnej kawy.",
"Istnieje wiele dróg oddziaływania substancji psychoaktywnej na układ nerwowy.",
"Powitał mnie biało-czarny kot, płosząc siedzące na płocie trzy dorodne dudki.",
"Nowy abonament pod lupą Komisji Europejskiej",
"Czy w ciągu ostatnich 48 godzin spożyłeś leki zawierające paracetamol?",
"Kto ma ochotę zapoznać się z innymi niż w książkach przygodami Muminków i ich przyjaciół, temu polecam komiks Tove Jansson „Muminki i morze”."
]

spacy/lang/pt/examples.py (new file)

@@ -0,0 +1,18 @@
# coding: utf8
from __future__ import unicode_literals
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.pt.examples import sentences
>>> docs = nlp.pipe(sentences)
"""
sentences = [
"Apple está querendo comprar uma startup do Reino Unido por 100 milhões de dólares",
"Carros autônomos empurram a responsabilidade do seguro para os fabricantes."
"São Francisco considera banir os robôs de entrega que andam pelas calçadas",
"Londres é a maior cidade do Reino Unido"
]

spacy/lang/sv/examples.py (new file)

@@ -0,0 +1,18 @@
# coding: utf8
from __future__ import unicode_literals
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.sv.examples import sentences
>>> docs = nlp.pipe(sentences)
"""
sentences = [
"Apple överväger att köpa brittisk startup för 1 miljard dollar.",
"Självkörande bilar förskjuter försäkringsansvar mot tillverkare.",
"San Fransisco överväger förbud mot leveransrobotar på trottoarer.".
"London är en storstad i Storbritannien."
]

spacy/language.py

@@ -430,11 +430,16 @@ class Language(object):
             except StopIteration:
                 pass

-    def pipe(self, texts, tuples=False, n_threads=2, batch_size=1000, disable=[]):
+    def pipe(self, texts, as_tuples=False, n_threads=2, batch_size=1000,
+             disable=[]):
         """Process texts as a stream, and yield `Doc` objects in order. Supports
         GIL-free multi-threading.

         texts (iterator): A sequence of texts to process.
+        as_tuples (bool):
+            If set to True, inputs should be a sequence of
+            (text, context) tuples. Output will then be a sequence of
+            (doc, context) tuples. Defaults to False.
         n_threads (int): The number of worker threads to use. If -1, OpenMP will
             decide how many to use at run time. Default is 2.
         batch_size (int): The number of texts to buffer.
@@ -446,7 +451,7 @@ class Language(object):
             >>> for doc in nlp.pipe(texts, batch_size=50, n_threads=4):
             >>>     assert doc.is_parsed
         """
-        if tuples:
+        if as_tuples:
             text_context1, text_context2 = itertools.tee(texts)
             texts = (tc[0] for tc in text_context1)
             contexts = (tc[1] for tc in text_context2)

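For illustration, a minimal sketch of the new flag in use; it assumes a loaded pipeline (the model name and the context dicts here are made up for the example):

import spacy

nlp = spacy.load('en_core_web_sm')

# (text, context) pairs go in; (doc, context) pairs come out, in order.
data = [(u'Apple is looking at buying U.K. startup', {'id': 1}),
        (u'London is a big city in the United Kingdom.', {'id': 2})]

for doc, context in nlp.pipe(data, as_tuples=True):
    print(context['id'], doc[0].text)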
spacy/tests/parser/test_nn_beam.py

@@ -63,7 +63,7 @@ def vector_size():
 @pytest.fixture
 def beam(moves, states, golds, beam_width):
-    return ParserBeam(moves, states, golds, width=beam_width)
+    return ParserBeam(moves, states, golds, width=beam_width, density=0.0)

 @pytest.fixture
 def scores(moves, batch_size, beam_width):

spacy/tests/serialize/test_serialize_tagger.py

@@ -11,8 +11,8 @@ import pytest
 def taggers(en_vocab):
     tagger1 = Tagger(en_vocab)
     tagger2 = Tagger(en_vocab)
-    tagger1.model = tagger1.Model(None, None)
-    tagger2.model = tagger2.Model(None, None)
+    tagger1.model = tagger1.Model(8, 8)
+    tagger2.model = tagger1.model
     return (tagger1, tagger2)

@@ -20,7 +20,6 @@ def test_serialize_tagger_roundtrip_bytes(en_vocab, taggers):
     tagger1, tagger2 = taggers
     tagger1_b = tagger1.to_bytes()
     tagger2_b = tagger2.to_bytes()
-    assert tagger1_b == tagger2_b
     tagger1 = tagger1.from_bytes(tagger1_b)
     assert tagger1.to_bytes() == tagger1_b
     new_tagger1 = Tagger(en_vocab).from_bytes(tagger1_b)

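The fixture change shares one model between the two taggers, so both serialize to identical bytes. A rough sketch of the roundtrip the test exercises, assuming the spaCy 2.0-alpha Tagger API shown above:

from spacy.pipeline import Tagger
from spacy.vocab import Vocab

vocab = Vocab()
tagger = Tagger(vocab)
tagger.model = tagger.Model(8, 8)  # tiny throwaway model, as in the fixture

# Serialize to bytes, then restore into a fresh instance.
tagger_b = tagger.to_bytes()
new_tagger = Tagger(vocab).from_bytes(tagger_b)
assert new_tagger.to_bytes() == tagger_b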
spacy/tokens/doc.pyx

@@ -238,6 +238,27 @@ cdef class Doc:
     def doc(self):
         return self

+    def char_span(self, int start_idx, int end_idx, attr_t label=0, vector=None):
+        """Create a `Span` object from the slice `doc.text[start : end]`.
+
+        doc (Doc): The parent document.
+        start (int): The index of the first character of the span.
+        end (int): The index of the first character after the span.
+        label (uint64): A label to attach to the Span, e.g. for named entities.
+        vector (ndarray[ndim=1, dtype='float32']): A meaning representation of
+            the span.
+        RETURNS (Span): The newly constructed object.
+        """
+        cdef int start = token_by_start(self.c, self.length, start_idx)
+        if start == -1:
+            return None
+        cdef int end = token_by_end(self.c, self.length, end_idx)
+        if end == -1:
+            return None
+        # Currently we have the token index, we want the range-end index
+        end += 1
+        cdef Span span = Span(self, start, end, label=label, vector=vector)
+        return span
+
     def similarity(self, other):
         """Make a semantic similarity estimate. The default estimate is cosine
         similarity using an average of word vectors.

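A short sketch of the new method in use, mirroring the example added to the docs below; note that offsets which don't land on token boundaries return None rather than raising (the model name is illustrative):

import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp(u'I like New York')

# Characters 7-15 cover the tokens "New York".
span = doc.char_span(7, 15, label=doc.vocab.strings['GPE'])
assert span.text == 'New York'

# A start offset inside a token has no matching token boundary.
assert doc.char_span(8, 15) is None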
spacy/tokens/span.pxd

@@ -15,5 +15,5 @@ cdef class Span:
     cdef public _vector
     cdef public _vector_norm

     cpdef int _recalculate_indices(self) except -1
+    cpdef np.ndarray to_array(self, object features)

spacy/tokens/span.pyx

@@ -7,7 +7,7 @@ import numpy
 import numpy.linalg
 from libc.math cimport sqrt

-from .doc cimport token_by_start, token_by_end
+from .doc cimport token_by_start, token_by_end, get_token_attr
 from ..structs cimport TokenC, LexemeC
 from ..typedefs cimport flags_t, attr_t, hash_t
 from ..attrs cimport attr_id_t
@@ -135,6 +135,28 @@ cdef class Span:
             return 0.0
         return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)

+    cpdef np.ndarray to_array(self, object py_attr_ids):
+        """Given a list of M attribute IDs, export the tokens to a numpy
+        `ndarray` of shape `(N, M)`, where `N` is the length of the span.
+        The values will be 64-bit unsigned integers.
+
+        attr_ids (list[int]): A list of attribute ID ints.
+        RETURNS (numpy.ndarray[long, ndim=2]): A feature matrix, with one row
+            per word, and one column per attribute indicated in the input
+            `attr_ids`.
+        """
+        cdef int i, j
+        cdef attr_id_t feature
+        cdef np.ndarray[attr_t, ndim=2] output
+        # Make an array from the attributes --- otherwise our inner loop is
+        # Python dict iteration.
+        cdef np.ndarray[attr_t, ndim=1] attr_ids = numpy.asarray(py_attr_ids,
+            dtype=numpy.uint64)
+        output = numpy.ndarray(shape=(self.length, len(attr_ids)),
+                               dtype=numpy.uint64)
+        for i in range(self.start, self.end):
+            for j, feature in enumerate(attr_ids):
+                # Row index is span-relative; i indexes the parent doc.
+                output[i - self.start, j] = get_token_attr(&self.doc.c[i], feature)
+        return output
+
     cpdef int _recalculate_indices(self) except -1:
         if self.end > self.doc.length \
                 or self.doc.c[self.start].idx != self.start_char \

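A usage sketch for the new method, adapted from the docs example below (assumes a loaded English pipeline; the model name is an example):

import spacy
from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA

nlp = spacy.load('en_core_web_sm')
doc = nlp(u'I like New York in Autumn.')
span = doc[2:4]  # "New York"

# One row per token in the span, one column per requested attribute.
np_array = span.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
assert np_array.shape == (len(span), 4)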
spacy/vectors.pyx

@@ -20,7 +20,7 @@ cdef class Vectors:
     '''Store, save and load word vectors.'''
     cdef public object data
     cdef readonly StringStore strings
-    cdef public object index
+    cdef public object key2row

     def __init__(self, strings, data_or_width):
         self.strings = StringStore()
@@ -30,9 +30,9 @@ cdef class Vectors:
         else:
             data = data_or_width
         self.data = data
-        self.index = {}
+        self.key2row = {}
         for i, string in enumerate(strings):
-            self.index[self.strings.add(string)] = i
+            self.key2row[self.strings.add(string)] = i

     def __reduce__(self):
         return (Vectors, (self.strings, self.data))
@@ -40,7 +40,7 @@ cdef class Vectors:
     def __getitem__(self, key):
         if isinstance(key, basestring):
             key = self.strings[key]
-        i = self.index[key]
+        i = self.key2row[key]
         if i is None:
             raise KeyError(key)
         else:
@@ -49,7 +49,7 @@ cdef class Vectors:
     def __setitem__(self, key, vector):
         if isinstance(key, basestring):
             key = self.strings.add(key)
-        i = self.index[key]
+        i = self.key2row[key]
         self.data[i] = vector

     def __iter__(self):
@@ -71,7 +71,7 @@ cdef class Vectors:
     def to_disk(self, path, **exclude):
         def serialize_vectors(p):
-            write_vectors_to_bin_loc(self.strings, self.key2i, self.data, str(p))
+            write_vectors_to_bin_loc(self.strings, self.key2row, self.data, str(p))

         serializers = OrderedDict((
             ('vec.bin', serialize_vectors),
@@ -80,12 +80,13 @@ cdef class Vectors:
     def from_disk(self, path, **exclude):
         def deserialize_vectors(p):
-            self.key2i, self.vectors = load_vectors_from_bin_loc(self.strings, str(p))
+            values = load_vectors_from_bin_loc(self.strings, str(p))
+            self.key2row, self.data = values

         serializers = OrderedDict((
-            ('vec.bin', deserialize_vectors)
+            ('vec.bin', deserialize_vectors),
         ))
-        return util.to_disk(serializers, exclude)
+        return util.from_disk(path, serializers, exclude)

     def to_bytes(self, **exclude):
         def serialize_weights():
@@ -93,9 +94,9 @@ cdef class Vectors:
                 return self.data.to_bytes()
             else:
                 return msgpack.dumps(self.data)
+        b = msgpack.dumps(self.key2row)
         serializers = OrderedDict((
-            ('key2row', lambda: msgpack.dumps(self.key2i)),
+            ('key2row', lambda: msgpack.dumps(self.key2row)),
             ('strings', lambda: self.strings.to_bytes()),
             ('vectors', serialize_weights)
         ))
@@ -109,7 +110,7 @@ cdef class Vectors:
             self.data = msgpack.loads(b)
         deserializers = OrderedDict((
-            ('key2row', lambda b: self.key2i.update(msgpack.loads(b))),
+            ('key2row', lambda b: self.key2row.update(msgpack.loads(b))),
             ('strings', lambda b: self.strings.from_bytes(b)),
             ('vectors', deserialize_weights)
         ))

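The rename makes the mapping's role explicit: key2row takes a string's 64-bit hash to its row in the data table. A rough sketch, assuming the int-width branch of the constructor allocates a zeroed float32 table (that branch is outside this hunk):

import numpy
from spacy.vectors import Vectors

vectors = Vectors([u'dog', u'cat'], 300)  # two keys, width-300 table
dog_key = vectors.strings[u'dog']         # 64-bit hash of the string

# key2row maps that hash to a row index into vectors.data.
row = vectors.key2row[dog_key]
vectors[u'dog'] = numpy.ones((300,), dtype='float32')
assert vectors.data[row][0] == 1.0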
@@ -112,6 +112,10 @@

 .u-nowrap
     white-space: nowrap

+.u-break.u-break
+    word-wrap: break-word
+    white-space: initial
+
 .u-no-border
     border: none

website/docs/api/doc.jade

@@ -140,6 +140,44 @@ p Get the number of tokens in the document.
         +cell int
         +cell The number of tokens in the document.

++h(2, "char_span") Doc.char_span
+    +tag method
+    +tag-new(2)
+
+p Create a #[code Span] object from the slice #[code doc.text[start : end]].
+
++aside-code("Example").
+    doc = nlp(u'I like New York')
+    label = doc.vocab.strings['GPE']
+    span = doc.char_span(7, 15, label=label)
+    assert span.text == 'New York'
+
++table(["Name", "Type", "Description"])
+    +row
+        +cell #[code start]
+        +cell int
+        +cell The index of the first character of the span.
+
+    +row
+        +cell #[code end]
+        +cell int
+        +cell The index of the first character after the span.
+
+    +row
+        +cell #[code label]
+        +cell uint64
+        +cell A label to attach to the Span, e.g. for named entities.
+
+    +row
+        +cell #[code vector]
+        +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
+        +cell A meaning representation of the span.
+
+    +footrow
+        +cell returns
+        +cell #[code Span]
+        +cell The newly constructed object.
+
 +h(2, "similarity") Doc.similarity
     +tag method
     +tag-model("vectors")
@@ -211,12 +249,12 @@ p
 +table(["Name", "Type", "Description"])
     +row
         +cell #[code attr_ids]
-        +cell ints
+        +cell list
         +cell A list of attribute ID ints.

     +footrow
         +cell returns
-        +cell #[code numpy.ndarray[ndim=2, dtype='int32']]
+        +cell #[code.u-break numpy.ndarray[ndim=2, dtype='int32']]
         +cell
             | The exported attributes as a 2D numpy array, with one row per
             | token and one column per attribute.
@@ -245,7 +283,7 @@ p
     +row
         +cell #[code array]
-        +cell #[code numpy.ndarray[ndim=2, dtype='int32']]
+        +cell #[code.u-break numpy.ndarray[ndim=2, dtype='int32']]
         +cell The attribute values to load.

     +footrow
@@ -509,7 +547,7 @@ p
 +table(["Name", "Type", "Description"])
     +footrow
         +cell returns
-        +cell #[code numpy.ndarray[ndim=1, dtype='float32']]
+        +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
         +cell A 1D numpy array representing the document's semantics.

 +h(2, "vector_norm") Doc.vector_norm

website/docs/api/language.jade

@@ -111,6 +111,14 @@ p
         +cell -
         +cell A sequence of unicode objects.

+    +row
+        +cell #[code as_tuples]
+        +cell bool
+        +cell
+            | If set to #[code True], inputs should be a sequence of
+            | #[code (text, context)] tuples. Output will then be a sequence
+            | of #[code (doc, context)] tuples. Defaults to #[code False].
+
     +row
         +cell #[code n_threads]
         +cell int

website/docs/api/lexeme.jade

@@ -129,7 +129,7 @@ p A real-valued meaning representation.
 +table(["Name", "Type", "Description"])
     +footrow
         +cell returns
-        +cell #[code numpy.ndarray[ndim=1, dtype='float32']]
+        +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
         +cell A 1D numpy array representing the lexeme's semantics.

 +h(2, "vector_norm") Lexeme.vector_norm

website/docs/api/span.jade

@@ -37,7 +37,7 @@ p Create a Span object from the slice #[code doc[start : end]].
     +row
         +cell #[code vector]
-        +cell #[code numpy.ndarray[ndim=1, dtype='float32']]
+        +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
         +cell A meaning representation of the span.

     +footrow
@@ -145,11 +145,47 @@ p
         +cell float
         +cell A scalar similarity score. Higher is more similar.

++h(2, "to_array") Span.to_array
+    +tag method
+    +tag-new(2)
+
+p
+    | Given a list of #[code M] attribute IDs, export the tokens to a numpy
+    | #[code ndarray] of shape #[code (N, M)], where #[code N] is the length
+    | of the span. The values will be 64-bit unsigned integers.
+
++aside-code("Example").
+    from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA
+    doc = nlp(u'I like New York in Autumn.')
+    span = doc[2:4]
+    # All strings mapped to integers, for easy export to numpy
+    np_array = span.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
+
++table(["Name", "Type", "Description"])
+    +row
+        +cell #[code attr_ids]
+        +cell list
+        +cell A list of attribute ID ints.
+
+    +footrow
+        +cell returns
+        +cell #[code.u-break numpy.ndarray[long, ndim=2]]
+        +cell
+            | A feature matrix, with one row per word, and one column per
+            | attribute indicated in the input #[code attr_ids].
+
 +h(2, "merge") Span.merge
     +tag method

 p Retokenize the document, such that the span is merged into a single token.

 +aside-code("Example").
     doc = nlp(u'I like New York in Autumn.')
     span = doc[2:4]
     span.merge()
     assert len(doc) == 6
     assert doc[2].text == 'New York'

 +table(["Name", "Type", "Description"])
     +row
         +cell #[code **attributes]
@@ -270,7 +306,7 @@ p
 +table(["Name", "Type", "Description"])
     +footrow
         +cell returns
-        +cell #[code numpy.ndarray[ndim=1, dtype='float32']]
+        +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
         +cell A 1D numpy array representing the span's semantics.

 +h(2, "vector_norm") Span.vector_norm

website/docs/api/token.jade

@@ -250,7 +250,7 @@ p A real-valued meaning representation.
 +table(["Name", "Type", "Description"])
     +footrow
         +cell returns
-        +cell #[code numpy.ndarray[ndim=1, dtype='float32']]
+        +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
         +cell A 1D numpy array representing the token's semantics.

 +h(2, "vector_norm") Token.vector_norm