Mirror of https://github.com/explosion/spaCy.git (synced 2025-10-30 23:47:31 +03:00)

Commit 7c47e38c12: Merge branch 'develop' of https://github.com/explosion/spaCy into develop

spacy/lang/da/examples.py (new file, 18 lines)
@@ -0,0 +1,18 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.da.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    "Apple overvejer at købe et britisk startup for 1 milliard dollar",
+    "Selvkørende biler flytter forsikringsansvaret over på producenterne",
+    "San Francisco overvejer at forbyde leverandørrobotter på fortov",
+    "London er en stor by i Storbritannien"
+]

spacy/lang/de/examples.py (new file, 22 lines)
@@ -0,0 +1,22 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.de.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    "Die ganze Stadt ist ein Startup: Shenzhen ist das Silicon Valley für Hardware-Firmen",
+    "Wie deutsche Startups die Technologie vorantreiben wollen: Künstliche Intelligenz",
+    "Trend zum Urlaub in Deutschland beschert Gastwirten mehr Umsatz",
+    "Bundesanwaltschaft erhebt Anklage gegen mutmaßlichen Schweizer Spion",
+    "San Francisco erwägt Verbot von Lieferrobotern",
+    "Autonome Fahrzeuge verlagern Haftpflicht auf Hersteller",
+    "Wo bist du?",
+    "Was ist die Hauptstadt von Deutschland?"
+]

spacy/lang/en/examples.py (new file, 22 lines)
@@ -0,0 +1,22 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.en.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    "Apple is looking at buying U.K. startup for $1 billion",
+    "Autonomous cars shift insurance liability toward manufacturers",
+    "San Francisco considers banning sidewalk delivery robots",
+    "London is a big city in the United Kingdom.",
+    "Where are you?",
+    "Who is the president of France?",
+    "What is the capital of the United States?",
+    "When was Barack Obama born?"
+]

spacy/lang/es/examples.py (new file, 22 lines)
@@ -0,0 +1,22 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.es.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    "Apple está buscando comprar una startup del Reino Unido por mil millones de dólares",
+    "Los coches autónomos delegan la responsabilidad del seguro en sus fabricantes",
+    "San Francisco analiza prohibir los robots delivery",
+    "Londres es una gran ciudad del Reino Unido",
+    "El gato come pescado",
+    "Veo al hombre con el telescopio",
+    "La araña come moscas",
+    "El pingüino incuba en su nido"
+]

spacy/lang/fr/examples.py (new file, 26 lines)
@@ -0,0 +1,26 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.fr.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    "Apple cherche à acheter une startup anglaise pour 1 milliard de dollars",
+    "Les voitures autonomes voient leurs assurances décalées vers les constructeurs",
+    "San Francisco envisage d'interdire les robots coursiers",
+    "Londres est une grande ville du Royaume-Uni",
+    "L’Italie choisit ArcelorMittal pour reprendre la plus grande aciérie d’Europe",
+    "Apple lance HomePod parce qu'il se sent menacé par l'Echo d'Amazon",
+    "La France ne devrait pas manquer d'électricité cet été, même en cas de canicule",
+    "Nouvelles attaques de Trump contre le maire de Londres",
+    "Où es-tu ?",
+    "Qui est le président de la France ?",
+    "Où est la capitale des Etats-Unis ?",
+    "Quand est né Barack Obama ?"
+]

spacy/lang/he/examples.py (new file, 28 lines)
@@ -0,0 +1,28 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.he.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    'סין מקימה קרן של 440 מיליון דולר להשקעה בהייטק בישראל',
+    'רה"מ הודיע כי יחרים טקס בחסותו',
+    'הכנסת צפויה לאשר איכון אוטומטי של שיחות למוקד 100',
+    'תוכנית לאומית תהפוך את ישראל למעצמה דיגיטלית',
+    'סע לשלום, המפתחות בפנים.',
+    'מלצר, פעמיים טורקי!',
+    'ואהבת לרעך כמוך.',
+    'היום נעשה משהו בלתי נשכח.',
+    'איפה הילד?',
+    'מיהו נשיא צרפת?',
+    'מהי בירת ארצות הברית?',
+    "איך קוראים בעברית לצ'ופצ'יק של הקומקום?",
+    'מה הייתה הדקה?',
+    'מי אומר שלום ראשון, זה שעולה או זה שיורד?'
+]

spacy/lang/it/examples.py (new file, 18 lines)
@@ -0,0 +1,18 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.it.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    "Apple vuole comprare una startup del Regno Unito per un miliardo di dollari",
+    "Le automobili a guida autonoma spostano la responsabilità assicurativa verso i produttori",
+    "San Francisco prevede di bandire i robot di consegna porta a porta",
+    "Londra è una grande città del Regno Unito."
+]

spacy/lang/nb/examples.py (new file, 18 lines)
@@ -0,0 +1,18 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.nb.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    "Apple vurderer å kjøpe britisk oppstartfirma for en milliard dollar",
+    "Selvkjørende biler flytter forsikringsansvaret over på produsentene",
+    "San Francisco vurderer å forby robotbud på fortauene",
+    "London er en stor by i Storbritannia."
+]

spacy/lang/pl/examples.py (new file, 20 lines)
@@ -0,0 +1,20 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.pl.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    "Poczuł przyjemną woń mocnej kawy.",
+    "Istnieje wiele dróg oddziaływania substancji psychoaktywnej na układ nerwowy.",
+    "Powitał mnie biało-czarny kot, płosząc siedzące na płocie trzy dorodne dudki.",
+    "Nowy abonament pod lupą Komisji Europejskiej",
+    "Czy w ciągu ostatnich 48 godzin spożyłeś leki zawierające paracetamol?",
+    "Kto ma ochotę zapoznać się z innymi niż w książkach przygodami Muminków i ich przyjaciół, temu polecam komiks Tove Jansson „Muminki i morze”."
+]

spacy/lang/pt/examples.py (new file, 18 lines)
@@ -0,0 +1,18 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.pt.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    "Apple está querendo comprar uma startup do Reino Unido por 100 milhões de dólares",
+    "Carros autônomos empurram a responsabilidade do seguro para os fabricantes.",
+    "São Francisco considera banir os robôs de entrega que andam pelas calçadas",
+    "Londres é a maior cidade do Reino Unido"
+]

spacy/lang/sv/examples.py (new file, 18 lines)
@@ -0,0 +1,18 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.sv.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    "Apple överväger att köpa brittisk startup för 1 miljard dollar.",
+    "Självkörande bilar förskjuter försäkringsansvar mot tillverkare.",
+    "San Francisco överväger förbud mot leveransrobotar på trottoarer.",
+    "London är en storstad i Storbritannien."
+]
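
All of the new examples.py modules follow the same pattern, and their docstrings assume a loaded `nlp` object. As a minimal sketch of how they might be exercised, assuming the spacy.lang.da package exposes a Danish language class (not shown in this diff):

    # Hedged sketch: run the new Danish example sentences through a blank pipeline.
    from spacy.lang.da import Danish
    from spacy.lang.da.examples import sentences

    nlp = Danish()  # tokenizer-only pipeline; no statistical models required
    for doc in nlp.pipe(sentences):
        print([token.text for token in doc])
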
@@ -430,11 +430,16 @@ class Language(object):
             except StopIteration:
                 pass
 
-    def pipe(self, texts, tuples=False, n_threads=2, batch_size=1000, disable=[]):
+    def pipe(self, texts, as_tuples=False, n_threads=2, batch_size=1000,
+            disable=[]):
         """Process texts as a stream, and yield `Doc` objects in order. Supports
         GIL-free multi-threading.
 
         texts (iterator): A sequence of texts to process.
+        as_tuples (bool):
+            If set to True, inputs should be a sequence of
+            (text, context) tuples. Output will then be a sequence of
+            (doc, context) tuples. Defaults to False.
         n_threads (int): The number of worker threads to use. If -1, OpenMP will
             decide how many to use at run time. Default is 2.
         batch_size (int): The number of texts to buffer.
@@ -446,7 +451,7 @@ class Language(object):
             >>>     for doc in nlp.pipe(texts, batch_size=50, n_threads=4):
             >>>         assert doc.is_parsed
         """
-        if tuples:
+        if as_tuples:
             text_context1, text_context2 = itertools.tee(texts)
             texts = (tc[0] for tc in text_context1)
             contexts = (tc[1] for tc in text_context2)
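
For illustration, a minimal sketch of the renamed as_tuples flag in use, assuming an `nlp` object has already been loaded (the loading step is not part of this diff):

    # Each input is a (text, context) tuple; each output a (doc, context) tuple.
    data = [(u'Apple is looking at buying U.K. startup', {'id': 1}),
            (u'London is a big city in the United Kingdom.', {'id': 2})]
    for doc, context in nlp.pipe(data, as_tuples=True):
        print(context['id'], len(doc))
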
@@ -63,7 +63,7 @@ def vector_size():
 
 @pytest.fixture
 def beam(moves, states, golds, beam_width):
-    return ParserBeam(moves, states, golds, width=beam_width)
+    return ParserBeam(moves, states, golds, width=beam_width, density=0.0)
 
 @pytest.fixture
 def scores(moves, batch_size, beam_width):
@@ -11,8 +11,8 @@ import pytest
 def taggers(en_vocab):
     tagger1 = Tagger(en_vocab)
     tagger2 = Tagger(en_vocab)
-    tagger1.model = tagger1.Model(None, None)
-    tagger2.model = tagger2.Model(None, None)
+    tagger1.model = tagger1.Model(8, 8)
+    tagger2.model = tagger1.model
     return (tagger1, tagger2)
 
 
@@ -20,7 +20,6 @@ def test_serialize_tagger_roundtrip_bytes(en_vocab, taggers):
     tagger1, tagger2 = taggers
     tagger1_b = tagger1.to_bytes()
     tagger2_b = tagger2.to_bytes()
-    assert tagger1_b == tagger2_b
     tagger1 = tagger1.from_bytes(tagger1_b)
     assert tagger1.to_bytes() == tagger1_b
     new_tagger1 = Tagger(en_vocab).from_bytes(tagger1_b)
@@ -238,6 +238,27 @@ cdef class Doc:
     def doc(self):
         return self
 
+    def char_span(self, int start_idx, int end_idx, attr_t label=0, vector=None):
+        """Create a `Span` object from the slice `doc.text[start : end]`.
+
+        doc (Doc): The parent document.
+        start (int): The index of the first character of the span.
+        end (int): The index of the first character after the span.
+        label (uint64): A label to attach to the Span, e.g. for named entities.
+        vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span.
+        RETURNS (Span): The newly constructed object.
+        """
+        cdef int start = token_by_start(self.c, self.length, start_idx)
+        if start == -1:
+            return None
+        cdef int end = token_by_end(self.c, self.length, end_idx)
+        if end == -1:
+            return None
+        # Currently we have the token index, we want the range-end index
+        end += 1
+        cdef Span span = Span(self, start, end, label=label, vector=vector)
+        return span
+
     def similarity(self, other):
         """Make a semantic similarity estimate. The default estimate is cosine
         similarity using an average of word vectors.
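
Note that char_span returns None when the character offsets do not snap to token boundaries, so callers should check the result. A short usage sketch, assuming a loaded `nlp` object:

    doc = nlp(u'I like New York')
    span = doc.char_span(7, 15, label=doc.vocab.strings['GPE'])
    if span is not None:  # None if the offsets don't align with token boundaries
        assert span.text == u'New York'
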
@@ -15,5 +15,5 @@ cdef class Span:
     cdef public _vector
     cdef public _vector_norm
 
-
+    cpdef int _recalculate_indices(self) except -1
     cpdef np.ndarray to_array(self, object features)
@@ -7,7 +7,7 @@ import numpy
 import numpy.linalg
 from libc.math cimport sqrt
 
-from .doc cimport token_by_start, token_by_end
+from .doc cimport token_by_start, token_by_end, get_token_attr
 from ..structs cimport TokenC, LexemeC
 from ..typedefs cimport flags_t, attr_t, hash_t
 from ..attrs cimport attr_id_t
@@ -135,6 +135,28 @@ cdef class Span:
             return 0.0
         return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
 
+    cpdef np.ndarray to_array(self, object py_attr_ids):
+        """Given a list of M attribute IDs, export the tokens to a numpy
+        `ndarray` of shape `(N, M)`, where `N` is the length of the span.
+        The values will be 32-bit integers.
+
+        attr_ids (list[int]): A list of attribute ID ints.
+        RETURNS (numpy.ndarray[long, ndim=2]): A feature matrix, with one row
+            per word, and one column per attribute indicated in the input
+            `attr_ids`.
+        """
+        cdef int i, j
+        cdef attr_id_t feature
+        cdef np.ndarray[attr_t, ndim=2] output
+        # Make an array from the attributes --- otherwise our inner loop is Python
+        # dict iteration.
+        cdef np.ndarray[attr_t, ndim=1] attr_ids = numpy.asarray(py_attr_ids, dtype=numpy.uint64)
+        output = numpy.ndarray(shape=(self.length, len(attr_ids)), dtype=numpy.uint64)
+        for i in range(self.start, self.end):
+            for j, feature in enumerate(attr_ids):
+                # Row index is relative to the span start
+                output[i - self.start, j] = get_token_attr(&self.doc.c[i], feature)
+        return output
+
     cpdef int _recalculate_indices(self) except -1:
         if self.end > self.doc.length \
         or self.doc.c[self.start].idx != self.start_char \
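
A usage sketch of the new Span.to_array, mirroring the existing Doc.to_array API (again assuming a loaded `nlp` object):

    from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA
    doc = nlp(u'I like New York in Autumn.')
    span = doc[2:4]  # 'New York'
    # One row per token in the span, one column per requested attribute
    np_array = span.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
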
@@ -20,7 +20,7 @@ cdef class Vectors:
     '''Store, save and load word vectors.'''
     cdef public object data
     cdef readonly StringStore strings
-    cdef public object index
+    cdef public object key2row
 
     def __init__(self, strings, data_or_width):
         self.strings = StringStore()
@@ -30,9 +30,9 @@ cdef class Vectors:
         else:
             data = data_or_width
         self.data = data
-        self.index = {}
+        self.key2row = {}
         for i, string in enumerate(strings):
-            self.index[self.strings.add(string)] = i
+            self.key2row[self.strings.add(string)] = i
 
     def __reduce__(self):
         return (Vectors, (self.strings, self.data))
@@ -40,7 +40,7 @@ cdef class Vectors:
     def __getitem__(self, key):
         if isinstance(key, basestring):
             key = self.strings[key]
-        i = self.index[key]
+        i = self.key2row[key]
         if i is None:
             raise KeyError(key)
         else:
@@ -49,7 +49,7 @@ cdef class Vectors:
     def __setitem__(self, key, vector):
         if isinstance(key, basestring):
             key = self.strings.add(key)
-        i = self.index[key]
+        i = self.key2row[key]
         self.data[i] = vector
 
     def __iter__(self):
@@ -71,7 +71,7 @@ cdef class Vectors:
 
     def to_disk(self, path, **exclude):
         def serialize_vectors(p):
-            write_vectors_to_bin_loc(self.strings, self.key2i, self.data, str(p))
+            write_vectors_to_bin_loc(self.strings, self.key2row, self.data, str(p))
 
         serializers = OrderedDict((
             ('vec.bin', serialize_vectors),
@@ -80,12 +80,13 @@ cdef class Vectors:
 
     def from_disk(self, path, **exclude):
         def deserialize_vectors(p):
-            self.key2i, self.vectors = load_vectors_from_bin_loc(self.strings, str(p))
+            values = load_vectors_from_bin_loc(self.strings, str(p))
+            self.key2row, self.data = values
 
         serializers = OrderedDict((
-            ('vec.bin', deserialize_vectors)
+            ('vec.bin', deserialize_vectors),
         ))
-        return util.to_disk(serializers, exclude)
+        return util.from_disk(path, serializers, exclude)
 
     def to_bytes(self, **exclude):
         def serialize_weights():
@@ -93,9 +94,9 @@ cdef class Vectors:
                 return self.data.to_bytes()
             else:
                 return msgpack.dumps(self.data)
-
+        b = msgpack.dumps(self.key2row)
         serializers = OrderedDict((
-            ('key2row', lambda: msgpack.dumps(self.key2i)),
+            ('key2row', lambda: msgpack.dumps(self.key2row)),
             ('strings', lambda: self.strings.to_bytes()),
             ('vectors', serialize_weights)
         ))
@@ -109,7 +110,7 @@ cdef class Vectors:
                 self.data = msgpack.loads(b)
 
         deserializers = OrderedDict((
-            ('key2row', lambda b: self.key2i.update(msgpack.loads(b))),
+            ('key2row', lambda b: self.key2row.update(msgpack.loads(b))),
             ('strings', lambda b: self.strings.from_bytes(b)),
             ('vectors', deserialize_weights)
         ))
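
The rename from index/key2i to key2row makes the mapping's role explicit: string hashes map to row indices of the underlying data array. An illustrative sketch, using only the constructor branch shown above and assuming the class lives at spacy.vectors:

    import numpy
    from spacy.vectors import Vectors

    data = numpy.zeros((2, 300), dtype='float32')
    vectors = Vectors([u'apple', u'orange'], data)
    key = vectors.strings[u'apple']   # hash key for the string
    row = vectors.key2row[key]        # row index into vectors.data
    vectors[u'apple'] = numpy.ones((300,), dtype='float32')
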
@@ -112,6 +112,10 @@
 .u-nowrap
     white-space: nowrap
 
+.u-break.u-break
+    word-wrap: break-word
+    white-space: initial
+
 .u-no-border
     border: none
 
@@ -140,6 +140,44 @@ p Get the number of tokens in the document.
         +cell int
         +cell The number of tokens in the document.
 
++h(2, "char_span") Doc.char_span
+    +tag method
+    +tag-new(2)
+
+p Create a #[code Span] object from the slice #[code doc.text[start : end]].
+
++aside-code("Example").
+    doc = nlp(u'I like New York')
+    label = doc.vocab.strings['GPE']
+    span = doc.char_span(7, 15, label=label)
+    assert span.text == 'New York'
+
++table(["Name", "Type", "Description"])
+    +row
+        +cell #[code start]
+        +cell int
+        +cell The index of the first character of the span.
+
+    +row
+        +cell #[code end]
+        +cell int
+        +cell The index of the first character after the span.
+
+    +row
+        +cell #[code label]
+        +cell uint64
+        +cell A label to attach to the Span, e.g. for named entities.
+
+    +row
+        +cell #[code vector]
+        +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
+        +cell A meaning representation of the span.
+
+    +footrow
+        +cell returns
+        +cell #[code Span]
+        +cell The newly constructed object.
+
 +h(2, "similarity") Doc.similarity
     +tag method
     +tag-model("vectors")
@@ -211,12 +249,12 @@ p
 +table(["Name", "Type", "Description"])
     +row
         +cell #[code attr_ids]
-        +cell ints
+        +cell list
         +cell A list of attribute ID ints.
 
     +footrow
         +cell returns
-        +cell #[code numpy.ndarray[ndim=2, dtype='int32']]
+        +cell #[code.u-break numpy.ndarray[ndim=2, dtype='int32']]
         +cell
             |  The exported attributes as a 2D numpy array, with one row per
             |  token and one column per attribute.
@@ -245,7 +283,7 @@ p
 
     +row
         +cell #[code array]
-        +cell #[code numpy.ndarray[ndim=2, dtype='int32']]
+        +cell #[code.u-break numpy.ndarray[ndim=2, dtype='int32']]
         +cell The attribute values to load.
 
     +footrow
@@ -509,7 +547,7 @@ p
 +table(["Name", "Type", "Description"])
     +footrow
         +cell returns
-        +cell #[code numpy.ndarray[ndim=1, dtype='float32']]
+        +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
         +cell A 1D numpy array representing the document's semantics.
 
 +h(2, "vector_norm") Doc.vector_norm
@@ -111,6 +111,14 @@ p
         +cell -
         +cell A sequence of unicode objects.
 
+    +row
+        +cell #[code as_tuples]
+        +cell bool
+        +cell
+            |  If set to #[code True], inputs should be a sequence of
+            |  #[code (text, context)] tuples. Output will then be a sequence of
+            |  #[code (doc, context)] tuples. Defaults to #[code False].
+
     +row
         +cell #[code n_threads]
         +cell int
@@ -129,7 +129,7 @@ p A real-valued meaning representation.
 +table(["Name", "Type", "Description"])
     +footrow
         +cell returns
-        +cell #[code numpy.ndarray[ndim=1, dtype='float32']]
+        +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
         +cell A 1D numpy array representing the lexeme's semantics.
 
 +h(2, "vector_norm") Lexeme.vector_norm
@@ -37,7 +37,7 @@ p Create a Span object from the #[code slice doc[start : end]].
 
     +row
         +cell #[code vector]
-        +cell #[code numpy.ndarray[ndim=1, dtype='float32']]
+        +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
         +cell A meaning representation of the span.
 
     +footrow
@@ -145,11 +145,47 @@ p
         +cell float
         +cell A scalar similarity score. Higher is more similar.
 
++h(2, "to_array") Span.to_array
+    +tag method
+    +tag-new(2)
+
+p
+    |  Given a list of #[code M] attribute IDs, export the tokens to a numpy
+    |  #[code ndarray] of shape #[code (N, M)], where #[code N] is the length of
+    |  the span. The values will be 32-bit integers.
+
++aside-code("Example").
+    from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA
+    doc = nlp(u'I like New York in Autumn.')
+    span = doc[2:4]
+    # All strings mapped to integers, for easy export to numpy
+    np_array = span.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
+
++table(["Name", "Type", "Description"])
+    +row
+        +cell #[code attr_ids]
+        +cell list
+        +cell A list of attribute ID ints.
+
+    +footrow
+        +cell returns
+        +cell #[code.u-break numpy.ndarray[long, ndim=2]]
+        +cell
+            |  A feature matrix, with one row per word, and one column per
+            |  attribute indicated in the input #[code attr_ids].
+
 +h(2, "merge") Span.merge
     +tag method
 
 p Retokenize the document, such that the span is merged into a single token.
 
++aside-code("Example").
+    doc = nlp(u'I like New York in Autumn.')
+    span = doc[2:4]
+    span.merge()
+    assert len(doc) == 6
+    assert doc[2].text == 'New York'
+
 +table(["Name", "Type", "Description"])
     +row
         +cell #[code **attributes]
@@ -270,7 +306,7 @@ p
 +table(["Name", "Type", "Description"])
     +footrow
         +cell returns
-        +cell #[code numpy.ndarray[ndim=1, dtype='float32']]
+        +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
         +cell A 1D numpy array representing the span's semantics.
 
 +h(2, "vector_norm") Span.vector_norm
@@ -250,7 +250,7 @@ p A real-valued meaning representation.
 +table(["Name", "Type", "Description"])
     +footrow
         +cell returns
-        +cell #[code numpy.ndarray[ndim=1, dtype='float32']]
+        +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
         +cell A 1D numpy array representing the token's semantics.
 
 +h(2, "vector_norm") Token.vector_norm