mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-31 07:57:35 +03:00 
			
		
		
		
	Merge branch 'master' into spacy.io
This commit is contained in:
		
						commit
						959bc616dd
					
				|  | @ -36,7 +36,7 @@ for token in doc: | |||
| | Text    | Lemma   | POS     | Tag   | Dep        | Shape   | alpha   | stop    | | ||||
| | ------- | ------- | ------- | ----- | ---------- | ------- | ------- | ------- | | ||||
| | Apple   | apple   | `PROPN` | `NNP` | `nsubj`    | `Xxxxx` | `True`  | `False` | | ||||
| | is      | be      | `VERB`  | `VBZ` | `aux`      | `xx`    | `True`  | `True`  | | ||||
| | is      | be      | `AUX`   | `VBZ` | `aux`      | `xx`    | `True`  | `True`  | | ||||
| | looking | look    | `VERB`  | `VBG` | `ROOT`     | `xxxx`  | `True`  | `False` | | ||||
| | at      | at      | `ADP`   | `IN`  | `prep`     | `xx`    | `True`  | `True`  | | ||||
| | buying  | buy     | `VERB`  | `VBG` | `pcomp`    | `xxxx`  | `True`  | `False` | | ||||
|  |  | |||
|  | @ -662,7 +662,7 @@ One thing to keep in mind is that spaCy expects to train its models from **whole | |||
| documents**, not just single sentences. If your corpus only contains single | ||||
| sentences, spaCy's models will never learn to expect multi-sentence documents, | ||||
| leading to low performance on real text. To mitigate this problem, you can use | ||||
| the `-N` argument to the `spacy convert` command, to merge some of the sentences | ||||
| the `-n` argument to the `spacy convert` command, to merge some of the sentences | ||||
| into longer pseudo-documents. | ||||
| 
 | ||||
| ### Training the tagger and parser {#train-tagger-parser} | ||||
|  |  | |||
|  | @ -471,7 +471,7 @@ doc = nlp.make_doc("London is a big city in the United Kingdom.") | |||
| print("Before", doc.ents)  # [] | ||||
| 
 | ||||
| header = [ENT_IOB, ENT_TYPE] | ||||
| attr_array = numpy.zeros((len(doc), len(header))) | ||||
| attr_array = numpy.zeros((len(doc), len(header)), dtype="uint64") | ||||
| attr_array[0, 0] = 3  # B | ||||
| attr_array[0, 1] = doc.vocab.strings["GPE"] | ||||
| doc.from_array(header, attr_array) | ||||
|  | @ -1143,9 +1143,9 @@ from spacy.gold import align | |||
| other_tokens = ["i", "listened", "to", "obama", "'", "s", "podcasts", "."] | ||||
| spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts", "."] | ||||
| cost, a2b, b2a, a2b_multi, b2a_multi = align(other_tokens, spacy_tokens) | ||||
| print("Misaligned tokens:", cost)  # 2 | ||||
| print("Edit distance:", cost)  # 3 | ||||
| print("One-to-one mappings a -> b", a2b)  # array([0, 1, 2, 3, -1, -1, 5, 6]) | ||||
| print("One-to-one mappings b -> a", b2a)  # array([0, 1, 2, 3, 5, 6, 7]) | ||||
| print("One-to-one mappings b -> a", b2a)  # array([0, 1, 2, 3, -1, 6, 7]) | ||||
| print("Many-to-one mappings a -> b", a2b_multi)  # {4: 4, 5: 4} | ||||
| print("Many-to-one mappings b-> a", b2a_multi)  # {} | ||||
| ``` | ||||
|  | @ -1153,7 +1153,7 @@ print("Many-to-one mappings b-> a", b2a_multi)  # {} | |||
| Here are some insights from the alignment information generated in the example | ||||
| above: | ||||
| 
 | ||||
| - Two tokens are misaligned. | ||||
| - The edit distance (cost) is `3`: two deletions and one insertion. | ||||
| - The one-to-one mappings for the first four tokens are identical, which means | ||||
|   they map to each other. This makes sense because they're also identical in the | ||||
|   input: `"i"`, `"listened"`, `"to"` and `"obama"`. | ||||
|  |  | |||
|  | @ -1158,17 +1158,17 @@ what you need for your application. | |||
| > available corpus. | ||||
| 
 | ||||
| For example, the corpus spaCy's [English models](/models/en) were trained on | ||||
| defines a `PERSON` entity as just the **person name**, without titles like "Mr" | ||||
| or "Dr". This makes sense, because it makes it easier to resolve the entity type | ||||
| back to a knowledge base. But what if your application needs the full names, | ||||
| _including_ the titles? | ||||
| defines a `PERSON` entity as just the **person name**, without titles like "Mr." | ||||
| or "Dr.". This makes sense, because it makes it easier to resolve the entity | ||||
| type back to a knowledge base. But what if your application needs the full | ||||
| names, _including_ the titles? | ||||
| 
 | ||||
| ```python | ||||
| ### {executable="true"} | ||||
| import spacy | ||||
| 
 | ||||
| nlp = spacy.load("en_core_web_sm") | ||||
| doc = nlp("Dr Alex Smith chaired first board meeting of Acme Corp Inc.") | ||||
| doc = nlp("Dr. Alex Smith chaired first board meeting of Acme Corp Inc.") | ||||
| print([(ent.text, ent.label_) for ent in doc.ents]) | ||||
| ``` | ||||
| 
 | ||||
|  | @ -1233,7 +1233,7 @@ def expand_person_entities(doc): | |||
| # Add the component after the named entity recognizer | ||||
| nlp.add_pipe(expand_person_entities, after='ner') | ||||
| 
 | ||||
| doc = nlp("Dr Alex Smith chaired first board meeting of Acme Corp Inc.") | ||||
| doc = nlp("Dr. Alex Smith chaired first board meeting of Acme Corp Inc.") | ||||
| print([(ent.text, ent.label_) for ent in doc.ents]) | ||||
| ``` | ||||
| 
 | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	Block a user