mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 01:48:04 +03:00 
			
		
		
		
	Merge pull request #5599 from adrianeboyd/docs/v2.3.0-minor
This commit is contained in:
		
						commit
						6d712f3e06
					
				| 
						 | 
					@ -36,7 +36,7 @@ for token in doc:
 | 
				
			||||||
| Text    | Lemma   | POS     | Tag   | Dep        | Shape   | alpha   | stop    |
 | 
					| Text    | Lemma   | POS     | Tag   | Dep        | Shape   | alpha   | stop    |
 | 
				
			||||||
| ------- | ------- | ------- | ----- | ---------- | ------- | ------- | ------- |
 | 
					| ------- | ------- | ------- | ----- | ---------- | ------- | ------- | ------- |
 | 
				
			||||||
| Apple   | apple   | `PROPN` | `NNP` | `nsubj`    | `Xxxxx` | `True`  | `False` |
 | 
					| Apple   | apple   | `PROPN` | `NNP` | `nsubj`    | `Xxxxx` | `True`  | `False` |
 | 
				
			||||||
| is      | be      | `VERB`  | `VBZ` | `aux`      | `xx`    | `True`  | `True`  |
 | 
					| is      | be      | `AUX`   | `VBZ` | `aux`      | `xx`    | `True`  | `True`  |
 | 
				
			||||||
| looking | look    | `VERB`  | `VBG` | `ROOT`     | `xxxx`  | `True`  | `False` |
 | 
					| looking | look    | `VERB`  | `VBG` | `ROOT`     | `xxxx`  | `True`  | `False` |
 | 
				
			||||||
| at      | at      | `ADP`   | `IN`  | `prep`     | `xx`    | `True`  | `True`  |
 | 
					| at      | at      | `ADP`   | `IN`  | `prep`     | `xx`    | `True`  | `True`  |
 | 
				
			||||||
| buying  | buy     | `VERB`  | `VBG` | `pcomp`    | `xxxx`  | `True`  | `False` |
 | 
					| buying  | buy     | `VERB`  | `VBG` | `pcomp`    | `xxxx`  | `True`  | `False` |
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -662,7 +662,7 @@ One thing to keep in mind is that spaCy expects to train its models from **whole
 | 
				
			||||||
documents**, not just single sentences. If your corpus only contains single
 | 
					documents**, not just single sentences. If your corpus only contains single
 | 
				
			||||||
sentences, spaCy's models will never learn to expect multi-sentence documents,
 | 
					sentences, spaCy's models will never learn to expect multi-sentence documents,
 | 
				
			||||||
leading to low performance on real text. To mitigate this problem, you can use
 | 
					leading to low performance on real text. To mitigate this problem, you can use
 | 
				
			||||||
the `-N` argument to the `spacy convert` command, to merge some of the sentences
 | 
					the `-n` argument to the `spacy convert` command, to merge some of the sentences
 | 
				
			||||||
into longer pseudo-documents.
 | 
					into longer pseudo-documents.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
### Training the tagger and parser {#train-tagger-parser}
 | 
					### Training the tagger and parser {#train-tagger-parser}
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -471,7 +471,7 @@ doc = nlp.make_doc("London is a big city in the United Kingdom.")
 | 
				
			||||||
print("Before", doc.ents)  # []
 | 
					print("Before", doc.ents)  # []
 | 
				
			||||||
 | 
					
 | 
				
			||||||
header = [ENT_IOB, ENT_TYPE]
 | 
					header = [ENT_IOB, ENT_TYPE]
 | 
				
			||||||
attr_array = numpy.zeros((len(doc), len(header)))
 | 
					attr_array = numpy.zeros((len(doc), len(header)), dtype="uint64")
 | 
				
			||||||
attr_array[0, 0] = 3  # B
 | 
					attr_array[0, 0] = 3  # B
 | 
				
			||||||
attr_array[0, 1] = doc.vocab.strings["GPE"]
 | 
					attr_array[0, 1] = doc.vocab.strings["GPE"]
 | 
				
			||||||
doc.from_array(header, attr_array)
 | 
					doc.from_array(header, attr_array)
 | 
				
			||||||
| 
						 | 
					@ -1143,9 +1143,9 @@ from spacy.gold import align
 | 
				
			||||||
other_tokens = ["i", "listened", "to", "obama", "'", "s", "podcasts", "."]
 | 
					other_tokens = ["i", "listened", "to", "obama", "'", "s", "podcasts", "."]
 | 
				
			||||||
spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts", "."]
 | 
					spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts", "."]
 | 
				
			||||||
cost, a2b, b2a, a2b_multi, b2a_multi = align(other_tokens, spacy_tokens)
 | 
					cost, a2b, b2a, a2b_multi, b2a_multi = align(other_tokens, spacy_tokens)
 | 
				
			||||||
print("Misaligned tokens:", cost)  # 2
 | 
					print("Edit distance:", cost)  # 3
 | 
				
			||||||
print("One-to-one mappings a -> b", a2b)  # array([0, 1, 2, 3, -1, -1, 5, 6])
 | 
					print("One-to-one mappings a -> b", a2b)  # array([0, 1, 2, 3, -1, -1, 5, 6])
 | 
				
			||||||
print("One-to-one mappings b -> a", b2a)  # array([0, 1, 2, 3, 5, 6, 7])
 | 
					print("One-to-one mappings b -> a", b2a)  # array([0, 1, 2, 3, -1, 6, 7])
 | 
				
			||||||
print("Many-to-one mappings a -> b", a2b_multi)  # {4: 4, 5: 4}
 | 
					print("Many-to-one mappings a -> b", a2b_multi)  # {4: 4, 5: 4}
 | 
				
			||||||
print("Many-to-one mappings b-> a", b2a_multi)  # {}
 | 
					print("Many-to-one mappings b-> a", b2a_multi)  # {}
 | 
				
			||||||
```
 | 
					```
 | 
				
			||||||
| 
						 | 
					@ -1153,7 +1153,7 @@ print("Many-to-one mappings b-> a", b2a_multi)  # {}
 | 
				
			||||||
Here are some insights from the alignment information generated in the example
 | 
					Here are some insights from the alignment information generated in the example
 | 
				
			||||||
above:
 | 
					above:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
- Two tokens are misaligned.
 | 
					- The edit distance (cost) is `3`: two deletions and one insertion.
 | 
				
			||||||
- The one-to-one mappings for the first four tokens are identical, which means
 | 
					- The one-to-one mappings for the first four tokens are identical, which means
 | 
				
			||||||
  they map to each other. This makes sense because they're also identical in the
 | 
					  they map to each other. This makes sense because they're also identical in the
 | 
				
			||||||
  input: `"i"`, `"listened"`, `"to"` and `"obama"`.
 | 
					  input: `"i"`, `"listened"`, `"to"` and `"obama"`.
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1158,17 +1158,17 @@ what you need for your application.
 | 
				
			||||||
> available corpus.
 | 
					> available corpus.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
For example, the corpus spaCy's [English models](/models/en) were trained on
 | 
					For example, the corpus spaCy's [English models](/models/en) were trained on
 | 
				
			||||||
defines a `PERSON` entity as just the **person name**, without titles like "Mr"
 | 
					defines a `PERSON` entity as just the **person name**, without titles like "Mr."
 | 
				
			||||||
or "Dr". This makes sense, because it makes it easier to resolve the entity type
 | 
					or "Dr.". This makes sense, because it makes it easier to resolve the entity
 | 
				
			||||||
back to a knowledge base. But what if your application needs the full names,
 | 
					type back to a knowledge base. But what if your application needs the full
 | 
				
			||||||
_including_ the titles?
 | 
					names, _including_ the titles?
 | 
				
			||||||
 | 
					
 | 
				
			||||||
```python
 | 
					```python
 | 
				
			||||||
### {executable="true"}
 | 
					### {executable="true"}
 | 
				
			||||||
import spacy
 | 
					import spacy
 | 
				
			||||||
 | 
					
 | 
				
			||||||
nlp = spacy.load("en_core_web_sm")
 | 
					nlp = spacy.load("en_core_web_sm")
 | 
				
			||||||
doc = nlp("Dr Alex Smith chaired first board meeting of Acme Corp Inc.")
 | 
					doc = nlp("Dr. Alex Smith chaired first board meeting of Acme Corp Inc.")
 | 
				
			||||||
print([(ent.text, ent.label_) for ent in doc.ents])
 | 
					print([(ent.text, ent.label_) for ent in doc.ents])
 | 
				
			||||||
```
 | 
					```
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1233,7 +1233,7 @@ def expand_person_entities(doc):
 | 
				
			||||||
# Add the component after the named entity recognizer
 | 
					# Add the component after the named entity recognizer
 | 
				
			||||||
nlp.add_pipe(expand_person_entities, after='ner')
 | 
					nlp.add_pipe(expand_person_entities, after='ner')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
doc = nlp("Dr Alex Smith chaired first board meeting of Acme Corp Inc.")
 | 
					doc = nlp("Dr. Alex Smith chaired first board meeting of Acme Corp Inc.")
 | 
				
			||||||
print([(ent.text, ent.label_) for ent in doc.ents])
 | 
					print([(ent.text, ent.label_) for ent in doc.ents])
 | 
				
			||||||
```
 | 
					```
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in New Issue
	
	Block a user