mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 10:16:27 +03:00
Merge pull request #5599 from adrianeboyd/docs/v2.3.0-minor
This commit is contained in:
commit
6d712f3e06
|
@ -36,7 +36,7 @@ for token in doc:
|
||||||
| Text | Lemma | POS | Tag | Dep | Shape | alpha | stop |
|
| Text | Lemma | POS | Tag | Dep | Shape | alpha | stop |
|
||||||
| ------- | ------- | ------- | ----- | ---------- | ------- | ------- | ------- |
|
| ------- | ------- | ------- | ----- | ---------- | ------- | ------- | ------- |
|
||||||
| Apple | apple | `PROPN` | `NNP` | `nsubj` | `Xxxxx` | `True` | `False` |
|
| Apple | apple | `PROPN` | `NNP` | `nsubj` | `Xxxxx` | `True` | `False` |
|
||||||
| is | be | `VERB` | `VBZ` | `aux` | `xx` | `True` | `True` |
|
| is | be | `AUX` | `VBZ` | `aux` | `xx` | `True` | `True` |
|
||||||
| looking | look | `VERB` | `VBG` | `ROOT` | `xxxx` | `True` | `False` |
|
| looking | look | `VERB` | `VBG` | `ROOT` | `xxxx` | `True` | `False` |
|
||||||
| at | at | `ADP` | `IN` | `prep` | `xx` | `True` | `True` |
|
| at | at | `ADP` | `IN` | `prep` | `xx` | `True` | `True` |
|
||||||
| buying | buy | `VERB` | `VBG` | `pcomp` | `xxxx` | `True` | `False` |
|
| buying | buy | `VERB` | `VBG` | `pcomp` | `xxxx` | `True` | `False` |
|
||||||
|
|
|
@ -662,7 +662,7 @@ One thing to keep in mind is that spaCy expects to train its models from **whole
|
||||||
documents**, not just single sentences. If your corpus only contains single
|
documents**, not just single sentences. If your corpus only contains single
|
||||||
sentences, spaCy's models will never learn to expect multi-sentence documents,
|
sentences, spaCy's models will never learn to expect multi-sentence documents,
|
||||||
leading to low performance on real text. To mitigate this problem, you can use
|
leading to low performance on real text. To mitigate this problem, you can use
|
||||||
the `-N` argument to the `spacy convert` command, to merge some of the sentences
|
the `-n` argument to the `spacy convert` command, to merge some of the sentences
|
||||||
into longer pseudo-documents.
|
into longer pseudo-documents.
|
||||||
|
|
||||||
### Training the tagger and parser {#train-tagger-parser}
|
### Training the tagger and parser {#train-tagger-parser}
|
||||||
|
|
|
@ -471,7 +471,7 @@ doc = nlp.make_doc("London is a big city in the United Kingdom.")
|
||||||
print("Before", doc.ents) # []
|
print("Before", doc.ents) # []
|
||||||
|
|
||||||
header = [ENT_IOB, ENT_TYPE]
|
header = [ENT_IOB, ENT_TYPE]
|
||||||
attr_array = numpy.zeros((len(doc), len(header)))
|
attr_array = numpy.zeros((len(doc), len(header)), dtype="uint64")
|
||||||
attr_array[0, 0] = 3 # B
|
attr_array[0, 0] = 3 # B
|
||||||
attr_array[0, 1] = doc.vocab.strings["GPE"]
|
attr_array[0, 1] = doc.vocab.strings["GPE"]
|
||||||
doc.from_array(header, attr_array)
|
doc.from_array(header, attr_array)
|
||||||
|
@ -1143,9 +1143,9 @@ from spacy.gold import align
|
||||||
other_tokens = ["i", "listened", "to", "obama", "'", "s", "podcasts", "."]
|
other_tokens = ["i", "listened", "to", "obama", "'", "s", "podcasts", "."]
|
||||||
spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts", "."]
|
spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts", "."]
|
||||||
cost, a2b, b2a, a2b_multi, b2a_multi = align(other_tokens, spacy_tokens)
|
cost, a2b, b2a, a2b_multi, b2a_multi = align(other_tokens, spacy_tokens)
|
||||||
print("Misaligned tokens:", cost) # 2
|
print("Edit distance:", cost) # 3
|
||||||
print("One-to-one mappings a -> b", a2b) # array([0, 1, 2, 3, -1, -1, 5, 6])
|
print("One-to-one mappings a -> b", a2b) # array([0, 1, 2, 3, -1, -1, 5, 6])
|
||||||
print("One-to-one mappings b -> a", b2a) # array([0, 1, 2, 3, 5, 6, 7])
|
print("One-to-one mappings b -> a", b2a) # array([0, 1, 2, 3, -1, 6, 7])
|
||||||
print("Many-to-one mappings a -> b", a2b_multi) # {4: 4, 5: 4}
|
print("Many-to-one mappings a -> b", a2b_multi) # {4: 4, 5: 4}
|
||||||
print("Many-to-one mappings b-> a", b2a_multi) # {}
|
print("Many-to-one mappings b-> a", b2a_multi) # {}
|
||||||
```
|
```
|
||||||
|
@ -1153,7 +1153,7 @@ print("Many-to-one mappings b-> a", b2a_multi) # {}
|
||||||
Here are some insights from the alignment information generated in the example
|
Here are some insights from the alignment information generated in the example
|
||||||
above:
|
above:
|
||||||
|
|
||||||
- Two tokens are misaligned.
|
- The edit distance (cost) is `3`: two deletions and one insertion.
|
||||||
- The one-to-one mappings for the first four tokens are identical, which means
|
- The one-to-one mappings for the first four tokens are identical, which means
|
||||||
they map to each other. This makes sense because they're also identical in the
|
they map to each other. This makes sense because they're also identical in the
|
||||||
input: `"i"`, `"listened"`, `"to"` and `"obama"`.
|
input: `"i"`, `"listened"`, `"to"` and `"obama"`.
|
||||||
|
|
|
@ -1158,17 +1158,17 @@ what you need for your application.
|
||||||
> available corpus.
|
> available corpus.
|
||||||
|
|
||||||
For example, the corpus that spaCy's [English models](/models/en) were trained on
|
For example, the corpus that spaCy's [English models](/models/en) were trained on
|
||||||
defines a `PERSON` entity as just the **person name**, without titles like "Mr"
|
defines a `PERSON` entity as just the **person name**, without titles like "Mr."
|
||||||
or "Dr". This makes sense, because it makes it easier to resolve the entity type
|
or "Dr.". This makes sense, because it makes it easier to resolve the entity
|
||||||
back to a knowledge base. But what if your application needs the full names,
|
type back to a knowledge base. But what if your application needs the full
|
||||||
_including_ the titles?
|
names, _including_ the titles?
|
||||||
|
|
||||||
```python
|
```python
|
||||||
### {executable="true"}
|
### {executable="true"}
|
||||||
import spacy
|
import spacy
|
||||||
|
|
||||||
nlp = spacy.load("en_core_web_sm")
|
nlp = spacy.load("en_core_web_sm")
|
||||||
doc = nlp("Dr Alex Smith chaired first board meeting of Acme Corp Inc.")
|
doc = nlp("Dr. Alex Smith chaired first board meeting of Acme Corp Inc.")
|
||||||
print([(ent.text, ent.label_) for ent in doc.ents])
|
print([(ent.text, ent.label_) for ent in doc.ents])
|
||||||
```
|
```
|
||||||
|
|
||||||
|
@ -1233,7 +1233,7 @@ def expand_person_entities(doc):
|
||||||
# Add the component after the named entity recognizer
|
# Add the component after the named entity recognizer
|
||||||
nlp.add_pipe(expand_person_entities, after='ner')
|
nlp.add_pipe(expand_person_entities, after='ner')
|
||||||
|
|
||||||
doc = nlp("Dr Alex Smith chaired first board meeting of Acme Corp Inc.")
|
doc = nlp("Dr. Alex Smith chaired first board meeting of Acme Corp Inc.")
|
||||||
print([(ent.text, ent.label_) for ent in doc.ents])
|
print([(ent.text, ent.label_) for ent in doc.ents])
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user