mirror of https://github.com/explosion/spaCy.git
Update examples
parent 1b1c9105b4
commit 173b1551af
@@ -1,18 +1,24 @@
-import plac
-import collections
-import random
+"""
+This example shows how to use an LSTM sentiment classification model trained
+using Keras in spaCy. spaCy splits the document into sentences, and each
+sentence is classified using the LSTM. The scores for the sentences are then
+aggregated to give the document score. This kind of hierarchical model is
+quite difficult in "pure" Keras or Tensorflow, but it's very effective. The
+Keras example on this dataset performs quite poorly, because it cuts off the
+documents so that they're a fixed size. This hurts review accuracy a lot,
+because people often summarise their rating in the final sentence.
+
+Prerequisites:
+spacy download en_vectors_web_lg
+pip install keras==2.0.9
+
+Compatible with: spaCy v2.0.0+
+"""
+
+import plac
+import random
 import pathlib
 import cytoolz
 import numpy
 from keras.models import Sequential, model_from_json
-from keras.layers import LSTM, Dense, Embedding, Dropout, Bidirectional
+from keras.layers import LSTM, Dense, Embedding, Bidirectional
 from keras.layers import TimeDistributed
 from keras.optimizers import Adam
-from spacy.compat import pickle
 import thinc.extra.datasets
+from spacy.compat import pickle
 import spacy
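The new docstring describes a hierarchical scheme: score each sentence, then aggregate to a document score. A minimal sketch of that idea, not part of the commit; sentence_score is a hypothetical stand-in for the per-sentence LSTM:

import spacy

def doc_score(doc, sentence_score):
    # Score each sentence separately, then average the sentence scores to
    # get the document score, so long reviews are never truncated.
    scores = [sentence_score(sent) for sent in doc.sents]
    return sum(scores) / len(scores)

nlp = spacy.blank('en')
nlp.add_pipe(nlp.create_pipe('sentencizer'))  # rule-based sentence boundaries
doc = nlp("The plot dragged. The acting was wooden. Still, I loved it.")
print(doc_score(doc, sentence_score=lambda sent: 0.5))  # dummy scorer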
@@ -84,8 +90,8 @@ def get_features(docs, max_length):

 def train(train_texts, train_labels, dev_texts, dev_labels,
-          lstm_shape, lstm_settings, lstm_optimizer, batch_size=100, nb_epoch=5,
-          by_sentence=True):
+          lstm_shape, lstm_settings, lstm_optimizer, batch_size=100,
+          nb_epoch=5, by_sentence=True):
     print("Loading spaCy")
     nlp = spacy.load('en_vectors_web_lg')
     nlp.add_pipe(nlp.create_pipe('sentencizer'))
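train() loads the vectors-only en_vectors_web_lg model, which has no parser, so it adds the rule-based sentencizer to get sentence boundaries. A quick check of what that pipe does, as a sketch assuming the model is downloaded:

import spacy

nlp = spacy.load('en_vectors_web_lg')
nlp.add_pipe(nlp.create_pipe('sentencizer'))
doc = nlp("This movie was great. I'd watch it again!")
print([sent.text for sent in doc.sents])
# ['This movie was great.', "I'd watch it again!"]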
@@ -97,7 +103,7 @@ def train(train_texts, train_labels, dev_texts, dev_labels,
     if by_sentence:
         train_docs, train_labels = get_labelled_sentences(train_docs, train_labels)
         dev_docs, dev_labels = get_labelled_sentences(dev_docs, dev_labels)

     train_X = get_features(train_docs, lstm_shape['max_length'])
     dev_X = get_features(dev_docs, lstm_shape['max_length'])
     model.fit(train_X, train_labels, validation_data=(dev_X, dev_labels),
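For context, get_labelled_sentences() is defined earlier in the file and not shown in this diff; plausibly it works like the sketch below, where every sentence inherits the label of its document, so the LSTM trains on sentences rather than whole documents:

import numpy

def get_labelled_sentences_sketch(docs, doc_labels):
    sentences, labels = [], []
    for doc, y in zip(docs, doc_labels):
        for sent in doc.sents:
            sentences.append(sent)  # sentence span
            labels.append(y)        # inherits the document-level label
    return sentences, numpy.asarray(labels, dtype='int32')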
@@ -138,12 +144,12 @@ def evaluate(model_dir, texts, labels, max_length=100):
     '''
     return [nlp.tagger, nlp.parser, SentimentAnalyser.load(model_dir, nlp,
                                                            max_length=max_length)]

     nlp = spacy.load('en')
     nlp.pipeline = create_pipeline(nlp)

     correct = 0
     i = 0
     for doc in nlp.pipe(texts, batch_size=1000, n_threads=4):
         correct += bool(doc.sentiment >= 0.5) == bool(labels[i])
         i += 1
@@ -6,7 +6,7 @@ money and currency values (entities labelled as MONEY) and then check the
 dependency tree to find the noun phrase they are referring to – for example:
 $9.4 million --> Net income.

-Compatible with: spaCy 2.0.0a18+
+Compatible with: spaCy v2.0.0+
 """
 from __future__ import unicode_literals, print_function
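A hedged sketch of the dependency-tree walk this docstring describes: hop from the MONEY entity to its head, then to the subject. The full example also merges noun chunks first, so the subject covers the whole phrase ("Net income" rather than "income"), and handles prepositional attachment:

import spacy

nlp = spacy.load('en_core_web_sm')  # assumes an English model with parser + NER
doc = nlp("Net income was $9.4 million compared to the prior year.")
for money in doc.ents:
    if money.label_ != 'MONEY':
        continue
    # "$9.4 million" is an attribute of "was"; "was" has the subject "income"
    subjects = [w for w in money.root.head.lefts if w.dep_ == 'nsubj']
    if subjects:
        print(money.text, '-->', subjects[0].text)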
@@ -16,7 +16,7 @@ show you how computers understand [language]

 I'm assuming that we can use the token.head to build these groups."

-Compatible with: spaCy 2.0.0a18+
+Compatible with: spaCy v2.0.0+
 """
 from __future__ import unicode_literals, print_function
@@ -34,7 +34,7 @@ formatted in jsonl as a sequence of entries like this:
 {"text":"Appalachia"}
 {"text":"Argentina"}

-Compatible with: spaCy 2.0.0a17+
+Compatible with: spaCy v2.0.0+
 """
 from __future__ import print_function, unicode_literals, division
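A minimal sketch of consuming a gazetteer in this jsonl format with spaCy's PhraseMatcher; the real example streams a much larger file from disk:

import json
import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.blank('en')
lines = ['{"text":"Appalachia"}', '{"text":"Argentina"}']
patterns = [nlp(json.loads(line)['text']) for line in lines]

matcher = PhraseMatcher(nlp.vocab)
matcher.add('PLACES', None, *patterns)
doc = nlp("The band toured Appalachia before heading to Argentina.")
print([doc[start:end].text for match_id, start, end in matcher(doc)])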
@@ -7,7 +7,7 @@ they're called on is passed in as the first argument.

 * Custom pipeline components: https://alpha.spacy.io//usage/processing-pipelines#custom-components

-Compatible with: spaCy 2.0.0a17+
+Compatible with: spaCy v2.0.0+
 """
 from __future__ import unicode_literals, print_function
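The custom-component pattern these examples document, in its simplest form (a sketch, not from the commit): a component is any callable that receives the Doc it's called on as the first argument and returns it.

import spacy

def print_length(doc):
    # The Doc the pipeline is processing is passed in as the first argument.
    print("Doc length:", len(doc))
    return doc

nlp = spacy.blank('en')
nlp.add_pipe(print_length, last=True)
nlp("Hello world")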
@@ -8,7 +8,7 @@ coordinates. Can be extended with more details from the API.
 * REST Countries API: https://restcountries.eu (Mozilla Public License MPL 2.0)
 * Custom pipeline components: https://alpha.spacy.io//usage/processing-pipelines#custom-components

-Compatible with: spaCy 2.0.0a17+
+Compatible with: spaCy v2.0.0+
 """
 from __future__ import unicode_literals, print_function
@@ -8,7 +8,7 @@ respectively.

 * Custom pipeline components: https://alpha.spacy.io//usage/processing-pipelines#custom-components

-Compatible with: spaCy 2.0.0a17+
+Compatible with: spaCy v2.0.0+
 """
 from __future__ import unicode_literals, print_function
@@ -6,7 +6,7 @@ each "sentence" on a newline, and spaces between tokens. Data is loaded from
 the IMDB movie reviews dataset and will be loaded automatically via Thinc's
 built-in dataset loader.

-Compatible with: spaCy 2.0.0a18+
+Compatible with: spaCy v2.0.0+
 """
 from __future__ import print_function, unicode_literals
 from toolz import partition_all
@@ -15,7 +15,7 @@ following types of relations: ROOT, PLACE, QUALITY, ATTRIBUTE, TIME, LOCATION.
 ('hotel', 'PLACE', 'show') --> show PLACE hotel
 ('berlin', 'LOCATION', 'hotel') --> hotel with LOCATION berlin

-Compatible with: spaCy 2.0.0a20+
+Compatible with: spaCy v2.0.0+
 """
 from __future__ import unicode_literals, print_function
@@ -7,7 +7,7 @@ For more details, see the documentation:
 * Training: https://alpha.spacy.io/usage/training
 * NER: https://alpha.spacy.io/usage/linguistic-features#named-entities

-Compatible with: spaCy 2.0.0a20+
+Compatible with: spaCy v2.0.0+
 """
 from __future__ import unicode_literals, print_function
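For readers following the Training docs link above, a minimal hedged sketch of the v2 update loop these NER examples build on; the data and label here are illustrative:

import random
import spacy

TRAIN_DATA = [
    ("Uber blew through $1 million a week", {'entities': [(0, 4, 'ORG')]}),
]

nlp = spacy.blank('en')
ner = nlp.create_pipe('ner')
nlp.add_pipe(ner)
ner.add_label('ORG')

optimizer = nlp.begin_training()
for itn in range(10):
    random.shuffle(TRAIN_DATA)
    losses = {}
    for text, annotations in TRAIN_DATA:
        # update on one example at a time, tracking the NER loss
        nlp.update([text], [annotations], sgd=optimizer, drop=0.35, losses=losses)
    print(losses)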
@@ -23,7 +23,7 @@ For more details, see the documentation:
 * Training: https://alpha.spacy.io/usage/training
 * NER: https://alpha.spacy.io/usage/linguistic-features#named-entities

-Compatible with: spaCy 2.0.0a20+
+Compatible with: spaCy v2.0.0+
 """
 from __future__ import unicode_literals, print_function
@@ -5,7 +5,7 @@ model or a blank model. For more details, see the documentation:
 * Training: https://alpha.spacy.io/usage/training
 * Dependency Parse: https://alpha.spacy.io/usage/linguistic-features#dependency-parse

-Compatible with: spaCy 2.0.0a20+
+Compatible with: spaCy v2.0.0+
 """
 from __future__ import unicode_literals, print_function
@@ -8,7 +8,7 @@ the documentation:
 * Training: https://alpha.spacy.io/usage/training
 * POS Tagging: https://alpha.spacy.io/usage/linguistic-features#pos-tagging

-Compatible with: spaCy 2.0.0a20+
+Compatible with: spaCy v2.0.0+
 """
 from __future__ import unicode_literals, print_function
@@ -8,7 +8,7 @@ see the documentation:
 * Training: https://alpha.spacy.io/usage/training
 * Text classification: https://alpha.spacy.io/usage/text-classification

-Compatible with: spaCy 2.0.0a20+
+Compatible with: spaCy v2.0.0+
 """
 from __future__ import unicode_literals, print_function
 import plac
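And the equivalent wiring for the text classifier, as a hedged sketch; the real example trains on the IMDB data loaded via Thinc:

import spacy

nlp = spacy.blank('en')
textcat = nlp.create_pipe('textcat')
nlp.add_pipe(textcat, last=True)
textcat.add_label('POSITIVE')

optimizer = nlp.begin_training()
losses = {}
# gold annotations for textcat go under the 'cats' key
nlp.update(["This movie was great"], [{'cats': {'POSITIVE': 1.0}}],
           sgd=optimizer, losses=losses)
print(losses)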
@@ -2,7 +2,7 @@
 # coding: utf8
 """Load vectors for a language trained using fastText
 https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md

-Compatible with: spaCy v2.0.0a17+
+Compatible with: spaCy v2.0.0+
 """
 from __future__ import unicode_literals
 import plac
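The fastText loader boils down to reading the .vec text format (first line: vector count and width) and registering each row with the vocab. A hedged sketch of that core; the real script also handles gzip input and command-line arguments via plac:

from __future__ import unicode_literals
import numpy
import spacy

def load_vectors(nlp, path):
    with open(path, 'rb') as file_:
        header = file_.readline()  # "<n_vectors> <n_dims>"
        n_vectors, n_dims = (int(size) for size in header.split())
        for line in file_:
            pieces = line.rstrip().split(b' ')
            word = pieces[0].decode('utf8')
            vector = numpy.asarray([float(v) for v in pieces[1:]], dtype='f')
            assert len(vector) == n_dims
            nlp.vocab.set_vector(word, vector)  # add the vector for this word

nlp = spacy.blank('en')
# load_vectors(nlp, 'wiki.en.vec')  # path to a downloaded fastText file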
@@ -165,18 +165,15 @@ include ../_includes/_mixins
 +h(3, "keras") Text classification with Keras

 p
-  | In this example, we're using spaCy to pre-process text for use with
-  | a #[+a("https://keras.io") Keras] text classification model.
+  | This example shows how to use a #[+a("https://keras.io") Keras]
+  | LSTM sentiment classification model in spaCy. spaCy splits
+  | the document into sentences, and each sentence is classified using
+  | the LSTM. The scores for the sentences are then aggregated to give
+  | the document score. This kind of hierarchical model is quite
+  | difficult in "pure" Keras or Tensorflow, but it's very effective.
+  | The Keras example on this dataset performs quite poorly, because it
+  | cuts off the documents so that they're a fixed size. This hurts
+  | review accuracy a lot, because people often summarise their rating
+  | in the final sentence.

 +github("spacy", "examples/deep_learning_keras.py")

 +h(3, "keras-parikh-entailment") A decomposable attention model for Natural Language Inference

 p
   | This example contains an implementation of the entailment prediction
   | model described by #[+a("https://arxiv.org/pdf/1606.01933.pdf") Parikh et al. (2016)].
   | The model is notable for its competitive performance with very few
   | parameters, and was implemented using #[+a("https://keras.io") Keras]
   | and spaCy.

 +github("spacy", "examples/keras_parikh_entailment/__main__.py", false, "examples/keras_parikh_entailment")