mirror of https://github.com/explosion/spaCy.git
synced 2025-01-27 09:44:36 +03:00
Merge branch 'master' into organize-language-data
This commit is contained in:
commit 4edf7057ee
.gitignore (vendored): 5 changes
@@ -29,10 +29,7 @@ spacy/orthography/*.cpp
 ext/murmurhash.cpp
 ext/sparsehash.cpp
 
-data/en/pos
-data/en/ner
-data/en/lexemes
-data/en/strings
+/spacy/data/
 
 _build/
 .env/
@@ -1,31 +0,0 @@
-def write_parameter(outfile, feats):
-    """
-    From https://github.com/baidu/Paddle/issues/490
-
-    outfile: Output file name with string type. **Note**, it should be the same as it in the above config.
-    feats: Parameter with float type.
-    """
-    version = 0
-    value_size = 4; # means float type
-    ret = b""
-    for feat in feats:
-        ret += feat.tostring()
-    size = len(ret) / 4
-    fo = open(outfile, 'wb')
-    fo.write(struct.pack('iIQ', version, value_size, size))
-    fo.write(ret)
-
-
-# config=trainer_config.py
-# output=./model_output
-# paddle train --config=$config \
-#     --save_dir=$output \
-#     --job=train \
-#     --use_gpu=false \
-#     --trainer_count=4 \
-#     --num_passes=10 \
-#     --log_period=20 \
-#     --dot_period=20 \
-#     --show_parameter_stats_period=100 \
-#     --test_all_data_in_one_period=1 \
-#     2>&1 | tee 'train.log'
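The deleted helper serialized a Paddle parameter as a small header (int32 version, uint32 value size, uint64 element count, packed as 'iIQ') followed by the raw float bytes. For orientation, a minimal sketch of the inverse operation, assuming that same header layout; read_parameter is not part of this commit:

    import struct

    import numpy


    def read_parameter(infile):
        # Inverse of write_parameter above: unpack the 'iIQ' header,
        # then read the remaining bytes as float32 values.
        with open(infile, 'rb') as f:
            version, value_size, size = struct.unpack(
                'iIQ', f.read(struct.calcsize('iIQ')))
            assert value_size == 4  # float32, matching write_parameter
            return numpy.fromfile(f, dtype='float32', count=size)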
@@ -1,44 +1,14 @@
-from paddle.trainer.PyDataProvider2 import *
-from itertools import izip
+from paddle.trainer_config_helpers import *
 
+define_py_data_sources2(train_list='train.list',
+                        test_list='test.list',
+                        module="dataprovider",
+                        obj="process")
 
-def get_features(doc):
-    return numpy.asarray(
-        [t.rank+1 for t in doc
-         if t.has_vector and not t.is_punct and not t.is_space],
-        dtype='int32')
-
-
-def read_data(data_dir):
-    for subdir, label in (('pos', 1), ('neg', 0)):
-        for filename in (data_dir / subdir).iterdir():
-            with filename.open() as file_:
-                text = file_.read()
-            yield text, label
-
-
-def on_init(settings, lang_name, **kwargs):
-    print("Loading spaCy")
-    nlp = spacy.load('en', entity=False)
-    vectors = get_vectors(nlp)
-    settings.input_types = [
-        # The text is a sequence of integer values, and each value is a word id.
-        # The whole sequence is the sentences that we want to predict its
-        # sentimental.
-        integer_value(vectors.shape[0], seq_type=SequenceType), # text input
-
-        # label positive/negative
-        integer_value(2)
-    ]
-    settings.nlp = nlp
-    settings.vectors = vectors
-
-
-@provider(init_hook=on_init)
-def process(settings, data_dir): # settings is not used currently.
-    texts, labels = read_data(data_dir)
-    for doc, label in izip(nlp.pipe(texts, batch_size=5000, n_threads=3), labels):
-        for sent in doc.sents:
-            ids = get_features(sent)
-            # give data to paddle.
-            yield ids, label
+settings(
+    batch_size=128,
+    learning_rate=2e-3,
+    learning_method=AdamOptimizer(),
+    regularization=L2Regularization(8e-4),
+    gradient_clipping_threshold=25
+)
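(How the pieces connect: define_py_data_sources2 above names module="dataprovider" and obj="process", so at training time Paddle imports the new dataprovider.py shown below and draws minibatches from its @provider-decorated process() generator.)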
examples/paddle/sentiment_bilstm/dataprovider.py (new file): 46 lines added
@@ -0,0 +1,46 @@
+from paddle.trainer.PyDataProvider2 import *
+from itertools import izip
+import spacy
+
+
+def get_features(doc):
+    return numpy.asarray(
+        [t.rank+1 for t in doc
+         if t.has_vector and not t.is_punct and not t.is_space],
+        dtype='int32')
+
+
+def read_data(data_dir):
+    for subdir, label in (('pos', 1), ('neg', 0)):
+        for filename in (data_dir / subdir).iterdir():
+            with filename.open() as file_:
+                text = file_.read()
+            yield text, label
+
+
+def on_init(settings, **kwargs):
+    print("Loading spaCy")
+    nlp = spacy.load('en', entity=False)
+    vectors = get_vectors(nlp)
+    settings.input_types = [
+        # The text is a sequence of integer values, and each value is a word id.
+        # The whole sequence is the sentences that we want to predict its
+        # sentimental.
+        integer_value(vectors.shape[0], seq_type=SequenceType), # text input
+
+        # label positive/negative
+        integer_value(2)
+    ]
+    settings.nlp = nlp
+    settings.vectors = vectors
+    settings['batch_size'] = 32
+
+
+@provider(init_hook=on_init)
+def process(settings, data_dir): # settings is not used currently.
+    texts, labels = read_data(data_dir)
+    for doc, label in izip(nlp.pipe(texts, batch_size=5000, n_threads=3), labels):
+        for sent in doc.sents:
+            ids = get_features(sent)
+            # give data to paddle.
+            yield ids, label
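A few caveats in dataprovider.py as committed: get_vectors() is called but never defined or imported, numpy is used without an import, and process() refers to a global nlp rather than settings.nlp, so the provider would fail at runtime. A minimal sketch of what get_vectors could look like, using the spaCy 1.x lexeme attributes the file already relies on (the helper itself is an assumption, not part of the commit):

    import numpy


    def get_vectors(nlp):
        # Sketch only: embedding table indexed by rank+1, so it lines up
        # with the t.rank+1 ids produced by get_features; row 0 is padding.
        vocab = nlp.vocab
        max_rank = max(lex.rank for lex in vocab if lex.has_vector)
        vectors = numpy.zeros((max_rank + 2, vocab.vectors_length),
                              dtype='float32')
        for lex in vocab:
            if lex.has_vector:
                vectors[lex.rank + 1] = lex.vector
        return vectors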
examples/paddle/sentiment_bilstm/train.sh (new executable file): 14 lines added
@@ -0,0 +1,14 @@
+config=config.py
+output=./model_output
+paddle train --config=$config \
+    --save_dir=$output \
+    --job=train \
+    --use_gpu=false \
+    --trainer_count=4 \
+    --num_passes=10 \
+    --log_period=20 \
+    --dot_period=20 \
+    --show_parameter_stats_period=100 \
+    --test_all_data_in_one_period=1 \
+    --config_args=batch_size=100 \
+    2>&1 | tee 'train.log'
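The --config_args=batch_size=100 flag forwards a key=value pair to the config script, but the settings() block in this commit hard-codes batch_size=128, so the flag has no effect unless the config reads it. In Paddle v1 configs that is conventionally done with get_config_arg; a hedged sketch of such a change (assumed usage, not in this commit):

    # Let --config_args=batch_size=... override the default of 128.
    batch_size = get_config_arg('batch_size', int, 128)

    settings(
        batch_size=batch_size,
        learning_rate=2e-3,
        learning_method=AdamOptimizer(),
        regularization=L2Regularization(8e-4),
        gradient_clipping_threshold=25
    )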
@@ -7,7 +7,7 @@ p
   | runs on #[strong Unix/Linux], #[strong macOS/OS X] and
   | #[strong Windows]. The latest spaCy releases are currently only
   | available as source packages over
-  | #[+a("https://pypi.python.org/pypi/spacy") pip]. Installaton requires a
+  | #[+a("https://pypi.python.org/pypi/spacy") pip]. Installation requires a
   | working build environment. See notes on
   | #[a(href="#source-ubuntu") Ubuntu], #[a(href="#source-osx") macOS/OS X]
   | and #[a(href="#source-windows") Windows] for details.
@@ -31,7 +31,8 @@ p
 p
   | I've tried to make sure that the #[code Language.__call__] function
   | doesn't do any "heavy lifting", so that you won't have complicated logic
-  | to replicate if you need to make your own pipeline class. This is all it | does.
+  | to replicate if you need to make your own pipeline class. This is all it
+  | does.
 
 p
   | The #[code .make_doc()] method and #[code .pipeline] attribute make it
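For context, the "this is all it does" passage refers to a pipeline loop along these lines (a paraphrase of the spaCy 1.x Language.__call__, not a verbatim copy of it):

    def __call__(self, text, tag=True, parse=True, entity=True):
        # Make the Doc, then hand it to each enabled pipeline
        # component in order; no other work happens here.
        doc = self.make_doc(text)
        skip = {self.tagger: not tag,
                self.parser: not parse,
                self.entity: not entity}
        for proc in self.pipeline:
            if proc and not skip.get(proc, False):
                proc(doc)
        return doc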