diff --git a/.gitignore b/.gitignore index 299c67985..b8a4a2fec 100644 --- a/.gitignore +++ b/.gitignore @@ -29,10 +29,7 @@ spacy/orthography/*.cpp ext/murmurhash.cpp ext/sparsehash.cpp -data/en/pos -data/en/ner -data/en/lexemes -data/en/strings +/spacy/data/ _build/ .env/ diff --git a/examples/paddle/sentiment_bilstm/__main__.py b/examples/paddle/sentiment_bilstm/__main__.py deleted file mode 100644 index 845443d99..000000000 --- a/examples/paddle/sentiment_bilstm/__main__.py +++ /dev/null @@ -1,31 +0,0 @@ -def write_parameter(outfile, feats): - """ - From https://github.com/baidu/Paddle/issues/490 - - outfile: Output file name with string type. **Note**, it should be the same as it in the above config. - feats: Parameter with float type. - """ - version = 0 - value_size = 4; # means float type - ret = b"" - for feat in feats: - ret += feat.tostring() - size = len(ret) / 4 - fo = open(outfile, 'wb') - fo.write(struct.pack('iIQ', version, value_size, size)) - fo.write(ret) - - -# config=trainer_config.py -# output=./model_output -# paddle train --config=$config \ -# --save_dir=$output \ -# --job=train \ -# --use_gpu=false \ -# --trainer_count=4 \ -# --num_passes=10 \ -# --log_period=20 \ -# --dot_period=20 \ -# --show_parameter_stats_period=100 \ -# --test_all_data_in_one_period=1 \ -# 2>&1 | tee 'train.log' diff --git a/examples/paddle/sentiment_bilstm/config.py b/examples/paddle/sentiment_bilstm/config.py index cde30cf61..311359f2c 100644 --- a/examples/paddle/sentiment_bilstm/config.py +++ b/examples/paddle/sentiment_bilstm/config.py @@ -1,44 +1,14 @@ -from paddle.trainer.PyDataProvider2 import * -from itertools import izip +from paddle.trainer_config_helpers import * +define_py_data_sources2(train_list='train.list', + test_list='test.list', + module="dataprovider", + obj="process") -def get_features(doc): - return numpy.asarray( - [t.rank+1 for t in doc - if t.has_vector and not t.is_punct and not t.is_space], - dtype='int32') - - -def read_data(data_dir): - for subdir, label in (('pos', 1), ('neg', 0)): - for filename in (data_dir / subdir).iterdir(): - with filename.open() as file_: - text = file_.read() - yield text, label - - -def on_init(settings, lang_name, **kwargs): - print("Loading spaCy") - nlp = spacy.load('en', entity=False) - vectors = get_vectors(nlp) - settings.input_types = [ - # The text is a sequence of integer values, and each value is a word id. - # The whole sequence is the sentences that we want to predict its - # sentimental. - integer_value(vectors.shape[0], seq_type=SequenceType), # text input - - # label positive/negative - integer_value(2) - ] - settings.nlp = nlp - settings.vectors = vectors - - -@provider(init_hook=on_init) -def process(settings, data_dir): # settings is not used currently. - texts, labels = read_data(data_dir) - for doc, label in izip(nlp.pipe(texts, batch_size=5000, n_threads=3), labels): - for sent in doc.sents: - ids = get_features(sent) - # give data to paddle. - yield ids, label +settings( + batch_size=128, + learning_rate=2e-3, + learning_method=AdamOptimizer(), + regularization=L2Regularization(8e-4), + gradient_clipping_threshold=25 +) diff --git a/examples/paddle/sentiment_bilstm/dataprovider.py b/examples/paddle/sentiment_bilstm/dataprovider.py new file mode 100644 index 000000000..d4fb57756 --- /dev/null +++ b/examples/paddle/sentiment_bilstm/dataprovider.py @@ -0,0 +1,46 @@ +from paddle.trainer.PyDataProvider2 import * +from itertools import izip +import spacy + + +def get_features(doc): + return numpy.asarray( + [t.rank+1 for t in doc + if t.has_vector and not t.is_punct and not t.is_space], + dtype='int32') + + +def read_data(data_dir): + for subdir, label in (('pos', 1), ('neg', 0)): + for filename in (data_dir / subdir).iterdir(): + with filename.open() as file_: + text = file_.read() + yield text, label + + +def on_init(settings, **kwargs): + print("Loading spaCy") + nlp = spacy.load('en', entity=False) + vectors = get_vectors(nlp) + settings.input_types = [ + # The text is a sequence of integer values, and each value is a word id. + # The whole sequence is the sentences that we want to predict its + # sentimental. + integer_value(vectors.shape[0], seq_type=SequenceType), # text input + + # label positive/negative + integer_value(2) + ] + settings.nlp = nlp + settings.vectors = vectors + settings['batch_size'] = 32 + + +@provider(init_hook=on_init) +def process(settings, data_dir): # settings is not used currently. + texts, labels = read_data(data_dir) + for doc, label in izip(nlp.pipe(texts, batch_size=5000, n_threads=3), labels): + for sent in doc.sents: + ids = get_features(sent) + # give data to paddle. + yield ids, label diff --git a/examples/paddle/sentiment_bilstm/train.sh b/examples/paddle/sentiment_bilstm/train.sh new file mode 100755 index 000000000..ffc6dd4bd --- /dev/null +++ b/examples/paddle/sentiment_bilstm/train.sh @@ -0,0 +1,14 @@ +config=config.py +output=./model_output +paddle train --config=$config \ + --save_dir=$output \ + --job=train \ + --use_gpu=false \ + --trainer_count=4 \ + --num_passes=10 \ + --log_period=20 \ + --dot_period=20 \ + --show_parameter_stats_period=100 \ + --test_all_data_in_one_period=1 \ + --config_args=batch_size=100 \ + 2>&1 | tee 'train.log'_ diff --git a/website/docs/usage/index.jade b/website/docs/usage/index.jade index a4bb29d06..75ae641ae 100644 --- a/website/docs/usage/index.jade +++ b/website/docs/usage/index.jade @@ -7,7 +7,7 @@ p | runs on #[strong Unix/Linux], #[strong macOS/OS X] and | #[strong Windows]. The latest spaCy releases are currently only | available as source packages over - | #[+a("https://pypi.python.org/pypi/spacy") pip]. Installaton requires a + | #[+a("https://pypi.python.org/pypi/spacy") pip]. Installation requires a | working build environment. See notes on | #[a(href="#source-ubuntu") Ubuntu], #[a(href="#source-osx") macOS/OS X] | and #[a(href="#source-windows") Windows] for details. diff --git a/website/docs/usage/processing-text.jade b/website/docs/usage/processing-text.jade index 205986e8a..4bd6132d2 100644 --- a/website/docs/usage/processing-text.jade +++ b/website/docs/usage/processing-text.jade @@ -31,7 +31,8 @@ p p | I've tried to make sure that the #[code Language.__call__] function | doesn't do any "heavy lifting", so that you won't have complicated logic - | to replicate if you need to make your own pipeline class. This is all it | does. + | to replicate if you need to make your own pipeline class. This is all it + | does. p | The #[code .make_doc()] method and #[code .pipeline] attribute make it