From efe037e8be8bab1e626a9c5dd219ff35f71195a7 Mon Sep 17 00:00:00 2001 From: Jim O'Regan Date: Sat, 24 Mar 2018 00:05:27 +0000 Subject: [PATCH 1/2] more exceptions --- spacy/lang/pl/tokenizer_exceptions.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/spacy/lang/pl/tokenizer_exceptions.py b/spacy/lang/pl/tokenizer_exceptions.py index 269634671..aa3f55d22 100644 --- a/spacy/lang/pl/tokenizer_exceptions.py +++ b/spacy/lang/pl/tokenizer_exceptions.py @@ -1,7 +1,7 @@ # encoding: utf8 from __future__ import unicode_literals -from ...symbols import ORTH, LEMMA, POS, ADV, ADJ, NOUN +from ...symbols import ORTH, LEMMA, POS, ADV, ADJ, NOUN, ADP _exc = {} @@ -12,11 +12,24 @@ for exc_data in [ {ORTH: "mgr.", LEMMA: "magister", POS: NOUN}, {ORTH: "tzn.", LEMMA: "to znaczy", POS: ADV}, {ORTH: "tj.", LEMMA: "to jest", POS: ADV}, - {ORTH: "tzw.", LEMMA: "tak zwany", POS: ADJ}]: + {ORTH: "tzw.", LEMMA: "tak zwany", POS: ADJ}, + {ORTH: "adw.", LEMMA: "adwokat", POS: NOUN}, + {ORTH: "afr.", LEMMA: "afrykański", POS: ADJ}, + {ORTH: "c.b.d.o.", LEMMA: "co było do okazania", POS: ADV}, + {ORTH: "cbdu.", LEMMA: "co było do udowodnienia", POS: ADV}, + {ORTH: "mn.w.", LEMMA: "mniej więcej", POS: ADV}, + {ORTH: "nt.", LEMMA: "na temat", POS: ADP}, + {ORTH: "ok.", LEMMA: "około"}, + {ORTH: "n.p.u.", LEMMA: "na psa urok"}, + {ORTH: "ww.", LEMMA: "wyżej wymieniony", POS: ADV}]: _exc[exc_data[ORTH]] = [exc_data] for orth in [ - "w.", "r."]: + "w.", "r.", "br.", "bm.", "b.r.", "amer.", "am.", "bdb.", "św.", "p.", "lit.", + "wym.", "czyt.", "daw.", "d.", "zob.", "gw.", "dn.", "dyr.", "im.", "mł.", + "min.", "dot.", "muz.", "k.k.", "k.p.a.", "k.p.c.", "n.p.m.", "p.p.m.", "nb.", + "ob.", "n.e.", "p.n.e.", "zw.", "zool.", "zach.", "żarg.", "żart.", "wzgl.", + "wyj.", "xx.", "ks.", "x.", "wyd.", "wsch.", "o.o."]: _exc[orth] = [{ORTH: orth}] From 4eeb1788562a31af4e910a3288b5abcd55e4c870 Mon Sep 17 00:00:00 2001 From: Justin DuJardin Date: Sun, 25 Mar 2018 
21:50:13 -0700
Subject: [PATCH 2/2] Add example using TensorBoard standalone projector

 - the tensorboard standalone project expects a different set of files than the plugin to TensorFlow.
---
 examples/vectors_tensorboard_standalone.py | 104 ++++++++++++++++++++++
 1 file changed, 104 insertions(+)
 create mode 100644 examples/vectors_tensorboard_standalone.py

diff --git a/examples/vectors_tensorboard_standalone.py b/examples/vectors_tensorboard_standalone.py
new file mode 100644
index 000000000..7a9abf785
--- /dev/null
+++ b/examples/vectors_tensorboard_standalone.py
@@ -0,0 +1,104 @@
+#!/usr/bin/env python
+# coding: utf8
+"""Export spaCy model vectors for use in TensorBoard's standalone embedding projector.
+https://github.com/tensorflow/embedding-projector-standalone
+
+Usage:
+
+    python vectors_tensorboard_standalone.py ./myVectorModel ./output [name]
+
+This outputs two files that have to be copied into the "oss_data" of the standalone projector:
+
+    [name]_labels.tsv - metadata such as human readable labels for vectors
+    [name]_tensors.bytes - numpy.ndarray of numpy.float32 precision vectors
+
+"""
+from __future__ import unicode_literals
+
+import json
+import math
+from os import path
+
+import numpy
+import plac
+import spacy
+import tqdm
+
+
+def _projector_label(text):
+    """Return the text to write as the label cell for ``text``."""
+    # Whitespace-only strings would be blank labels in the projector, so they get
+    # a visible placeholder; see https://github.com/tensorflow/tensorflow/issues/9094
+    if text.strip() == '':
+        return '<Space>'
+    # A tab or line break inside a label would add a column or a row to the file
+    # and put the label rows out of step with the tensor rows
+    return text.replace('\t', ' ').replace('\n', ' ').replace('\r', ' ')
+
+
+@plac.annotations(
+    vectors_loc=("Path to spaCy model that contains vectors", "positional", None, str),
+    out_loc=("Path to output folder writing tensors and labels data", "positional", None, str),
+    name=("Human readable name for tsv file and vectors tensor", "positional", None, str),
+)
+def main(vectors_loc, out_loc, name="spaCy_vectors"):
+    """Export the vectors of a spaCy model as standalone projector files.
+
+    Loads the model at vectors_loc, writes "[name]_labels.tsv" and
+    "[name]_tensors.bytes" into out_loc and prints the JSON entry that
+    refers to both files for the projector config.
+    """
+    # A tab-separated file that contains information about the vectors for visualization
+    #
+    # Learn more: https://www.tensorflow.org/programmers_guide/embedding#metadata
+    meta_file = "{}_labels.tsv".format(name)
+    out_meta_file = path.join(out_loc, meta_file)
+
+    print('Loading spaCy vectors model: {}'.format(vectors_loc))
+    model = spacy.load(vectors_loc)
+
+    print('Finding lexemes with vectors attached: {}'.format(vectors_loc))
+    vocab_strings = [
+        w for w in tqdm.tqdm(model.vocab.strings, total=len(model.vocab.strings), leave=False)
+        if model.vocab.has_vector(w)
+    ]
+    vector_count = len(vocab_strings)
+
+    print('Building Projector labels for {} vectors: {}'.format(vector_count, out_meta_file))
+    vector_dimensions = model.vocab.vectors.shape[1]
+    # One float32 row per exported string, filled in the same order as the label rows
+    tf_vectors_variable = numpy.zeros((vector_count, vector_dimensions), dtype=numpy.float32)
+
+    # Write a tab-separated file that contains information about the vectors for visualization
+    #
+    # Reference: https://www.tensorflow.org/programmers_guide/embedding#metadata
+    with open(out_meta_file, 'wb') as file_metadata:
+        # Define columns in the first row
+        file_metadata.write("Text\tFrequency\n".encode('utf-8'))
+        # Write out a row for each vector that we add to the tensorflow variable we created
+        for vec_index, text in enumerate(tqdm.tqdm(vocab_strings, total=vector_count, leave=False)):
+            # Look up the lexeme and the vector under the original string; only the
+            # label that goes into the file is rewritten by _projector_label()
+            lex = model.vocab[text]
+            tf_vectors_variable[vec_index] = model.vocab.get_vector(text)
+            # NOTE(review): exp() presumes lex.prob is a log probability -- confirm
+            frequency = math.exp(lex.prob) * vector_count
+            row = "{}\t{}\n".format(_projector_label(text), frequency)
+            file_metadata.write(row.encode('utf-8'))
+
+    # Write out "[name]_tensors.bytes" file for standalone embeddings projector to load
+    tensor_path = '{}_tensors.bytes'.format(name)
+    tf_vectors_variable.tofile(path.join(out_loc, tensor_path))
+
+    print('Done.')
+    print('Add the following entry to "oss_data/oss_demo_projector_config.json"')
+    print(json.dumps({
+        "tensorName": name,
+        "tensorShape": [vector_count, vector_dimensions],
+        "tensorPath": 'oss_data/{}'.format(tensor_path),
+        "metadataPath": 'oss_data/{}'.format(meta_file)
+    }, indent=2))
+
+
+if __name__ == '__main__':
+    plac.call(main)