mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 17:36:30 +03:00
Merge branch 'master' of https://github.com/explosion/spaCy
This commit is contained in:
commit
d4680e4d83
88
examples/vectors_tensorboard_standalone.py
Normal file
88
examples/vectors_tensorboard_standalone.py
Normal file
|
@ -0,0 +1,88 @@
|
||||||
|
#!/usr/bin/env python
|
||||||
|
# coding: utf8
|
||||||
|
"""Export spaCy model vectors for use in TensorBoard's standalone embedding projector.
|
||||||
|
https://github.com/tensorflow/embedding-projector-standalone
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
|
||||||
|
python vectors_tensorboard_standalone.py ./myVectorModel ./output [name]
|
||||||
|
|
||||||
|
This outputs two files that have to be copied into the "oss_data" folder of the standalone projector:
|
||||||
|
|
||||||
|
[name]_labels.tsv - metadata such as human readable labels for vectors
|
||||||
|
[name]_tensors.bytes - numpy.ndarray of numpy.float32 precision vectors
|
||||||
|
|
||||||
|
"""
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
import json
|
||||||
|
import math
|
||||||
|
from os import path
|
||||||
|
|
||||||
|
import numpy
|
||||||
|
import plac
|
||||||
|
import spacy
|
||||||
|
import tqdm
|
||||||
|
|
||||||
|
|
||||||
|
@plac.annotations(
    vectors_loc=("Path to spaCy model that contains vectors", "positional", None, str),
    out_loc=("Path to output folder writing tensors and labels data", "positional", None, str),
    name=("Human readable name for tsv file and vectors tensor", "positional", None, str),
)
def main(vectors_loc, out_loc, name="spaCy_vectors"):
    """Export a spaCy model's vectors for the standalone embedding projector.

    Writes "[name]_labels.tsv" (one label/frequency row per vector) and
    "[name]_tensors.bytes" (the raw float32 matrix) into out_loc, then prints
    the JSON entry that has to be added to the projector's config file.

    vectors_loc: Location handed to spacy.load().
    out_loc: Folder that receives both output files; it is not created here.
    name: Prefix of the output files and the tensor name in the printed config.
    """
    # A tab-separated file that contains information about the vectors for visualization
    #
    # Learn more: https://www.tensorflow.org/programmers_guide/embedding#metadata
    meta_file = "{}_labels.tsv".format(name)
    out_meta_file = path.join(out_loc, meta_file)

    print('Loading spaCy vectors model: {}'.format(vectors_loc))
    model = spacy.load(vectors_loc)
    vocab = model.vocab

    print('Finding lexemes with vectors attached: {}'.format(vectors_loc))
    vocab_strings = [
        w for w in tqdm.tqdm(vocab.strings, total=len(vocab.strings), leave=False)
        if vocab.has_vector(w)
    ]
    vector_count = len(vocab_strings)

    print('Building Projector labels for {} vectors: {}'.format(vector_count, out_meta_file))
    vector_dimensions = vocab.vectors.shape[1]
    # One float32 row per exported string; rows are filled in the loop below.
    tf_vectors_variable = numpy.zeros((vector_count, vector_dimensions), dtype=numpy.float32)

    # Write a tab-separated file that contains information about the vectors for visualization
    #
    # Reference: https://www.tensorflow.org/programmers_guide/embedding#metadata
    with open(out_meta_file, 'wb') as file_metadata:
        # Define columns in the first row
        file_metadata.write("Text\tFrequency\n".encode('utf-8'))
        # Write out a row for each vector that we add to the tensorflow variable we created
        for vec_index, text in enumerate(tqdm.tqdm(vocab_strings, total=vector_count, leave=False)):
            # Look everything up with the original string. Only the label
            # written to the TSV is swapped for a placeholder, so that a
            # whitespace-only token keeps its own vector and probability.
            lex = vocab[text]

            # Store vector data and metadata
            tf_vectors_variable[vec_index] = vocab.get_vector(text)
            # https://github.com/tensorflow/tensorflow/issues/9094
            label = '<Space>' if text.lstrip() == '' else text
            file_metadata.write("{}\t{}\n".format(label, math.exp(lex.prob) * vector_count).encode('utf-8'))

    # Write out "[name]_tensors.bytes" file for standalone embeddings projector to load
    tensor_path = '{}_tensors.bytes'.format(name)
    tf_vectors_variable.tofile(path.join(out_loc, tensor_path))

    print('Done.')
    print('Add the following entry to "oss_data/oss_demo_projector_config.json"')
    print(json.dumps({
        "tensorName": name,
        "tensorShape": [vector_count, vector_dimensions],
        "tensorPath": 'oss_data/{}'.format(tensor_path),
        "metadataPath": 'oss_data/{}'.format(meta_file)
    }, indent=2))


if __name__ == '__main__':
    # Script entry point: plac maps the command-line arguments onto main's parameters.
    plac.call(main)
|
|
@ -1,7 +1,7 @@
|
||||||
# encoding: utf8
|
# encoding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from ...symbols import ORTH, LEMMA, POS, ADV, ADJ, NOUN
|
from ...symbols import ORTH, LEMMA, POS, ADV, ADJ, NOUN, ADP
|
||||||
|
|
||||||
|
|
||||||
_exc = {}
|
_exc = {}
|
||||||
|
@ -12,11 +12,24 @@ for exc_data in [
|
||||||
{ORTH: "mgr.", LEMMA: "magister", POS: NOUN},
|
{ORTH: "mgr.", LEMMA: "magister", POS: NOUN},
|
||||||
{ORTH: "tzn.", LEMMA: "to znaczy", POS: ADV},
|
{ORTH: "tzn.", LEMMA: "to znaczy", POS: ADV},
|
||||||
{ORTH: "tj.", LEMMA: "to jest", POS: ADV},
|
{ORTH: "tj.", LEMMA: "to jest", POS: ADV},
|
||||||
{ORTH: "tzw.", LEMMA: "tak zwany", POS: ADJ}]:
|
{ORTH: "tzw.", LEMMA: "tak zwany", POS: ADJ},
|
||||||
|
{ORTH: "adw.", LEMMA: "adwokat", POS: NOUN},
|
||||||
|
{ORTH: "afr.", LEMMA: "afrykański", POS: ADJ},
|
||||||
|
{ORTH: "c.b.d.o.", LEMMA: "co było do okazania", POS: ADV},
|
||||||
|
{ORTH: "cbdu.", LEMMA: "co było do udowodnienia", POS: ADV},
|
||||||
|
{ORTH: "mn.w.", LEMMA: "mniej więcej", POS: ADV},
|
||||||
|
{ORTH: "nt.", LEMMA: "na temat", POS: ADP},
|
||||||
|
{ORTH: "ok.", LEMMA: "około"},
|
||||||
|
{ORTH: "n.p.u.", LEMMA: "na psa urok"},
|
||||||
|
{ORTH: "ww.", LEMMA: "wyżej wymieniony", POS: ADV}]:
|
||||||
_exc[exc_data[ORTH]] = [exc_data]
|
_exc[exc_data[ORTH]] = [exc_data]
|
||||||
|
|
||||||
for orth in [
|
for orth in [
|
||||||
"w.", "r."]:
|
"w.", "r.", "br.", "bm.", "b.r.", "amer.", "am.", "bdb.", "św.", "p.", "lit.",
|
||||||
|
"wym.", "czyt.", "daw.", "d.", "zob.", "gw.", "dn.", "dyr.", "im.", "mł.",
|
||||||
|
"min.", "dot.", "muz.", "k.k.", "k.p.a.", "k.p.c.", "n.p.m.", "p.p.m.", "nb.",
|
||||||
|
"ob.", "n.e.", "p.n.e.", "zw.", "zool.", "zach.", "żarg.", "żart.", "wzgl.",
|
||||||
|
"wyj.", "xx.", "ks.", "x.", "wyd.", "wsch.", "o.o."]:
|
||||||
_exc[orth] = [{ORTH: orth}]
|
_exc[orth] = [{ORTH: orth}]
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user