diff --git a/examples/streamlit_spacy.py b/examples/streamlit_spacy.py new file mode 100644 index 000000000..f99f7bf18 --- /dev/null +++ b/examples/streamlit_spacy.py @@ -0,0 +1,153 @@ +# coding: utf-8 +""" +Example of a Streamlit app for an interactive spaCy model visualizer. You can +either download the script, or point streamlit run to the raw URL of this +file. For more details, see https://streamlit.io. + +Installation: +pip install streamlit +python -m spacy download en_core_web_sm +python -m spacy download en_core_web_md +python -m spacy download de_core_news_sm + +Usage: +streamlit run streamlit_spacy.py +""" +from __future__ import unicode_literals + +import streamlit as st +import spacy +from spacy import displacy +import pandas as pd + + +SPACY_MODEL_NAMES = ["en_core_web_sm", "en_core_web_md", "de_core_news_sm"] +DEFAULT_TEXT = "Mark Zuckerberg is the CEO of Facebook." +HTML_WRAPPER = """
{}
""" + + +@st.cache(ignore_hash=True) +def load_model(name): + return spacy.load(name) + + +@st.cache(ignore_hash=True) +def process_text(model_name, text): + nlp = load_model(model_name) + return nlp(text) + + +st.sidebar.title("Interactive spaCy visualizer") +st.sidebar.markdown( + """ +Process text with [spaCy](https://spacy.io) models and visualize named entities, +dependencies and more. Uses spaCy's built-in +[displaCy](http://spacy.io/usage/visualizers) visualizer under the hood. +""" +) + +spacy_model = st.sidebar.selectbox("Model name", SPACY_MODEL_NAMES) +model_load_state = st.info(f"Loading model '{spacy_model}'...") +nlp = load_model(spacy_model) +model_load_state.empty() + +text = st.text_area("Text to analyze", DEFAULT_TEXT) +doc = process_text(spacy_model, text) + +if "parser" in nlp.pipe_names: + st.header("Dependency Parse & Part-of-speech tags") + st.sidebar.header("Dependency Parse") + split_sents = st.sidebar.checkbox("Split sentences", value=True) + collapse_punct = st.sidebar.checkbox("Collapse punctuation", value=True) + collapse_phrases = st.sidebar.checkbox("Collapse phrases") + compact = st.sidebar.checkbox("Compact mode") + options = { + "collapse_punct": collapse_punct, + "collapse_phrases": collapse_phrases, + "compact": compact, + } + docs = [span.as_doc() for span in doc.sents] if split_sents else [doc] + for sent in docs: + html = displacy.render(sent, options=options) + # Double newlines seem to mess with the rendering + html = html.replace("\n\n", "\n") + if split_sents and len(docs) > 1: + st.markdown(f"> {sent.text}") + st.write(HTML_WRAPPER.format(html), unsafe_allow_html=True) + +if "ner" in nlp.pipe_names: + st.header("Named Entities") + st.sidebar.header("Named Entities") + default_labels = ["PERSON", "ORG", "GPE", "LOC"] + labels = st.sidebar.multiselect( + "Entity labels", nlp.get_pipe("ner").labels, default_labels + ) + html = displacy.render(doc, style="ent", options={"ents": labels}) + # Newlines seem to mess with the rendering + html = html.replace("\n", " ") + st.write(HTML_WRAPPER.format(html), unsafe_allow_html=True) + attrs = ["text", "label_", "start", "end", "start_char", "end_char"] + if "entity_linker" in nlp.pipe_names: + attrs.append("kb_id_") + data = [ + [str(getattr(ent, attr)) for attr in attrs] + for ent in doc.ents + if ent.label_ in labels + ] + df = pd.DataFrame(data, columns=attrs) + st.dataframe(df) + + +if "textcat" in nlp.pipe_names: + st.header("Text Classification") + st.markdown(f"> {text}") + df = pd.DataFrame(doc.cats.items(), columns=("Label", "Score")) + st.dataframe(df) + + +vector_size = nlp.meta.get("vectors", {}).get("width", 0) +if vector_size: + st.header("Vectors & Similarity") + st.code(nlp.meta["vectors"]) + text1 = st.text_input("Text or word 1", "apple") + text2 = st.text_input("Text or word 2", "orange") + doc1 = process_text(spacy_model, text1) + doc2 = process_text(spacy_model, text2) + similarity = doc1.similarity(doc2) + if similarity > 0.5: + st.success(similarity) + else: + st.error(similarity) + +st.header("Token attributes") + +if st.button("Show token attributes"): + attrs = [ + "idx", + "text", + "lemma_", + "pos_", + "tag_", + "dep_", + "head", + "ent_type_", + "ent_iob_", + "shape_", + "is_alpha", + "is_ascii", + "is_digit", + "is_punct", + "like_num", + ] + data = [[str(getattr(token, attr)) for attr in attrs] for token in doc] + df = pd.DataFrame(data, columns=attrs) + st.dataframe(df) + + +st.header("JSON Doc") +if st.button("Show JSON Doc"): + st.json(doc.to_json()) + +st.header("JSON model meta") +if st.button("Show JSON model meta"): + st.json(nlp.meta)