mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 09:57:26 +03:00 
			
		
		
		
	
		
			
				
	
	
		
			166 lines
		
	
	
		
			4.9 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			166 lines
		
	
	
		
			4.9 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
# coding: utf-8
 | 
						|
"""
 | 
						|
Example of a Streamlit app for an interactive spaCy model visualizer. You can
 | 
						|
either download the script, or point `streamlit run` to the raw URL of this
 | 
						|
file. For more details, see https://streamlit.io.
 | 
						|
 | 
						|
Installation:
 | 
						|
pip install streamlit
 | 
						|
python -m spacy download en_core_web_sm
 | 
						|
python -m spacy download en_core_web_md
 | 
						|
python -m spacy download de_core_news_sm
 | 
						|
 | 
						|
Usage:
 | 
						|
streamlit run streamlit_spacy.py
 | 
						|
"""
 | 
						|
from __future__ import unicode_literals
 | 
						|
 | 
						|
import base64
 | 
						|
 | 
						|
import streamlit as st
 | 
						|
import spacy
 | 
						|
from spacy import displacy
 | 
						|
import pandas as pd
 | 
						|
 | 
						|
 | 
						|
SPACY_MODEL_NAMES = ["en_core_web_sm", "en_core_web_md", "de_core_news_sm"]
 | 
						|
DEFAULT_TEXT = "Mark Zuckerberg is the CEO of Facebook."
 | 
						|
HTML_WRAPPER = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem; margin-bottom: 2.5rem">{}</div>"""
 | 
						|
 | 
						|
 | 
						|
@st.cache(allow_output_mutation=True)
 | 
						|
def load_model(name):
 | 
						|
    return spacy.load(name)
 | 
						|
 | 
						|
 | 
						|
@st.cache(allow_output_mutation=True)
 | 
						|
def process_text(model_name, text):
 | 
						|
    nlp = load_model(model_name)
 | 
						|
    return nlp(text)
 | 
						|
 | 
						|
 | 
						|
st.sidebar.title("Interactive spaCy visualizer")
 | 
						|
st.sidebar.markdown(
 | 
						|
    """
 | 
						|
Process text with [spaCy](https://spacy.io) models and visualize named entities,
 | 
						|
dependencies and more. Uses spaCy's built-in
 | 
						|
[displaCy](http://spacy.io/usage/visualizers) visualizer under the hood.
 | 
						|
"""
 | 
						|
)
 | 
						|
 | 
						|
spacy_model = st.sidebar.selectbox("Model name", SPACY_MODEL_NAMES)
 | 
						|
model_load_state = st.info(f"Loading model '{spacy_model}'...")
 | 
						|
nlp = load_model(spacy_model)
 | 
						|
model_load_state.empty()
 | 
						|
 | 
						|
text = st.text_area("Text to analyze", DEFAULT_TEXT)
 | 
						|
doc = process_text(spacy_model, text)
 | 
						|
 | 
						|
 | 
						|
def render_svg(svg):
 | 
						|
    """Renders the given svg string."""
 | 
						|
    b64 = base64.b64encode(svg.encode('utf-8')).decode("utf-8")
 | 
						|
    html = r'<img src="data:image/svg+xml;base64,%s"/>' % b64
 | 
						|
    st.write(html, unsafe_allow_html=True)
 | 
						|
 | 
						|
 | 
						|
if "parser" in nlp.pipe_names:
 | 
						|
    st.header("Dependency Parse & Part-of-speech tags")
 | 
						|
    st.sidebar.header("Dependency Parse")
 | 
						|
    split_sents = st.sidebar.checkbox("Split sentences", value=True)
 | 
						|
    collapse_punct = st.sidebar.checkbox("Collapse punctuation", value=True)
 | 
						|
    collapse_phrases = st.sidebar.checkbox("Collapse phrases")
 | 
						|
    compact = st.sidebar.checkbox("Compact mode")
 | 
						|
    options = {
 | 
						|
        "collapse_punct": collapse_punct,
 | 
						|
        "collapse_phrases": collapse_phrases,
 | 
						|
        "compact": compact,
 | 
						|
    }
 | 
						|
    docs = [span.as_doc() for span in doc.sents] if split_sents else [doc]
 | 
						|
    for sent in docs:
 | 
						|
        html = displacy.render(sent, options=options, style="dep")
 | 
						|
        # Double newlines seem to mess with the rendering
 | 
						|
        html = html.replace("\n\n", "\n")
 | 
						|
        if split_sents and len(docs) > 1:
 | 
						|
            st.markdown(f"> {sent.text}")
 | 
						|
        render_svg(html)
 | 
						|
        # this didn't show the dep arc labels properly, cf #5089
 | 
						|
        # st.write(HTML_WRAPPER.format(html), unsafe_allow_html=True)
 | 
						|
 | 
						|
if "ner" in nlp.pipe_names:
 | 
						|
    st.header("Named Entities")
 | 
						|
    st.sidebar.header("Named Entities")
 | 
						|
    label_set = nlp.get_pipe("ner").labels
 | 
						|
    labels = st.sidebar.multiselect(
 | 
						|
        "Entity labels", options=label_set, default=list(label_set)
 | 
						|
    )
 | 
						|
    html = displacy.render(doc, style="ent", options={"ents": labels})
 | 
						|
    # Newlines seem to mess with the rendering
 | 
						|
    html = html.replace("\n", " ")
 | 
						|
    st.write(HTML_WRAPPER.format(html), unsafe_allow_html=True)
 | 
						|
    attrs = ["text", "label_", "start", "end", "start_char", "end_char"]
 | 
						|
    if "entity_linker" in nlp.pipe_names:
 | 
						|
        attrs.append("kb_id_")
 | 
						|
    data = [
 | 
						|
        [str(getattr(ent, attr)) for attr in attrs]
 | 
						|
        for ent in doc.ents
 | 
						|
        if ent.label_ in labels
 | 
						|
    ]
 | 
						|
    df = pd.DataFrame(data, columns=attrs)
 | 
						|
    st.dataframe(df)
 | 
						|
 | 
						|
 | 
						|
if "textcat" in nlp.pipe_names:
 | 
						|
    st.header("Text Classification")
 | 
						|
    st.markdown(f"> {text}")
 | 
						|
    df = pd.DataFrame(doc.cats.items(), columns=("Label", "Score"))
 | 
						|
    st.dataframe(df)
 | 
						|
 | 
						|
 | 
						|
vector_size = nlp.meta.get("vectors", {}).get("width", 0)
 | 
						|
if vector_size:
 | 
						|
    st.header("Vectors & Similarity")
 | 
						|
    st.code(nlp.meta["vectors"])
 | 
						|
    text1 = st.text_input("Text or word 1", "apple")
 | 
						|
    text2 = st.text_input("Text or word 2", "orange")
 | 
						|
    doc1 = process_text(spacy_model, text1)
 | 
						|
    doc2 = process_text(spacy_model, text2)
 | 
						|
    similarity = doc1.similarity(doc2)
 | 
						|
    if similarity > 0.5:
 | 
						|
        st.success(similarity)
 | 
						|
    else:
 | 
						|
        st.error(similarity)
 | 
						|
 | 
						|
st.header("Token attributes")
 | 
						|
 | 
						|
if st.button("Show token attributes"):
 | 
						|
    attrs = [
 | 
						|
        "idx",
 | 
						|
        "text",
 | 
						|
        "lemma_",
 | 
						|
        "pos_",
 | 
						|
        "tag_",
 | 
						|
        "dep_",
 | 
						|
        "head",
 | 
						|
        "ent_type_",
 | 
						|
        "ent_iob_",
 | 
						|
        "shape_",
 | 
						|
        "is_alpha",
 | 
						|
        "is_ascii",
 | 
						|
        "is_digit",
 | 
						|
        "is_punct",
 | 
						|
        "like_num",
 | 
						|
    ]
 | 
						|
    data = [[str(getattr(token, attr)) for attr in attrs] for token in doc]
 | 
						|
    df = pd.DataFrame(data, columns=attrs)
 | 
						|
    st.dataframe(df)
 | 
						|
 | 
						|
 | 
						|
st.header("JSON Doc")
 | 
						|
if st.button("Show JSON Doc"):
 | 
						|
    st.json(doc.to_json())
 | 
						|
 | 
						|
st.header("JSON model meta")
 | 
						|
if st.button("Show JSON model meta"):
 | 
						|
    st.json(nlp.meta)
 |