Spaces:
Build error
Build error
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,28 +1,107 @@
|
|
| 1 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
if 'data' not in st.session_state:
|
| 3 |
st.session_state.data = None
|
| 4 |
if 'processed_data' not in st.session_state:
|
| 5 |
st.session_state.processed_data = None
|
| 6 |
-
if '
|
| 7 |
-
st.session_state.
|
|
|
|
|
|
|
| 8 |
if 'x_var' not in st.session_state:
|
| 9 |
st.session_state.x_var = None
|
| 10 |
if 'y_var' not in st.session_state:
|
| 11 |
st.session_state.y_var = None
|
| 12 |
-
if '
|
| 13 |
-
st.session_state.
|
| 14 |
|
| 15 |
-
|
| 16 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
|
| 18 |
-
|
|
|
|
| 19 |
if data[col].nunique() <= 10: # κ³ μ κ°μ΄ 10κ° μ΄νμΈ κ²½μ°μλ§ μ¬λΌμ΄μ μμ±
|
| 20 |
-
if col not in st.session_state.slicers:
|
| 21 |
-
st.session_state.slicers[col] = sorted(data[col].unique())
|
| 22 |
st.session_state.slicers[col] = st.multiselect(
|
| 23 |
f"{col} μ ν",
|
| 24 |
options=sorted(data[col].unique()),
|
| 25 |
-
default=
|
| 26 |
)
|
| 27 |
|
| 28 |
def apply_slicers(data):
|
|
@@ -31,6 +110,46 @@ def apply_slicers(data):
|
|
| 31 |
data = data[data[col].isin(selected_values)]
|
| 32 |
return data
|
| 33 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
def perform_analysis(data):
|
| 35 |
st.header("νμμ λ°μ΄ν° λΆμ")
|
| 36 |
|
|
@@ -43,82 +162,40 @@ def perform_analysis(data):
|
|
| 43 |
st.write(filtered_data.describe())
|
| 44 |
|
| 45 |
# μκ΄κ΄κ³ ννΈλ§΅
|
| 46 |
-
st.
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
fig = px.imshow(numeric_data.corr(), color_continuous_scale='RdBu_r', zmin=-1, zmax=1)
|
| 50 |
-
fig.update_layout(title='μκ΄κ΄κ³ ννΈλ§΅')
|
| 51 |
-
st.plotly_chart(fig)
|
| 52 |
-
else:
|
| 53 |
-
st.write("μκ΄κ΄κ³ ννΈλ§΅μ 그릴 μ μλ μ«μν μ΄μ΄ μμ΅λλ€.")
|
| 54 |
-
|
| 55 |
# μ¬μ©μκ° μ νν λ λ³μμ λν μ°μ λ λ° νκ· λΆμ
|
| 56 |
st.subheader("λ λ³μ κ°μ κ΄κ³ λΆμ")
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
st.session_state.x_var = st.selectbox("XμΆ λ³μ μ ν", options=numeric_columns, key='x_var_select', index=numeric_columns.get_loc(st.session_state.x_var) if st.session_state.x_var in numeric_columns else 0)
|
| 60 |
-
y_options = [col for col in numeric_columns if col != st.session_state.x_var]
|
| 61 |
-
st.session_state.y_var = st.selectbox("YμΆ λ³μ μ ν", options=y_options, key='y_var_select', index=y_options.index(st.session_state.y_var) if st.session_state.y_var in y_options else 0)
|
| 62 |
|
| 63 |
-
if
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
# νκ·μ μΆκ°
|
| 67 |
-
x = filtered_data[st.session_state.x_var]
|
| 68 |
-
y = filtered_data[st.session_state.y_var]
|
| 69 |
-
slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
|
| 70 |
-
line_x = np.array([x.min(), x.max()])
|
| 71 |
-
line_y = slope * line_x + intercept
|
| 72 |
-
fig.add_trace(go.Scatter(x=line_x, y=line_y, mode='lines', name='νκ·μ '))
|
| 73 |
-
|
| 74 |
-
r_squared = r_value ** 2
|
| 75 |
-
fig.update_layout(
|
| 76 |
-
title=f'{st.session_state.x_var}μ {st.session_state.y_var}μ κ΄κ³ (R-squared: {r_squared:.4f})',
|
| 77 |
-
xaxis_title=st.session_state.x_var,
|
| 78 |
-
yaxis_title=st.session_state.y_var,
|
| 79 |
-
annotations=[
|
| 80 |
-
dict(
|
| 81 |
-
x=0.5,
|
| 82 |
-
y=1.05,
|
| 83 |
-
xref='paper',
|
| 84 |
-
yref='paper',
|
| 85 |
-
text=f'R-squared: {r_squared:.4f}',
|
| 86 |
-
showarrow=False,
|
| 87 |
-
)
|
| 88 |
-
]
|
| 89 |
-
)
|
| 90 |
-
st.plotly_chart(fig)
|
| 91 |
-
|
| 92 |
-
# μΆκ° ν΅κ³ μ 보
|
| 93 |
-
st.write(f"μκ΄κ³μ: {r_value:.4f}")
|
| 94 |
-
st.write(f"p-value: {p_value:.4f}")
|
| 95 |
-
st.write(f"νμ€ μ€μ°¨: {std_err:.4f}")
|
| 96 |
-
|
| 97 |
-
st.session_state.analysis_performed = True
|
| 98 |
|
| 99 |
def main():
|
| 100 |
st.title("μΈν°λν°λΈ EDA ν΄ν·")
|
|
|
|
|
|
|
| 101 |
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
if
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
uploaded_file = st.file_uploader("CSV, XLS, λλ XLSX νμΌμ μ ννμΈμ", type=["csv", "xls", "xlsx"])
|
| 109 |
-
if uploaded_file is not None:
|
| 110 |
-
st.session_state.data = load_data(uploaded_file)
|
| 111 |
else:
|
| 112 |
-
st.session_state.data =
|
|
|
|
|
|
|
| 113 |
|
| 114 |
if st.session_state.data is not None:
|
| 115 |
st.subheader("λ°μ΄ν° 미리보기 λ° μμ ")
|
| 116 |
st.write("λ°μ΄ν°λ₯Ό νμΈνκ³ νμν κ²½μ° μμ νμΈμ:")
|
| 117 |
edited_data = st.data_editor(st.session_state.data, num_rows="dynamic")
|
| 118 |
|
| 119 |
-
if st.button("λ°μ΄ν° λΆμ μμ")
|
| 120 |
-
|
| 121 |
-
st.session_state.processed_data = preprocess_data(edited_data)
|
| 122 |
perform_analysis(st.session_state.processed_data)
|
| 123 |
|
| 124 |
if __name__ == "__main__":
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import numpy as np
|
| 4 |
+
import plotly.express as px
|
| 5 |
+
import plotly.graph_objects as go
|
| 6 |
+
from io import StringIO
|
| 7 |
+
import openpyxl
|
| 8 |
+
import matplotlib.font_manager as fm
|
| 9 |
+
from scipy import stats
|
| 10 |
+
|
| 11 |
+
# νκΈ ν°νΈ μ€μ
|
| 12 |
+
def set_font():
|
| 13 |
+
font_path = "Pretendard-Bold.ttf" # μ€μ ν°νΈ νμΌ κ²½λ‘λ‘ λ³κ²½ν΄μ£ΌμΈμ
|
| 14 |
+
fm.fontManager.addfont(font_path)
|
| 15 |
+
return {'font.family': 'Pretendard-Bold', 'axes.unicode_minus': False}
|
| 16 |
+
|
| 17 |
+
# ν°νΈ μ€μ μ κ°μ Έμ΅λλ€
|
| 18 |
+
font_settings = set_font()
|
| 19 |
+
|
| 20 |
+
# μΈμ
μν μ΄κΈ°ν
|
| 21 |
+
def init_session_state():
|
| 22 |
if 'data' not in st.session_state:
|
| 23 |
st.session_state.data = None
|
| 24 |
if 'processed_data' not in st.session_state:
|
| 25 |
st.session_state.processed_data = None
|
| 26 |
+
if 'numeric_columns' not in st.session_state:
|
| 27 |
+
st.session_state.numeric_columns = []
|
| 28 |
+
if 'categorical_columns' not in st.session_state:
|
| 29 |
+
st.session_state.categorical_columns = []
|
| 30 |
if 'x_var' not in st.session_state:
|
| 31 |
st.session_state.x_var = None
|
| 32 |
if 'y_var' not in st.session_state:
|
| 33 |
st.session_state.y_var = None
|
| 34 |
+
if 'slicers' not in st.session_state:
|
| 35 |
+
st.session_state.slicers = {}
|
| 36 |
|
| 37 |
+
# λ°μ΄ν° λ‘λ
|
| 38 |
+
@st.cache_data
|
| 39 |
+
def load_data(file):
|
| 40 |
+
file_extension = file.name.split('.')[-1].lower()
|
| 41 |
+
if file_extension == 'csv':
|
| 42 |
+
data = pd.read_csv(file)
|
| 43 |
+
elif file_extension in ['xls', 'xlsx']:
|
| 44 |
+
data = pd.read_excel(file)
|
| 45 |
+
else:
|
| 46 |
+
st.error("μ§μλμ§ μλ νμΌ νμμ
λλ€. CSV, XLS, λλ XLSX νμΌμ μ
λ‘λν΄μ£ΌμΈμ.")
|
| 47 |
+
return None
|
| 48 |
+
return data
|
| 49 |
+
|
| 50 |
+
def manual_data_entry():
|
| 51 |
+
st.subheader("μλ λ°μ΄ν° μ
λ ₯")
|
| 52 |
+
col_names = st.text_input("μ΄ μ΄λ¦μ μΌνλ‘ κ΅¬λΆνμ¬ μ
λ ₯νμΈμ:").split(',')
|
| 53 |
+
col_names = [name.strip() for name in col_names if name.strip()]
|
| 54 |
+
|
| 55 |
+
if col_names:
|
| 56 |
+
num_rows = st.number_input("μ΄κΈ° νμ μλ₯Ό μ
λ ₯νμΈμ:", min_value=1, value=5)
|
| 57 |
+
data = pd.DataFrame(columns=col_names, index=range(num_rows))
|
| 58 |
+
|
| 59 |
+
edited_data = st.data_editor(data, num_rows="dynamic")
|
| 60 |
+
|
| 61 |
+
return edited_data
|
| 62 |
+
return None
|
| 63 |
+
|
| 64 |
+
def preprocess_data(data):
|
| 65 |
+
st.subheader("λ°μ΄ν° μ μ²λ¦¬")
|
| 66 |
+
|
| 67 |
+
# κ²°μΈ‘μΉ μ²λ¦¬
|
| 68 |
+
if data.isnull().sum().sum() > 0:
|
| 69 |
+
st.write("κ²°μΈ‘μΉ μ²λ¦¬:")
|
| 70 |
+
for column in data.columns:
|
| 71 |
+
if data[column].isnull().sum() > 0:
|
| 72 |
+
method = st.selectbox(f"{column} μ΄μ μ²λ¦¬ λ°©λ² μ ν:",
|
| 73 |
+
["μ κ±°", "νκ· μΌλ‘ λ체", "μ€μκ°μΌλ‘ λ체", "μ΅λΉκ°μΌλ‘ λ체"])
|
| 74 |
+
if method == "μ κ±°":
|
| 75 |
+
data = data.dropna(subset=[column])
|
| 76 |
+
elif method == "νκ· μΌλ‘ λ체":
|
| 77 |
+
data[column].fillna(data[column].mean(), inplace=True)
|
| 78 |
+
elif method == "μ€μκ°μΌλ‘ λ체":
|
| 79 |
+
data[column].fillna(data[column].median(), inplace=True)
|
| 80 |
+
elif method == "μ΅λΉκ°μΌλ‘ λ체":
|
| 81 |
+
data[column].fillna(data[column].mode()[0], inplace=True)
|
| 82 |
+
|
| 83 |
+
# λ°μ΄ν° νμ
λ³ν
|
| 84 |
+
for column in data.columns:
|
| 85 |
+
if data[column].dtype == 'object':
|
| 86 |
+
try:
|
| 87 |
+
data[column] = pd.to_numeric(data[column])
|
| 88 |
+
st.write(f"{column} μ΄μ μ«μνμΌλ‘ λ³ννμ΅λλ€.")
|
| 89 |
+
except ValueError:
|
| 90 |
+
st.write(f"{column} μ΄μ λ²μ£ΌνμΌλ‘ μ μ§λ©λλ€.")
|
| 91 |
+
|
| 92 |
+
# μ«μν μ΄κ³Ό λ²μ£Όν μ΄ λΆλ¦¬
|
| 93 |
+
st.session_state.numeric_columns = data.select_dtypes(include=['float64', 'int64']).columns.tolist()
|
| 94 |
+
st.session_state.categorical_columns = data.select_dtypes(include=['object']).columns.tolist()
|
| 95 |
+
|
| 96 |
+
return data
|
| 97 |
|
| 98 |
+
def create_slicers(data):
|
| 99 |
+
for col in st.session_state.categorical_columns:
|
| 100 |
if data[col].nunique() <= 10: # κ³ μ κ°μ΄ 10κ° μ΄νμΈ κ²½μ°μλ§ μ¬λΌμ΄μ μμ±
|
|
|
|
|
|
|
| 101 |
st.session_state.slicers[col] = st.multiselect(
|
| 102 |
f"{col} μ ν",
|
| 103 |
options=sorted(data[col].unique()),
|
| 104 |
+
default=sorted(data[col].unique())
|
| 105 |
)
|
| 106 |
|
| 107 |
def apply_slicers(data):
|
|
|
|
| 110 |
data = data[data[col].isin(selected_values)]
|
| 111 |
return data
|
| 112 |
|
| 113 |
+
def plot_correlation_heatmap(data):
|
| 114 |
+
corr = data[st.session_state.numeric_columns].corr()
|
| 115 |
+
fig = px.imshow(corr, color_continuous_scale='RdBu_r', zmin=-1, zmax=1)
|
| 116 |
+
fig.update_layout(title='μκ΄κ΄κ³ ννΈλ§΅')
|
| 117 |
+
st.plotly_chart(fig)
|
| 118 |
+
|
| 119 |
+
def plot_scatter_with_regression(data, x_var, y_var):
|
| 120 |
+
fig = px.scatter(data, x=x_var, y=y_var, color='λ°' if 'λ°' in data.columns else None)
|
| 121 |
+
|
| 122 |
+
# νκ·μ μΆκ°
|
| 123 |
+
x = data[x_var]
|
| 124 |
+
y = data[y_var]
|
| 125 |
+
slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
|
| 126 |
+
line_x = np.array([x.min(), x.max()])
|
| 127 |
+
line_y = slope * line_x + intercept
|
| 128 |
+
fig.add_trace(go.Scatter(x=line_x, y=line_y, mode='lines', name='νκ·μ '))
|
| 129 |
+
|
| 130 |
+
r_squared = r_value ** 2
|
| 131 |
+
fig.update_layout(
|
| 132 |
+
title=f'{x_var}μ {y_var}μ κ΄κ³ (R-squared: {r_squared:.4f})',
|
| 133 |
+
xaxis_title=x_var,
|
| 134 |
+
yaxis_title=y_var,
|
| 135 |
+
annotations=[
|
| 136 |
+
dict(
|
| 137 |
+
x=0.5,
|
| 138 |
+
y=1.05,
|
| 139 |
+
xref='paper',
|
| 140 |
+
yref='paper',
|
| 141 |
+
text=f'R-squared: {r_squared:.4f}',
|
| 142 |
+
showarrow=False,
|
| 143 |
+
)
|
| 144 |
+
]
|
| 145 |
+
)
|
| 146 |
+
st.plotly_chart(fig)
|
| 147 |
+
|
| 148 |
+
# μΆκ° ν΅κ³ μ 보
|
| 149 |
+
st.write(f"μκ΄κ³μ: {r_value:.4f}")
|
| 150 |
+
st.write(f"p-value: {p_value:.4f}")
|
| 151 |
+
st.write(f"νμ€ μ€μ°¨: {std_err:.4f}")
|
| 152 |
+
|
| 153 |
def perform_analysis(data):
|
| 154 |
st.header("νμμ λ°μ΄ν° λΆμ")
|
| 155 |
|
|
|
|
| 162 |
st.write(filtered_data.describe())
|
| 163 |
|
| 164 |
# μκ΄κ΄κ³ ννΈλ§΅
|
| 165 |
+
st.subheader("μκ΄κ΄κ³ ννΈλ§΅")
|
| 166 |
+
plot_correlation_heatmap(filtered_data)
|
| 167 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 168 |
# μ¬μ©μκ° μ νν λ λ³μμ λν μ°μ λ λ° νκ· λΆμ
|
| 169 |
st.subheader("λ λ³μ κ°μ κ΄κ³ λΆμ")
|
| 170 |
+
x_var = st.selectbox("XμΆ λ³μ μ ν", options=st.session_state.numeric_columns, key='x_var')
|
| 171 |
+
y_var = st.selectbox("YμΆ λ³μ μ ν", options=[col for col in st.session_state.numeric_columns if col != x_var], key='y_var')
|
|
|
|
|
|
|
|
|
|
| 172 |
|
| 173 |
+
if x_var and y_var:
|
| 174 |
+
plot_scatter_with_regression(filtered_data, x_var, y_var)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 175 |
|
| 176 |
def main():
|
| 177 |
st.title("μΈν°λν°λΈ EDA ν΄ν·")
|
| 178 |
+
|
| 179 |
+
init_session_state()
|
| 180 |
|
| 181 |
+
data_input_method = st.radio("λ°μ΄ν° μ
λ ₯ λ°©λ² μ ν:", ("νμΌ μ
λ‘λ", "μλ μ
λ ₯"))
|
| 182 |
+
|
| 183 |
+
if data_input_method == "νμΌ μ
λ‘λ":
|
| 184 |
+
uploaded_file = st.file_uploader("CSV, XLS, λλ XLSX νμΌμ μ ννμΈμ", type=["csv", "xls", "xlsx"])
|
| 185 |
+
if uploaded_file is not None:
|
| 186 |
+
st.session_state.data = load_data(uploaded_file)
|
|
|
|
|
|
|
|
|
|
| 187 |
else:
|
| 188 |
+
st.session_state.data = None
|
| 189 |
+
else:
|
| 190 |
+
st.session_state.data = manual_data_entry()
|
| 191 |
|
| 192 |
if st.session_state.data is not None:
|
| 193 |
st.subheader("λ°μ΄ν° 미리보기 λ° μμ ")
|
| 194 |
st.write("λ°μ΄ν°λ₯Ό νμΈνκ³ νμν κ²½μ° μμ νμΈμ:")
|
| 195 |
edited_data = st.data_editor(st.session_state.data, num_rows="dynamic")
|
| 196 |
|
| 197 |
+
if st.button("λ°μ΄ν° λΆμ μμ"):
|
| 198 |
+
st.session_state.processed_data = preprocess_data(edited_data)
|
|
|
|
| 199 |
perform_analysis(st.session_state.processed_data)
|
| 200 |
|
| 201 |
if __name__ == "__main__":
|