Update app.py
Browse files
app.py
CHANGED
|
@@ -5,45 +5,34 @@ import os
|
|
| 5 |
HF_TOKEN = os.environ.get("HF_TOKEN", None)
|
| 6 |
|
| 7 |
st.set_page_config(page_title="FW Clusters inspection", layout="wide")
|
| 8 |
-
st.title("FW clusters inspection (
|
| 9 |
|
| 10 |
st.markdown("""
|
| 11 |
We clustered 100k FineWeb samples using [text-clustering](https://github.com/huggingface/text-clustering).
|
| 12 |
|
| 13 |
-
Our approach involved prompting Mixtral to evaluate whether the topics in each cluster are educational or could be considered college material.
|
| 14 |
|
| 15 |
-
Additionally, the model was tasked with
|
| 16 |
-
|
| 17 |
-
Sometimes, the model may define its own category. This can happen either within the context of AFAIK topics or separately. Hence the `Select Category Type` dropdown in our interface.
|
| 18 |
""")
|
| 19 |
|
| 20 |
@st.cache_data
|
| 21 |
-
def load_data(educational_topic):
|
| 22 |
-
ds = load_dataset("HuggingFaceTB/
|
| 23 |
-
|
| 24 |
-
ds = ds.filter(lambda x: x['is_topic_educational'] == educational_topic)
|
| 25 |
return ds
|
| 26 |
|
| 27 |
-
@st.cache_data
|
| 28 |
-
def get_categories_by_type(_ds, category_type):
|
| 29 |
-
filtered_ds = _ds.filter(lambda x: x['category_type'] == category_type)
|
| 30 |
-
return list(set(filtered_ds['category']))
|
| 31 |
-
|
| 32 |
|
| 33 |
st.subheader("Cluster information")
|
| 34 |
-
|
| 35 |
-
with
|
| 36 |
-
|
|
|
|
|
|
|
| 37 |
|
| 38 |
-
ds = load_data(educational_topic)
|
|
|
|
| 39 |
|
| 40 |
-
|
| 41 |
-
category_types = ['afaik', 'defined_by_llm', 'defined_by_llm_under_afaik']
|
| 42 |
-
default_index = 0 if educational_topic == "Yes" else 1
|
| 43 |
-
selected_category_type = st.selectbox("Select Category Type", category_types, index=default_index)
|
| 44 |
-
with col_3:
|
| 45 |
-
categories = get_categories_by_type(ds, selected_category_type)
|
| 46 |
-
selected_category = st.selectbox("Select Category", categories)
|
| 47 |
|
| 48 |
selected_cluster = ds.filter(lambda x: x['category'] == selected_category)
|
| 49 |
|
|
|
|
| 5 |
HF_TOKEN = os.environ.get("HF_TOKEN", None)
|
| 6 |
|
| 7 |
st.set_page_config(page_title="FW Clusters inspection", layout="wide")
|
| 8 |
+
st.title("FW clusters inspection (free topics)")
|
| 9 |
|
| 10 |
st.markdown("""
|
| 11 |
We clustered 100k FineWeb samples using [text-clustering](https://github.com/huggingface/text-clustering).
|
| 12 |
|
| 13 |
+
Our approach involved prompting Mixtral to evaluate whether the topics in each cluster are educational or could be considered college material using a score from 1 to 10.
|
| 14 |
|
| 15 |
+
Additionally, the model was tasked with finding the topic of each cluster.
|
|
|
|
|
|
|
| 16 |
""")
|
| 17 |
|
| 18 |
@st.cache_data
def load_data(educational_topic, min_score=1, max_score=10):
    """Load the free-topics cluster dataset restricted to a score range.

    Reads the ``train`` split of ``HuggingFaceTB/FW_clusters_free_topics``
    (authenticated with ``HF_TOKEN``) and keeps only the rows whose
    ``educational_score`` lies between ``min_score`` and ``max_score``,
    both bounds included.

    Args:
        educational_topic: Not used by the body; kept in the signature so
            existing callers keep working.
        min_score: Lowest ``educational_score`` to keep (inclusive).
        max_score: Highest ``educational_score`` to keep (inclusive).

    Returns:
        The filtered dataset.
    """
    def _score_in_range(row):
        # Inclusive on both ends, same as the two-sided comparison it replaces.
        return min_score <= row['educational_score'] <= max_score

    clusters = load_dataset(
        "HuggingFaceTB/FW_clusters_free_topics",
        split="train",
        token=HF_TOKEN,
        num_proc=2,
    )
    return clusters.filter(_score_in_range)
|
| 23 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
|
| 25 |
st.subheader("Cluster information")
|
| 26 |
+
min_score, max_score = st.columns(2)
|
| 27 |
+
with min_score:
|
| 28 |
+
min_value = st.slider('Select minimum educational score', 1, 10, 1)
|
| 29 |
+
with max_score:
|
| 30 |
+
max_value = st.slider('Select maximum educational score', 1, 10, 10)
|
| 31 |
|
| 32 |
+
# Pass the slider values (min_value / max_value), not min_score / max_score:
# those two names hold the st.columns containers the sliders are drawn in,
# while load_data compares its bounds against each row's educational_score.
# NOTE(review): educational_topic is not assigned in the visible part of this
# file; load_data ignores it, but confirm it is defined before this call.
ds = load_data(educational_topic, min_value, max_value)
|
| 33 |
+
categories = list(set(ds["category"]))
|
| 34 |
|
| 35 |
+
# Bind the chosen topic to `selected_category`, the name the cluster filter
# that follows reads; the previous binding left that name unassigned.
selected_category = st.selectbox("Select a topic", categories)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
|
| 37 |
selected_cluster = ds.filter(lambda x: x['category'] == selected_category)
|
| 38 |
|