diff --git a/nemo/NeMo-Data-Designer/self-hosted-tutorials/community-contributions/nemotron-persona-jp/japanese_commonsense_qa_data_generator_nemotron_persona_jp_seed.ipynb b/nemo/NeMo-Data-Designer/self-hosted-tutorials/community-contributions/nemotron-persona-jp/japanese_commonsense_qa_data_generator_nemotron_persona_jp_seed.ipynb
new file mode 100644
index 00000000..a9c6bc71
--- /dev/null
+++ b/nemo/NeMo-Data-Designer/self-hosted-tutorials/community-contributions/nemotron-persona-jp/japanese_commonsense_qa_data_generator_nemotron_persona_jp_seed.ipynb
@@ -0,0 +1,3169 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "header",
+ "metadata": {},
+ "source": [
+ "# 🎨 NeMo Data Designer: Japanese Commonsense Reasoning Dataset Generation (Improved Version)\n",
+ "\n",
+ "#### ๐ Overview\n",
+ "\n",
+ "This notebook generates synthetic datasets for the following tasks using NeMo Data Designer:\n",
+ "- **jcommonsenseqa**: Japanese commonsense question answering\n",
+ "\n",
+ "**Seed Data**: Uses `nvidia/Nemotron-Personas-Japan` directly as the dataset\n",
+ "\n",
+ "
\n",
+ "\n",
+ "> **Important** — Environment Setup\n",
+ ">\n",
+ "> - Ensure that NeMo Data Designer installation and configuration are completed\n",
+ "> - Ensure that the local LLM server is running\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "imports",
+ "metadata": {},
+ "source": [
+ "### 📦 Import Required Modules"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "imports_code",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from nemo_microservices.data_designer.essentials import (\n",
+ " CategorySamplerParams,\n",
+ " DataDesignerConfigBuilder,\n",
+ " ExpressionColumnConfig,\n",
+ " InferenceParameters,\n",
+ " LLMJudgeColumnConfig,\n",
+ " LLMStructuredColumnConfig,\n",
+ " LLMTextColumnConfig,\n",
+ " ModelConfig,\n",
+ " NeMoDataDesignerClient,\n",
+ " SamplerColumnConfig,\n",
+ " SamplerType,\n",
+ " Score,\n",
+ ")\n",
+ "\n",
+ "from pydantic import BaseModel, Field\n",
+ "from datasets import load_dataset\n",
+ "import pandas as pd\n",
+ "import random"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "client_init",
+ "metadata": {},
+ "source": [
+ "### ⚙️ Initialize NeMo Data Designer Client"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "client_code",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "NEMO_MICROSERVICES_BASE_URL = \"http://localhost:8080\"\n",
+ "\n",
+ "data_designer_client = NeMoDataDesignerClient(base_url=NEMO_MICROSERVICES_BASE_URL)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "model_config",
+ "metadata": {},
+ "source": [
+ "### 🎛️ Define Model Configuration"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 70,
+ "id": "model_config_code",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "MODEL_PROVIDER = \"nvidiabuild\"\n",
+ "MODEL_ID = \"openai/gpt-oss-120b\"\n",
+ "MODEL_ALIAS = \"gpt-oss-120b\"\n",
+ "SYSTEM_PROMPT = \"\" \n",
+ "JUDGE_MODEL_ALIAS = \"quality-judge\"\n",
+ "\n",
+ "model_configs = [\n",
+ " ModelConfig(\n",
+ " alias=MODEL_ALIAS,\n",
+ " model=MODEL_ID,\n",
+ " provider=MODEL_PROVIDER,\n",
+ " inference_parameters=InferenceParameters(\n",
+ " temperature=0.9,\n",
+ " top_p=0.95,\n",
+ " max_tokens=2048,\n",
+ " max_parallel_requests=8,\n",
+ " timeout=1200\n",
+ " ),\n",
+ " ),\n",
+ " ModelConfig(\n",
+ " alias=JUDGE_MODEL_ALIAS,\n",
+ " model=MODEL_ID,\n",
+ " provider=MODEL_PROVIDER,\n",
+ " inference_parameters=InferenceParameters(\n",
+ " temperature=0.3,\n",
+ " top_p=0.9,\n",
+ " max_tokens=1024,\n",
+ " max_parallel_requests=4,\n",
+ " timeout=1500,\n",
+ " ),\n",
+ " ),\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "seed_data",
+ "metadata": {},
+ "source": [
+ "### ๐ Prepare Seed Data\n",
+ "\n",
+ "Load persona data from `nvidia/Nemotron-Personas-Japan` and\n",
+ "pass it to Data Designer as a pandas DataFrame."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "2ca3c23c-d9df-4231-9311-1d8d3ad4d7d8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import json\n",
+ "import pandas as pd\n",
+ "\n",
+ "personas_dataset = load_dataset(\"nvidia/Nemotron-Personas-Japan\", split=\"train\")\n",
+ "\n",
+ "# Convert the loaded dataset split to a pandas DataFrame\n",
+ "df = personas_dataset.to_pandas()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2c0be399-7822-4c9b-b321-6d9bbe498302",
+ "metadata": {},
+ "source": [
+ "## Define Target Count and Category Breakdown\n",
+ "\n",
+ "Define target of 2000 total seeds with category-specific breakdowns.\n",
+ "\n",
+ "- **SEED_TARGET**: 2000 total seeds\n",
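+ "- **WeakA**: geo(250), tools(100), public(200) = 550 total; these sub-quotas already exceed `N_WEAK_A` (400), which leaves no room for the `other` (culture/life) bucket (`N_WEAK_A` − 550 < 0), so no `other` seeds are drawn\n",
+ "- **WeakB** (weakness reinforcement): finance(400), safety(350), vocab(350) = 1100 total\n",
+ "- **Typical**: Whatever remains after WeakB and WeakA are drawn (about 350 seeds when every quota above is filled; the sampling cell computes this remainder directly rather than using `N_TYPICAL`)\n",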
+ "- **Bias suppression**: Max 10 per occupation, max 12 per prefecture"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "32ff131c-9815-49ba-9df6-a77018618473",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import re\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "from typing import List, Optional\n",
+ "\n",
+ "np.random.seed(42)\n",
+ "\n",
+ "\n",
+ "SEED_TARGET = 2000\n",
+ "\n",
+ "# weakB๏ผๅผฑ็น่ฃๅผท๏ผ: ๅ่จ1100\n",
+ "WEAKB_TARGETS = {\"finance\": 400, \"safety\": 350, \"vocab\": 350}\n",
+ "N_WEAK_A = 400 # weakAๅ
จไฝ\n",
+ "N_TYPICAL = SEED_TARGET - sum(WEAKB_TARGETS.values()) - N_WEAK_A # = 500\n",
+ "\n",
+ "# weakA ๅ
ใใตใใฏใฉใผใฟใงๅบๅฎ๏ผใใใๆ้่ฆ๏ผ\n",
+ "N_GEO = 250\n",
+ "N_TOOLS = 100\n",
+ "N_PUBLIC = 200\n",
+ "# NOTE: the three sub-quotas above sum to 550, which already exceeds N_WEAK_A (400),\n",
+ "# so the raw remainder is -150 (not 150). Clamp at 0 so the quota can never be negative;\n",
+ "# with these values the culture/life 'other' bucket gets no dedicated draw.\n",
+ "N_WEAK_A_OTHER = max(0, N_WEAK_A - N_GEO - N_TOOLS - N_PUBLIC)\n",
+ "\n",
+ "# ๅใๆๅถ๏ผcap๏ผ\n",
+ "CAP_PER_OCCUPATION = 10\n",
+ "CAP_PER_PREFECTURE = 12\n",
+ "\n",
+ "# D(ๅ
ฌๅ
ฑ)ใโๅฎๆ/ๅ็คผโใซๅธใใใชใใใใฎๅชๅ
ๅบฆ๏ผpublic_bonus๏ผ\n",
+ "PUBLIC_BONUS_KW = [\"ๅณๆธ้คจ\",\"็
้ข\",\"ๅฝนๆ\",\"็ชๅฃ\",\"ๅพ
ๅๅฎค\",\"ๅ\",\"ไธฆใถ\",\"้ ็ช\",\"ๅไป\",\"ไผ่จ\",\n",
+ " \"้ง
\",\"ๆนๆญ\",\"ๅ็ฌฆ\",\"ใดใ\",\"ๅๅฅ\",\"ใซใผใซ\",\"็ฆๆญข\",\"ๅชๅ
ๅธญ\",\"ใจในใซใฌใผใฟใผ\",\"ใจใฌใใผใฟใผ\"]\n",
+ "RELIGION_PENALTY_KW = [\"ๅฏบ\",\"็ฅ็คพ\",\"ๆไผ\",\"็คผๆๅ \",\"ใใต\",\"ไฟฎ้้ข\",\"ไปๅฃ\",\"ใ็ต\",\"็ท้ฆ\",\"ๆฐ็ \",\"ๅข\",\"ใๅฎใ\"]\n",
+ "\n",
+ "# ๅ็พๆง: hash(cat)็ฆๆญข โ ๅบๅฎseed\n",
+ "CAT_RS = {\"finance\": 201, \"safety\": 202, \"vocab\": 203}\n",
+ "WA_RS = {\"geo\": 111, \"tools\": 112, \"other\": 113}\n",
+ "TY_RS = 301\n",
+ "\n",
+ "# neutral ใๅขใใใใใชใ๏ผไธ้๏ผ\n",
+ "NEUTRAL_CAP = 50"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e5829815-c045-4a3d-ada5-6045775b115e",
+ "metadata": {},
+ "source": [
+ "## Handle Missing Values\n",
+ "\n",
+ "Fill missing values in required columns with empty strings.\n",
+ "Create columns with empty strings if they don't exist.\n",
+ "\n",
+ "## Text Construction\n",
+ "\n",
+ "Combine multiple columns to construct text for classification.\n",
+ "\n",
+ "- `_all_text`: Combine all columns\n",
+ "- `_core_text`: Combine core columns only (primary target for keyword matching)\n",
+ "- `_core_len`: Character count of core text"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "9b908d96-e515-494c-ae0e-5a76862b6a96",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "\n",
+ "TEXT_COLS_ALL = [\n",
+ " \"occupation\",\"hobbies_and_interests\",\"marital_status\",\"education_level\",\"prefecture\",\"region\",\"area\",\n",
+ " \"professional_persona\",\"cultural_background\",\"travel_persona\",\"culinary_persona\",\"persona\",\n",
+ "]\n",
+ "CORE_COLS = [\"occupation\",\"hobbies_and_interests\",\"marital_status\",\"education_level\",\"prefecture\",\"region\",\"area\"]\n",
+ "EXTRA_COLS = [\"uuid\",\"age\",\"age_band\",\"skills_and_expertise_list\",\n",
+ " \"travel_persona\",\"hobbies_and_interests\",\"area\"]\n",
+ "\n",
+ "for col in sorted(set(TEXT_COLS_ALL + CORE_COLS + EXTRA_COLS)):\n",
+ " if col in df.columns:\n",
+ " df[col] = df[col].fillna(\"\").astype(str)\n",
+ " else:\n",
+ " df[col] = \"\"\n",
+ "\n",
+ "def build_text(row: pd.Series, cols: List[str]) -> str:\n",
+ "    \"\"\"Join the non-blank values of `cols` in `row` into one \" / \"-separated string.\n",
+ "\n",
+ "    Columns missing from `row` are treated as empty. Every value is converted with\n",
+ "    `str()` before joining, so a non-string cell (e.g. a number) cannot make\n",
+ "    `str.join` raise a TypeError. Values that are blank after stripping are skipped,\n",
+ "    but kept values are joined unstripped.\n",
+ "    \"\"\"\n",
+ "    parts = [str(row.get(c, \"\")) for c in cols]\n",
+ "    return \" / \".join(p for p in parts if p.strip())\n",
+ "\n",
+ "df[\"_all_text\"] = df.apply(lambda r: build_text(r, TEXT_COLS_ALL), axis=1)\n",
+ "df[\"_core_text\"] = df.apply(lambda r: build_text(r, CORE_COLS), axis=1)\n",
+ "df[\"_core_len\"] = df[\"_core_text\"].str.len()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "be999019-cd13-48df-96c6-9f643f4aca77",
+ "metadata": {},
+ "source": [
+ "## Create Duplicate Suppression Key\n",
+ "\n",
+ "Create a key (`_attr_key`) for duplicate detection based on attribute combinations.\n",
+ "This prevents selecting multiple similar personas.\n",
+ "\n",
+ "Exclude completely empty keys (all fields empty)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "0fe57a1f-0569-4e7c-b348-0b00a2e1de27",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "\n",
+ "KEY_COLS = [\"prefecture\",\"region\",\"area\",\"occupation\",\"education_level\",\"marital_status\"]\n",
+ "\n",
+ "def _norm(s: str) -> str:\n",
+ "    \"\"\"Normalise a value for key building.\n",
+ "\n",
+ "    None becomes an empty string; anything else is converted with `str()`, trimmed,\n",
+ "    and has every run of whitespace collapsed to a single space.\n",
+ "    \"\"\"\n",
+ "    text = str(s) if s is not None else \"\"\n",
+ "    # split() with no argument drops leading/trailing whitespace and splits on runs\n",
+ "    return \" \".join(text.split())\n",
+ "\n",
+ "df[\"_attr_key\"] = df.apply(lambda r: \"|\".join([_norm(r.get(c, \"\")) for c in KEY_COLS]), axis=1)\n",
+ "df = df[df[\"_attr_key\"] != \"|||||\"].copy()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f1389684-da68-497c-8566-916696dcefdc",
+ "metadata": {},
+ "source": [
+ "## Exclude by Negative Keywords\n",
+ "\n",
+ "Exclude personas containing inappropriate keywords (extreme expressions, crime-related, etc.) unsuitable for JCommonsenseQA.\n",
+ "Evaluate `_core_text` and remove matching entries."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "b3c2fe84-5ffc-4f4b-a830-4022a3eab5ad",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "NEG_KW_JC = [\n",
+ " \"้ฐ่ฌ\",\"ๅใฏใฏ\",\"ๆฅตๅณ\",\"ๆฅตๅทฆ\",\"ใใคใ\",\"ๅทฎๅฅ\",\"ๆๅค\",\"ใใญ\",\"้ๆฟ\",\n",
+ " \"ไธๅผใ\",\"็ช็\",\"่ฉๆฌบ\",\"ๆจช้ \",\"่ฆ้ๅค\",\"้บป่ฌ\",\n",
+ " \"็ตถๅฏพ\",\"่จฑใใชใ\",\"ๅซๆช\",\"ๆใ\",\n",
+ "]\n",
+ "df[\"has_neg_jc\"] = df[\"_core_text\"].str.contains(\"|\".join(map(re.escape, NEG_KW_JC)), regex=True, na=False)\n",
+ "df_jc = df[~df[\"has_neg_jc\"]].copy()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4e32acc8-1226-4134-8913-f802e4ec8148",
+ "metadata": {},
+ "source": [
+ "## Define Category Keyword Dictionaries\n",
+ "\n",
+ "Define keywords for transportation/movement, daily life/housework, and tools.\n",
+ "\n",
+ "- **geo_kw**: A_Transportation/Movement (trains, stations, buses, walking, etc.)\n",
+ "- **life_kw**: F_Daily Life/Housework (cooking, cleaning, shopping, etc.)\n",
+ "- **tools_kw**: B_Tools/Usage (knives, vacuum cleaners, stationery, etc.)\n",
+ "\n",
+ "Define keywords for public facilities/manners, culture/etiquette, and finance.\n",
+ "\n",
+ "- **public_kw**: D_Public Facilities/Manners (lines, order, priority seats, etc.)\n",
+ "- **culture_kw**: D_Public Facilities/Manners (etiquette, ceremonies, etc.)\n",
+ "- **finance_kw**: C_Payment/Money (accounting, banking, card payments, etc.)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "e9c8b19e-d1f2-4598-b25d-6938c718bfed",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "geo_kw = [\"้ป่ป\",\"ๅฐไธ้\",\"ๆฐๅนน็ท\",\"้ง
\",\"ๆนๆญ\",\"ๅ็ฌฆ\",\"ๅฎๆๅธ\",\"ICใซใผใ\",\"Suica\",\"PASMO\",\"ใใน\",\"ใในๅ\",\"ใฟใฏใทใผ\",\n",
+ " \"้ซ้้่ทฏ\",\"้ง่ปๅ ด\",\"ไนใๆใ\",\"ๆๅป่กจ\",\"็ต้ป\",\"ๅง็บ\",\"ๆธๆป\",\"่ธๅ\",\"ไฟกๅท\",\"ใใผใ \",\"่ทฏ็ท\",\"ไน่ปๅธ\",\n",
+ " \"ๅพๆญฉ\",\"ๆญฉใ\",\"ๆญฉ้\",\"ๆจชๆญๆญฉ้\",\"ไบคๅทฎ็น\",\"ไฟกๅทๅพ
ใก\",\"ๅณๆ\",\"ๅทฆๆ\",\"็งปๅ\",\"็ฎ็ๅฐ\",\"็ต่ทฏ\",\"ใซใผใ\",\n",
+ " \"ๅฐๅณ\",\"ใใ\",\"ๆๅฏใ\",\"ไนใ\",\"้ใใ\",\"ไธ่ป\",\"ไน่ป\",\"ๅ
ฅๅฃ\",\"ๅบๅฃ\",\"ๆนๆญๅฃ\",\"ใใผใ ใใข\",\"้ๆฎต\",\"ไนใๅ ด\"]\n",
+ "\n",
+ "life_kw = [\"ๆ็\",\"ๆ้ค\",\"็ไปใ\",\"DIY\",\"่ฒทใ็ฉ\",\"ๅฅๅบท\",\"็ก็ \",\"่ฒๅ
\",\"ๅญ่ฒใฆ\",\"ๅผๅฝ\",\"ๆดๆฟฏ\",\"ๅฎถไบ\",\n",
+ " \"ใดใๅบใ\",\"ๅๅฅ\",\"่ณๆบใใฟ\",\"ๅฏ็\",\"ไธ็\",\"็ฒๅคงใใฟ\",\"็ฏ็ด\",\"ๆด็ๆด้ \",\"ๅ็ด\",\n",
+ " \"็
้ข\",\"่ฌๅฑ\",\"ไผ่จ\",\"ใฌใธ\",\n",
+ " \"ๆธๆฃ\",\"้ฃๅจๆฃ\",\"ๆฌๆฃ\",\"ๅผใๅบใ\",\"ใฏใญใผใผใใ\",\"ใฟใณใน\",\"ๆฃ\",\"ๆบ\",\"ๆค
ๅญ\",\n",
+ " \"ๆ้คๆฉ\",\"ๆดๆฟฏๆฉ\",\"้ปๅญใฌใณใธ\",\"็้ฃฏๅจ\"]\n",
+ "\n",
+ "tools_kw = [\"ๅ
ไธ\",\"ใพใชๆฟ\",\"้\",\"ใใฉใคใใณ\",\"่็ฎธ\",\"ใใใพ\",\"ใปใใ\",\"ใกใใจใ\",\"้ๅทพ\",\"ในใใณใธ\",\"ๆดๅค\",\"ๆผ็ฝๅค\",\n",
+ " \"ๆ้คๆฉ\",\"ใใฉใคใใผ\",\"ใใณใ\",\"้ใฅใก\",\"ใใชใฅใก\",\"ใใณใฎใช\",\"ใทใฃใใซ\",\"้\",\"ใใตใ\",\n",
+ " \"ใใณ\",\"้็ญ\",\"ใทใฃใผใใณ\",\"ๆถใใดใ \",\"ใใผใ\",\"ไป็ฎ\",\"ใใใใญใน\",\"ใฏใชใใ\",\"ๅฎ่ฆ\",\"ใใผใซใผ\",\"ใซใใฟใผ\",\n",
+ " \"ใฌใ ใใผใ\",\"ใปใญใใผใ\",\"ใฉใใ\",\"ใขใซใใใคใซ\",\"ใใณใฐ\",\"่จ้ในใใผใณ\",\"่จ้ใซใใ\",\n",
+ " \"ๅปถ้ทใณใผใ\",\"ๅ
้ปๅจ\",\"้ปๆฑ \",\"ใใฉใคใคใผ\",\"ใขใคใญใณ\"]\n",
+ "\n",
+ "public_kw = [\"ๅ
ฌๅ
ฑ\",\"ใซใผใซ\",\"้ ็ช\",\"ๅ\",\"ไธฆใถ\",\"ๅฒใ่พผใฟ\",\"ๅชๅ
ๅธญ\",\"็ฆ็
\",\"ๅซ็
\",\"้จ้ณ\",\"้ใใซ\",\"ใดใ\",\"ใใคๆจใฆ\",\"่ฟทๆ\",\n",
+ " \"ๅณๆธ้คจ\",\"ๆ ็ป้คจ\",\"็
้ข\",\"ๅพ
ๅๅฎค\",\"ใจในใซใฌใผใฟใผ\",\"ใจใฌใใผใฟใผ\",\"ใณใณใใ\",\"ในใผใใผ\",\"ๅบๅ
\"]\n",
+ "\n",
+ "culture_kw = [\"็คผๅ\",\"ไฝๆณ\",\"ๅ ๅฉ่ฌ็ฅญ\",\"ใ่พๅ\",\"ๆญฃๆ\",\"ใ็\",\"็ฅญใ\",\"ๅนดไธญ่กไบ\",\"็็ฉ\",\"่ถ้\",\n",
+ " \"ๆจๆถ\",\"ๆฌ่ช\",\"ๅๅบ\",\"ๆๅ็ฃ\",\"ใ็คผ\",\"ใ่ฉซใณ\",\"่ฌ็ฝช\",\"ๆญใ\",\"้ ๆ
ฎ\",\"ๅคฑ็คผ\",\"้
ๆ
ฎ\",\"ๆฐ้ฃใ\",\n",
+ " \"็ฅ็คพ\",\"ๅฏบ\",\"ไปๆ\",\"็ฅ้\"]\n",
+ "\n",
+ "finance_kw = [\"ๆฏๆใ\",\"ไผ่จ\",\"ๆ้\",\"ๅคๆฎต\",\"ๅฒๅผ\",\"ใฏใผใใณ\",\"็ฒพ็ฎ\",\"่ฟ้\",\"้ ๅๆธ\",\"ใฌใทใผใ\",\"้ฃ้ญ\",\"ใใคใ\",\n",
+ " \"้่ก\",\"ATM\",\"ๅฃๅบง\",\"ๆฏ่พผ\",\"้้\",\"ๅผใ่ฝใจใ\",\"่ซๆฑ\",\"่ซๆฑๆธ\",\"ๅฉ็จๆ็ดฐ\",\"ๆ็ดฐ\",\"ๆๆฐๆ\",\"ใใคใณใ\",\n",
+ " \"ใฏใฌใธใใใซใผใ\",\"ใใใใ\",\"้ปๅญใใใผ\",\"ใญใฃใใทใฅใฌใน\",\"็จ\",\"็ดไป\",\"ๆง้ค\",\"ไฟ้บ\",\"ใญใผใณ\",\"ๅๅฒๆใ\",\"ๅฎถ่จ\",\"ไบ็ฎ\"]\n",
+ "\n",
+ "safety_kw = [\"ๅฑ้บ\",\"ๅฑใชใ\",\"ไบๆ
\",\"่ปขๅ\",\"็ซ\",\"็ซไบ\",\"็ซๅท\",\"ใใใฉ\",\"ใฌใน\",\"ใฌในๆผใ\",\"ไธ้
ธๅ็ญ็ด \",\"ๆๆฐ\",\n",
+ " \"ๆ้ป\",\"ๆผ้ป\",\"ใฌใฝใชใณ\",\"็บ็ซ\",\"ๅ็ฉ\",\"ใใคใ\",\"ๅ
ไธ\",\n",
+ " \"้ฃฒ้
้่ปข\",\"ไฟกๅท็ก่ฆ\",\"ในใใผใ\",\"้ช้\",\"ๅ็ต\",\"ใใผใใซใฟใคใค\",\"ใใซใกใใ\",\"ใทใผใใใซใ\",\n",
+ " \"ๆผ็ฝๅค\",\"ๆฎบ่ซๅค\",\"ๆดๅค\",\"่ฌ\",\"่ชค้ฃฒ\",\"้ฃไธญๆฏ\",\"็ฑไธญ็\",\"ๅๅท\",\n",
+ " \"้ฟ้ฃ\",\"้ๅธธๅฃ\",\"ๆถ็ซๅจ\",\"็ซ็ฝๅ ฑ็ฅๅจ\",\"่ญฆๅ ฑ\",\"้ฒ็ฝ\"]\n",
+ "\n",
+ "vocab_kw = [\"็จ้\",\"็ฎ็\",\"ๅ้ก\",\"็จฎ้ก\",\"ๆๅณ\",\"ๅฎ็พฉ\",\"่จใๆใ\",\"่จใๅใ\",\"่กจ็พ\",\n",
+ " \"้ก็พฉ่ช\",\"ๅฏพ็พฉ่ช\",\"ๅๅฏพ่ช\",\"ใใจใใ\",\"ๆ
ฃ็จๅฅ\",\"ๆฏๅฉ\",\"ไพใ\",\n",
+ " \"้ฉๅ\",\"ไธ้ฉๅ\",\"ไธ่ฌ็\",\"ไปฃ่กจ็\",\"ๆฅๅธธ็\"]\n",
+ "\n",
+ "JC_CAT_KWS = {\n",
+ " \"finance\": finance_kw,\n",
+ " \"safety\": safety_kw,\n",
+ " \"vocab\": vocab_kw,\n",
+ " \"public\": public_kw,\n",
+ " \"tools\": tools_kw,\n",
+ " \"life\": life_kw,\n",
+ " \"geo\": geo_kw,\n",
+ " \"culture\": culture_kw,\n",
+ "}\n",
+ "\n",
+ "def count_hits(text: str, kws: List[str]) -> int:\n",
+ "    \"\"\"Return how many entries of `kws` occur as a substring of `text`.\n",
+ "\n",
+ "    Each list entry counts at most once, however often it appears in `text`.\n",
+ "    An empty or None `text` yields 0.\n",
+ "    \"\"\"\n",
+ "    hits = 0\n",
+ "    if text:\n",
+ "        for kw in kws:\n",
+ "            if kw in text:\n",
+ "                hits += 1\n",
+ "    return hits\n",
+ "\n",
+ "ALL_JC_KW = [kw for kws in JC_CAT_KWS.values() for kw in kws]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5d7da658-2cf7-408c-a8f8-d7e3ff3b95d1",
+ "metadata": {},
+ "source": [
+ "## Calculate Keyword Scores\n",
+ "\n",
+ "Score how many keywords from each category are contained in the persona text.\n",
+ "\n",
+ "- Primarily calculate scores using `_core_text`\n",
+ "- **geo/tools** only: recalculate by adding supplementary text (travel_persona, hobbies, etc.)\n",
+ " - This prevents depletion of geo and tools categories and suppresses misclassification to public\n",
+ " \n",
+ "## Exclude Abnormal Scores and Estimate Categories\n",
+ "\n",
+ "Exclude data with abnormally high scores (containing unnaturally many keywords) and estimate the most suitable category for each persona.\n",
+ "\n",
+ "- Select the category with the highest score\n",
+ "- In case of ties, decide by priority (finance > safety > vocab > ...)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "b2a5b386-e702-41ef-9775-c629590a69be",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Base scores: count each category's keyword hits in the core text.\n",
+ "for cat, kws in JC_CAT_KWS.items():\n",
+ "    # `_kws=kws` binds the current keyword list as a default argument of the lambda.\n",
+ "    df_jc[f\"score_{cat}\"] = df_jc[\"_core_text\"].map(lambda t, _kws=kws: count_hits(t, _kws))\n",
+ "\n",
+ "# Recompute geo/tools only, on the core text plus a few supplementary columns.\n",
+ "df_jc[\"_geo_text\"] = df_jc[\"_core_text\"] + \" / \" + df_jc[\"travel_persona\"] + \" / \" + df_jc[\"area\"]\n",
+ "df_jc[\"_tools_text\"] = df_jc[\"_core_text\"] + \" / \" + df_jc[\"hobbies_and_interests\"] + \" / \" + df_jc[\"skills_and_expertise_list\"]\n",
+ "\n",
+ "# These overwrite the score_geo / score_tools values computed in the loop above.\n",
+ "df_jc[\"score_geo\"] = df_jc[\"_geo_text\"].map(lambda t: count_hits(t, geo_kw))\n",
+ "df_jc[\"score_tools\"] = df_jc[\"_tools_text\"].map(lambda t: count_hits(t, tools_kw))\n",
+ "\n",
+ "# Keyword hits over the combined keyword list of all categories, on the core text only.\n",
+ "df_jc[\"_kw_hits\"] = df_jc[\"_core_text\"].map(lambda t: count_hits(t, ALL_JC_KW))\n",
+ "\n",
+ "# Keep only rows whose score is within the per-category ceiling for every category.\n",
+ "MAX_SCORE = {\"finance\":14,\"safety\":16,\"vocab\":12,\"public\":10,\"tools\":10,\"life\":10,\"geo\":10,\"culture\":10}\n",
+ "mask_ok = pd.Series(True, index=df_jc.index)\n",
+ "for c, mx in MAX_SCORE.items():\n",
+ "    mask_ok &= (df_jc[f\"score_{c}\"] <= mx)\n",
+ "df_jc = df_jc[mask_ok].copy()\n",
+ "\n",
+ "def assign_jc_category(row: pd.Series) -> Optional[str]:\n",
+ "    \"\"\"Pick the category whose `score_<category>` value in `row` is highest.\n",
+ "\n",
+ "    Returns None when no category has a positive score. Ties are resolved by the\n",
+ "    fixed order below; if no tied category appears in that order, the first tied\n",
+ "    category in JC_CAT_KWS order is returned.\n",
+ "    \"\"\"\n",
+ "    tie_break_order = [\"finance\",\"safety\",\"vocab\",\"public\",\"tools\",\"life\",\"geo\",\"culture\"]\n",
+ "    per_cat = {name: row[f\"score_{name}\"] for name in JC_CAT_KWS}\n",
+ "    if not per_cat:\n",
+ "        return None\n",
+ "    best = max(per_cat.values())\n",
+ "    if best <= 0:\n",
+ "        return None\n",
+ "    leaders = [name for name, val in per_cat.items() if val == best]\n",
+ "    return next((name for name in tie_break_order if name in leaders), leaders[0])\n",
+ "\n",
+ "df_jc[\"jc_category\"] = df_jc.apply(assign_jc_category, axis=1)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "289345ae-c255-4f28-b1bb-478037062b7b",
+ "metadata": {},
+ "source": [
+ "## Determine Neutral Data\n",
+ "\n",
+ "Classify personas with few keyword hits and shorter length as Neutral.\n",
+ "\n",
+ "**Conditions:**\n",
+ "- Core text length is 260 characters or less\n",
+ "- Keyword hit count is 0\n",
+ "- Does not contain definition keywords (such as '~ใจใฏ')\n",
+ "\n",
+ "Limit Neutral to 50 entries to prevent too many thin seeds.\n",
+ "\n",
+ "## Create Sampling Pools\n",
+ "\n",
+ "Create sampling pools for each category.\n",
+ "\n",
+ "- **typical_pool**: Neutral and thin data (max_score ≤ 2)\n",
+ "- **weakB_pool**: Reinforcement targets (finance, safety, vocab)\n",
+ "- **geo_pool, tools_pool, public_pool, other_pool**: Each sub-category of WeakA"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "86d3ef19-a9bf-4bbb-9adc-46a5ad241c2a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "DEFINITION_KW = [\"ใจใฏ\",\"ใจใใ\",\"ๆๅณ\",\"ๅฎ็พฉ\",\"ไฝใจใใ\",\"ไฝใจ่จใ\",\"ใฉใใๆใ\"]\n",
+ "df_jc[\"_has_definition\"] = df_jc[\"_core_text\"].str.contains(\"|\".join(map(re.escape, DEFINITION_KW)), regex=True, na=False)\n",
+ "\n",
+ "NEUTRAL_MAX_LEN_CORE = 260\n",
+ "NEUTRAL_MAX_HITS = 0\n",
+ "df_jc[\"is_neutral\"] = (\n",
+ " (df_jc[\"_core_len\"] > 0)\n",
+ " & (df_jc[\"_core_len\"] <= NEUTRAL_MAX_LEN_CORE)\n",
+ " & (df_jc[\"_kw_hits\"] <= NEUTRAL_MAX_HITS)\n",
+ " & (~df_jc[\"_has_definition\"])\n",
+ ")\n",
+ "\n",
+ "neutral_pool = df_jc[df_jc[\"is_neutral\"]].drop_duplicates(subset=[\"_attr_key\"]).copy()\n",
+ "neutral_pool[\"jc_category\"] = \"neutral\"\n",
+ "neutral_pool = neutral_pool.sort_values(\"_core_len\").head(NEUTRAL_CAP) # โ
ไธ้\n",
+ "\n",
+ "jc_pool = df_jc[df_jc[\"jc_category\"].notna()].drop_duplicates(subset=[\"_attr_key\"]).copy()\n",
+ "\n",
+ "score_cols = [f\"score_{c}\" for c in JC_CAT_KWS]\n",
+ "jc_pool[\"max_score_any\"] = jc_pool[score_cols].max(axis=1)\n",
+ "\n",
+ "typical_pool = (\n",
+ " pd.concat([neutral_pool, jc_pool[jc_pool[\"max_score_any\"] <= 2]], axis=0)\n",
+ " .drop_duplicates(subset=[\"_attr_key\"])\n",
+ " .copy()\n",
+ ")\n",
+ "\n",
+ "weakB_pool = jc_pool[jc_pool[\"jc_category\"].isin([\"finance\",\"safety\",\"vocab\"])].copy()\n",
+ "# weakAใฏใใใงๅๅฒใใฆไฝฟใ๏ผgeo/tools/other๏ผ\n",
+ "geo_pool = jc_pool[jc_pool[\"jc_category\"]==\"geo\"].copy()\n",
+ "tools_pool = jc_pool[jc_pool[\"jc_category\"]==\"tools\"].copy()\n",
+ "public_pool = jc_pool[jc_pool[\"jc_category\"]==\"public\"].copy() # โ
่ฟฝๅ \n",
+ "other_pool = jc_pool[jc_pool[\"jc_category\"].isin([\"culture\",\"life\"])].copy() # โ
publicใ้คๅค"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e1a6a9ed-628d-4d7f-9339-fded62df4efb",
+ "metadata": {},
+ "source": [
+ "## Define Sampling Function with Caps\n",
+ "\n",
+ "Function that samples while suppressing bias by occupation and prefecture.\n",
+ "\n",
+ "**Operation:**\n",
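+ "1. Shuffle the pool, then sample while respecting the occupation/prefecture caps\n",
+ "2. If insufficient, relax the caps and fill the remainder from rows not yet selected\n",
+ "3. Return the requested count whenever the pool holds enough rows (fewer only if the pool is exhausted)"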
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "c52f7716-2cb3-4b6d-9706-2c00c65b51a8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def sample_with_caps(pool: pd.DataFrame, n: int, random_state: int = 0) -> pd.DataFrame:\n",
+ "    \"\"\"Draw up to `n` rows from `pool` while limiting per-occupation/prefecture counts.\n",
+ "\n",
+ "    The pool is shuffled with `random_state` first, so any row ordering applied by\n",
+ "    the caller is NOT preserved. Rows are then taken in two passes:\n",
+ "\n",
+ "    1. Capped pass: a row is skipped when its non-empty occupation already has\n",
+ "       CAP_PER_OCCUPATION picks or its non-empty prefecture already has\n",
+ "       CAP_PER_PREFECTURE picks.\n",
+ "    2. Relaxed pass: if fewer than `n` rows were picked, the remaining rows are\n",
+ "       taken in shuffled order with the caps ignored.\n",
+ "\n",
+ "    Returns an empty frame with the pool's columns when `n <= 0` or the pool is\n",
+ "    empty; otherwise at most `n` rows (fewer only when the pool runs out).\n",
+ "    \"\"\"\n",
+ "    if n <= 0 or len(pool) == 0:\n",
+ "        return pool.iloc[0:0].copy()\n",
+ "\n",
+ "    pool2 = pool.sample(frac=1.0, random_state=random_state).copy()\n",
+ "\n",
+ "    out = []\n",
+ "    picked_pos = set()  # positions (in shuffled order) already taken\n",
+ "    used_keys = set()   # `_attr_key` values already taken\n",
+ "    occ_cnt, pref_cnt = {}, {}\n",
+ "\n",
+ "    def _take(pos, r):\n",
+ "        # Record the row together with its position and (if present) its key.\n",
+ "        out.append(r)\n",
+ "        picked_pos.add(pos)\n",
+ "        k = r.get(\"_attr_key\", None)\n",
+ "        if k is not None:\n",
+ "            used_keys.add(k)\n",
+ "\n",
+ "    # 1st pass: honour the caps\n",
+ "    for pos, (_, r) in enumerate(pool2.iterrows()):\n",
+ "        occ = str(r.get(\"occupation\",\"\"))\n",
+ "        pref = str(r.get(\"prefecture\",\"\"))\n",
+ "        if occ and occ_cnt.get(occ, 0) >= CAP_PER_OCCUPATION:\n",
+ "            continue\n",
+ "        if pref and pref_cnt.get(pref, 0) >= CAP_PER_PREFECTURE:\n",
+ "            continue\n",
+ "        _take(pos, r)\n",
+ "        occ_cnt[occ] = occ_cnt.get(occ, 0) + 1\n",
+ "        pref_cnt[pref] = pref_cnt.get(pref, 0) + 1\n",
+ "        if len(out) >= n:\n",
+ "            return pd.DataFrame(out)\n",
+ "\n",
+ "    # 2nd pass: caps relaxed (simply fill the remainder)\n",
+ "    for pos, (_, r) in enumerate(pool2.iterrows()):\n",
+ "        if pos in picked_pos:\n",
+ "            # Never re-add a row taken in the first pass, even without `_attr_key`.\n",
+ "            continue\n",
+ "        k = r.get(\"_attr_key\", None)\n",
+ "        if k is not None and k in used_keys:\n",
+ "            # Key already represented (including by rows added during this pass).\n",
+ "            continue\n",
+ "        _take(pos, r)\n",
+ "        if len(out) >= n:\n",
+ "            break\n",
+ "\n",
+ "    return pd.DataFrame(out)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "dcfdca13-7207-46d3-8ed9-f79af75c99d6",
+ "metadata": {},
+ "source": [
+ "## Sample WeakB Categories\n",
+ "\n",
+ "Sample weakness reinforcement targets (WeakB).\n",
+ "\n",
+ "- **finance**: 400 entries\n",
+ "- **safety**: 350 entries\n",
+ "- **vocab**: 350 entries\n",
+ "\n",
+ "Sample a total of 1100 entries, excluding already selected data from subsequent sampling.\n",
+ "\n",
+ "## WeakA - Sample Geo/Tools\n",
+ "\n",
+ "Sample transportation/movement and tools categories from WeakA.\n",
+ "\n",
+ "- **geo** (Transportation/Movement): 250 entries\n",
+ "- **tools** (Tools): 100 entries\n",
+ "\n",
+ "## WeakA - Sample Public/Other\n",
+ "\n",
+ "Sample the remainder of WeakA.\n",
+ "\n",
+ "- **public** (Public Facilities/Manners): 200 entries\n",
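+ "- **other** (culture/life): quota is `N_WEAK_A - N_GEO - N_TOOLS - N_PUBLIC`; with the configured values this is not positive, so no entries are drawn in this step (culture/life personas can still enter through the Typical pool)\n",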
+ " - For Other, prioritize those with public facility-related keywords\n",
+ " - Suppress those with religion-related keywords (penalty)\n",
+ " - This prevents category D from being biased toward religion\n",
+ "\n",
+ "## Sample Typical and Final Adjustments\n",
+ "\n",
+ "Fill the remaining slots (2000 minus everything drawn in the previous steps) from the Typical category.\n",
+ "\n",
+ "**Process:**\n",
+ "1. Sample remaining count from Typical pool\n",
+ "2. Combine all parts\n",
+ "3. If insufficient, add from unused data\n",
+ "4. If excess, adjust to 2000 entries\n",
+ "5. **Always ensure exactly 2000 entries**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "id": "cc028a9a-bfee-4fc2-8f4a-4cb28f5b6774",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "seed_parts = []\n",
+ "used = set()\n",
+ "\n",
+ "\n",
+ "def _unused(pool: pd.DataFrame) -> pd.DataFrame:\n",
+ "    \"\"\"Return a copy of `pool` without rows whose `_attr_key` is already in `used`.\"\"\"\n",
+ "    return pool[~pool[\"_attr_key\"].isin(used)].copy()\n",
+ "\n",
+ "\n",
+ "def _draw(candidates: pd.DataFrame, quota: int, seed: int) -> pd.DataFrame:\n",
+ "    \"\"\"Sample up to `quota` rows from `candidates`, append them to `seed_parts`,\n",
+ "    and add their `_attr_key` values to `used`. Returns the sampled frame.\"\"\"\n",
+ "    picked = sample_with_caps(candidates, min(quota, len(candidates)), random_state=seed)\n",
+ "    seed_parts.append(picked)\n",
+ "    used.update(picked[\"_attr_key\"].tolist())\n",
+ "    return picked\n",
+ "\n",
+ "\n",
+ "# ---- weakB (fixed per-category quotas) ----\n",
+ "for cat, quota in WEAKB_TARGETS.items():\n",
+ "    _draw(_unused(weakB_pool[weakB_pool[\"jc_category\"] == cat]), quota, CAT_RS[cat])\n",
+ "\n",
+ "# ---- weakA (sub-quotas) ----\n",
+ "_draw(_unused(geo_pool), N_GEO, WA_RS[\"geo\"])\n",
+ "_draw(_unused(tools_pool), N_TOOLS, WA_RS[\"tools\"])\n",
+ "# Falls back to the fixed seed 114 when WA_RS has no \"public\" entry.\n",
+ "_draw(_unused(public_pool), N_PUBLIC, WA_RS.get(\"public\", 114))\n",
+ "\n",
+ "\n",
+ "# 'other' bucket: favour public-facility keywords, penalise religion keywords.\n",
+ "def _count_kw(text: str, kws) -> int:\n",
+ "    \"\"\"Count the entries of `kws` that occur as a substring of `text` (0 for empty text).\"\"\"\n",
+ "    if not text:\n",
+ "        return 0\n",
+ "    return sum(1 for k in kws if k in text)\n",
+ "\n",
+ "\n",
+ "other_candidates = _unused(other_pool)\n",
+ "other_candidates[\"_public_bonus\"] = other_candidates[\"_core_text\"].map(lambda t: _count_kw(t, PUBLIC_BONUS_KW))\n",
+ "other_candidates[\"_religion_pen\"] = other_candidates[\"_core_text\"].map(lambda t: _count_kw(t, RELIGION_PENALTY_KW))\n",
+ "# NOTE(review): sample_with_caps shuffles its input, so this sort changes which rows the\n",
+ "# seeded shuffle lands on but does not make high-bonus rows more likely to be picked.\n",
+ "other_candidates = other_candidates.sort_values(\n",
+ "    [\"_public_bonus\",\"_religion_pen\",\"_core_len\"], ascending=[False, True, True]\n",
+ ").copy()\n",
+ "# NOTE: with the configured quotas N_WEAK_A_OTHER is not positive, so this draw returns an\n",
+ "# empty frame; it still carries the `_public_bonus` / `_religion_pen` columns into the concat.\n",
+ "_draw(other_candidates, N_WEAK_A_OTHER, WA_RS[\"other\"])\n",
+ "\n",
+ "# ---- typical (whatever is left of the target) ----\n",
+ "remain = SEED_TARGET - sum(len(x) for x in seed_parts)\n",
+ "typical_candidates = _unused(typical_pool).sort_values(\"_core_len\", ascending=True).copy()\n",
+ "seed_parts.append(\n",
+ "    sample_with_caps(typical_candidates, min(remain, len(typical_candidates)), random_state=TY_RS)\n",
+ ")\n",
+ "\n",
+ "seed_jc = pd.concat(seed_parts, axis=0).drop_duplicates(subset=[\"_attr_key\"]).copy()\n",
+ "\n",
+ "# Make sure the size is reached: top up from any not-yet-selected rows.\n",
+ "remain = SEED_TARGET - len(seed_jc)\n",
+ "if remain > 0:\n",
+ "    pool = df_jc[~df_jc[\"_attr_key\"].isin(set(seed_jc[\"_attr_key\"]))].drop_duplicates(subset=[\"_attr_key\"]).copy()\n",
+ "    if len(pool) == 0:\n",
+ "        raise ValueError(\"fill pool empty; cannot reach SEED_TARGET\")\n",
+ "    seed_jc = pd.concat([seed_jc, pool.head(remain)], axis=0).drop_duplicates(subset=[\"_attr_key\"]).copy()\n",
+ "\n",
+ "# Final adjustment: trim any excess, fail loudly on a shortfall.\n",
+ "if len(seed_jc) > SEED_TARGET:\n",
+ "    seed_jc = seed_jc.sample(n=SEED_TARGET, random_state=123)\n",
+ "elif len(seed_jc) < SEED_TARGET:\n",
+ "    raise ValueError(f\"seed pool too small: seed_jc={len(seed_jc)} < {SEED_TARGET}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3ad1fe5a-fb41-4d0f-9b36-9a72428a1c56",
+ "metadata": {},
+ "source": [
+ "## Assign Themes and Check Distribution\n",
+ "\n",
+ "Map categories to JCommonsenseQA themes (A-F, N) and check the final distribution.\n",
+ "\n",
+ "**Themes:**\n",
+ "- A: Transportation/Movement\n",
+ "- B: Tools/Usage\n",
+ "- C: Payment/Money\n",
+ "- D: Public Facilities/Manners\n",
+ "- E: Safety/Danger\n",
+ "- F: Daily Life/Housework\n",
+ "- N: Neutral"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "id": "f4bdbf37-59be-4844-89b4-4e8f08017ce7",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[seed_jc] size: 2000\n",
+ "jc_category\n",
+ "finance 407\n",
+ "vocab 364\n",
+ "safety 352\n",
+ "geo 320\n",
+ "public 208\n",
+ "culture 145\n",
+ "tools 102\n",
+ "life 102\n",
+ "Name: count, dtype: int64\n",
+ "jc_theme\n",
+ "B_้ๅ
ทใป็จ้ 466\n",
+ "C_ๆฏๆใใปใ้ 407\n",
+ "D_ๅ
ฌๅ
ฑๆฝ่จญใปใใใผๆ้ 353\n",
+ "E_ๅฎๅ
จใปๅฑ้บ 352\n",
+ "A_ไบค้ใป็งปๅ 320\n",
+ "F_็ๆดปใปๅฎถไบ 102\n",
+ "Name: count, dtype: int64\n"
+ ]
+ }
+ ],
+ "source": [
+ "# ============================================================\n",
+ "# 9) jc_theme๏ผcategoryใใๆฑบใๆใก๏ผ\n",
+ "# ============================================================\n",
+ "CAT2THEME = {\n",
+ " \"finance\": \"C_ๆฏๆใใปใ้\",\n",
+ " \"safety\": \"E_ๅฎๅ
จใปๅฑ้บ\",\n",
+ " \"vocab\": \"B_้ๅ
ทใป็จ้\",\n",
+ " \"tools\": \"B_้ๅ
ทใป็จ้\",\n",
+ " \"public\": \"D_ๅ
ฌๅ
ฑๆฝ่จญใปใใใผๆ้ \",\n",
+ " \"culture\": \"D_ๅ
ฌๅ
ฑๆฝ่จญใปใใใผๆ้ \",\n",
+ " \"geo\": \"A_ไบค้ใป็งปๅ\",\n",
+ " \"life\": \"F_็ๆดปใปๅฎถไบ\",\n",
+ " \"neutral\": \"N_ใใฅใผใใฉใซ\",\n",
+ "}\n",
+ "seed_jc[\"jc_theme\"] = seed_jc[\"jc_category\"].map(CAT2THEME).fillna(\"N_ใใฅใผใใฉใซ\")\n",
+ "\n",
+ "print(\"[seed_jc] size:\", len(seed_jc))\n",
+ "print(seed_jc[\"jc_category\"].value_counts(dropna=False))\n",
+ "print(seed_jc[\"jc_theme\"].value_counts(dropna=False))\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8be1a05c-2725-464b-b7d8-3f8f7bf7bb2b",
+ "metadata": {},
+ "source": [
+ "## Create and Save Final Output Data\n",
+ "\n",
+ "Select columns needed for prompt generation and create the final seed data.\n",
+ "\n",
+ "**Output Columns:**\n",
+ "- uuid, occupation, prefecture, region, marital_status\n",
+ "- age_band, skills_and_expertise_list\n",
+ "- jc_theme, jc_category, _attr_key\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "id": "1d9c154a-1e0b-42bf-9afe-a74a8efa76f5",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
| \n", + " | uuid | \n", + "occupation | \n", + "prefecture | \n", + "region | \n", + "marital_status | \n", + "age_band | \n", + "skills_and_expertise_list | \n", + "jc_theme | \n", + "jc_category | \n", + "_attr_key | \n", + "
|---|---|---|---|---|---|---|---|---|---|---|
| 836023 | \n", + "749db6e7c2e245b2ae3b46aa12c4f1e0 | \n", + "ไฟ้บๆฅญ ไธญๅฐ | \n", + "ไธ้็ | \n", + "่ฟ็ฟๅฐๆน | \n", + "้ขๅฅ (ๅญไพใใ) | \n", + "\n", + " | ['ไฟ้บๅฅ็ด็ฎก็', 'ใชในใฏ่ฉไพก', '้กงๅฎขๅฏพๅฟ', 'ๆณ่ฆๅถ้ตๅฎ', 'Excelๆฅญๅ'] | \n", + "C_ๆฏๆใใปใ้ | \n", + "finance | \n", + "ไธ้็|่ฟ็ฟๅฐๆน|่ฅฟๆฅๆฌ|ไฟ้บๆฅญ ไธญๅฐ|ไธญๅญฆๅ|้ขๅฅ (ๅญไพใใ) | \n", + "
| 333198 | \n", + "e8d5c500090946b586bd969f84f8ca78 | \n", + "ไฟ้บๆฅญ ไธญๅฐ | \n", + "ๅคง้ชๅบ | \n", + "่ฟ็ฟๅฐๆน | \n", + "ๆขๅฉ | \n", + "\n", + " | ['ไฟ้บๆๆกใปๅฅ็ดๆ็ถใ', 'ใชในใฏ่ฉไพก', '้กงๅฎขไฟก้ ผๆง็ฏ', 'ๆณไปค้ตๅฎใใงใใฏ', ... | \n", + "C_ๆฏๆใใปใ้ | \n", + "finance | \n", + "ๅคง้ชๅบ|่ฟ็ฟๅฐๆน|่ฅฟๆฅๆฌ|ไฟ้บๆฅญ ไธญๅฐ|้ซๆ กๅ|ๆขๅฉ | \n", + "
| 633465 | \n", + "07f13df066824f33ba51dac21eafccf4 | \n", + "ไฟ้บๆฅญ ไธญๅ | \n", + "้ณฅๅ็ | \n", + "ไธญๅฝๅฐๆน | \n", + "ๆชๅฉ | \n", + "\n", + " | [\"ใชในใฏๅๆ\", \"ไฟ้บๅๅ็ฅ่ญ\", \"้กงๅฎขๅฏพๅฟ\", \"ๆธ้ก็ฎก็\", \"ๆณไปค้ตๅฎ\", \"ใ... | \n", + "C_ๆฏๆใใปใ้ | \n", + "finance | \n", + "้ณฅๅ็|ไธญๅฝๅฐๆน|่ฅฟๆฅๆฌ|ไฟ้บๆฅญ ไธญๅ |ๅคงๅญฆๅ ๆ็ณป|ๆชๅฉ | \n", + "
| 505505 | \n", + "74f03ab6afbe4ba1b56f727c8aca5884 | \n", + "ไฟ้บๆฅญ ๅคงๆ (็พๅจใฏๅผ้) | \n", + "ไธ้็ | \n", + "่ฟ็ฟๅฐๆน | \n", + "ๆญปๅฅ | \n", + "\n", + " | [\"ไฟ้บใชในใฏ่ฉไพก\",\"้กงๅฎขๅฏพๅฟใปใซใฆใณใปใชใณใฐ\",\"ๅถๅบฆ้ตๅฎใปใณใณใใฉใคใขใณใน\",\"่ณ็ฃ่จญ... | \n", + "C_ๆฏๆใใปใ้ | \n", + "finance | \n", + "ไธ้็|่ฟ็ฟๅฐๆน|่ฅฟๆฅๆฌ|ไฟ้บๆฅญ ๅคงๆ (็พๅจใฏๅผ้)|้ซๅฐๅ|ๆญปๅฅ | \n", + "
| 136573 | \n", + "cb1384fa55294c888630952ee92c488c | \n", + "ไฟ้บๆฅญ ไธญๅ (็พๅจใฏๅผ้) | \n", + "ๆ็ฅ็ | \n", + "ไธญ้จๅฐๆน | \n", + "ๆญปๅฅ (ๅญไพใใ) | \n", + "\n", + " | ['ใชในใฏ่ฉไพก','้กงๅฎขๅฏพๅฟ','ๅฅ็ด็ฎก็','Excel','ใกใผใซ','ไฟ้บใชใณใฉใคใณใ... | \n", + "C_ๆฏๆใใปใ้ | \n", + "finance | \n", + "ๆ็ฅ็|ไธญ้จๅฐๆน|ๆฑๆฅๆฌ|ไฟ้บๆฅญ ไธญๅ (็พๅจใฏๅผ้)|ๅคงๅญฆๅ ็็ณป|ๆญปๅฅ (ๅญไพใใ) | \n", + "
| 12885 | \n", + "32ab728886554d89a04ee7ea7d42a3cd | \n", + "ไฟ้บๆฅญ ๅคงๆ | \n", + "ไบฌ้ฝๅบ | \n", + "่ฟ็ฟๅฐๆน | \n", + "ๆขๅฉ | \n", + "\n", + " | ['ไฟ้บๅๅ็ฅ่ญ', 'ใชในใฏๅๆ', 'ใใผใ ใใใธใกใณใ', 'Excel', 'Pow... | \n", + "C_ๆฏๆใใปใ้ | \n", + "finance | \n", + "ไบฌ้ฝๅบ|่ฟ็ฟๅฐๆน|่ฅฟๆฅๆฌ|ไฟ้บๆฅญ ๅคงๆ|ๅคงๅญฆๅ ๆ็ณป|ๆขๅฉ | \n", + "
| 112485 | \n", + "c20c606f4b864e1db4fdd379299409e2 | \n", + "ไฟ้บๆฅญ ไธญๅ (็พๅจใฏๅผ้) | \n", + "ๆฑไบฌ้ฝ | \n", + "้ขๆฑๅฐๆน | \n", + "ๆญปๅฅ | \n", + "\n", + " | ['ไฟ้บๅฅ็ดใฎใชในใฏ่ฉไพก', '้กงๅฎขๅฏพๅฟใจไฟก้ ผ้ขไฟๆง็ฏ', 'ไฟ้บๅๅใซ้ขใใๆณ่ฆ้ตๅฎ', ... | \n", + "C_ๆฏๆใใปใ้ | \n", + "finance | \n", + "ๆฑไบฌ้ฝ|้ขๆฑๅฐๆน|ๆฑๆฅๆฌ|ไฟ้บๆฅญ ไธญๅ (็พๅจใฏๅผ้)|้ซๆ กๅ|ๆญปๅฅ | \n", + "
| 110168 | \n", + "395ad83f309343a7adef1ed7d3e1b3f0 | \n", + "ไฟ้บๆฅญ ไธญๅฐ | \n", + "ๅ่็ | \n", + "้ขๆฑๅฐๆน | \n", + "ๆขๅฉ | \n", + "\n", + " | ['ไฟ้บๅๅ็ฅ่ญ','้กงๅฎขๅฏพๅฟ','ใชในใฏๅๆ','ๅถๆฅญๆๆก','ๆธ้กไฝๆ','ใใผใ ใชใผใ... | \n", + "C_ๆฏๆใใปใ้ | \n", + "finance | \n", + "ๅ่็|้ขๆฑๅฐๆน|ๆฑๆฅๆฌ|ไฟ้บๆฅญ ไธญๅฐ|้ซๆ กๅ|ๆขๅฉ | \n", + "
| 419644 | \n", + "8f2df7cef4bd42c884f965f310423221 | \n", + "ไฟ้บๆฅญ ไธญๅ | \n", + "้ฆๅท็ | \n", + "ๅๅฝๅฐๆน | \n", + "ๆขๅฉ (ๅญไพใใ) | \n", + "\n", + " | ['ไฟ้บๅๅ็ฅ่ญ','้กงๅฎขๅฏพๅฟ','ๅฅ็ดๆ็ถใ็ฎก็','Microsoft Officeๆดป็จ... | \n", + "C_ๆฏๆใใปใ้ | \n", + "finance | \n", + "้ฆๅท็|ๅๅฝๅฐๆน|่ฅฟๆฅๆฌ|ไฟ้บๆฅญ ไธญๅ |้ซๆ กๅ|ๆขๅฉ (ๅญไพใใ) | \n", + "
| 604760 | \n", + "94f5b775f7264e9a872baf7d2ec8e25e | \n", + "ไฟ้บๆฅญ ไธญๅ (็พๅจใฏๅผ้) | \n", + "ๆป่ณ็ | \n", + "่ฟ็ฟๅฐๆน | \n", + "ๆญปๅฅ | \n", + "\n", + " | [\"ใชในใฏ่ฉไพกใจๅฅ็ดๅฏฉๆป\", \"ใทใใข่ณ็ฃ้็จใขใใใคใน\", \"้่ๆณ่ฆ้ตๅฎ\", \"ๅฐๅ้่... | \n", + "C_ๆฏๆใใปใ้ | \n", + "finance | \n", + "ๆป่ณ็|่ฟ็ฟๅฐๆน|่ฅฟๆฅๆฌ|ไฟ้บๆฅญ ไธญๅ (็พๅจใฏๅผ้)|ๅคงๅญฆๅ ็็ณป|ๆญปๅฅ | \n", + "
DataDesignerConfigBuilder(\n", + " seed_dataset: 'data-designer/filtered_personas_clean_balanced_2000_jcommonsense_add_jc_theme/filtered_personas_balanced_clean_2000_jcommonsense.csv'\n", + " seed_dataset_columns: [\n", + " "uuid",\n", + " "professional_persona",\n", + " "sports_persona",\n", + " "arts_persona",\n", + " "travel_persona",\n", + " "culinary_persona",\n", + " "persona",\n", + " "cultural_background",\n", + " "skills_and_expertise",\n", + " "skills_and_expertise_list",\n", + " "hobbies_and_interests",\n", + " "hobbies_and_interests_list",\n", + " "career_goals_and_ambitions",\n", + " "sex",\n", + " "age",\n", + " "marital_status",\n", + " "education_level",\n", + " "occupation",\n", + " "region",\n", + " "area",\n", + " "prefecture",\n", + " "country",\n", + " "age_band",\n", + " "_all_text",\n", + " "_core_text",\n", + " "_core_len",\n", + " "_attr_key",\n", + " "score_finance",\n", + " "score_safety",\n", + " "score_vocab",\n", + " "score_public",\n", + " "score_tools",\n", + " "score_life",\n", + " "score_geo",\n", + " "score_culture",\n", + " "_geo_text",\n", + " "_tools_text",\n", + " "_kw_hits",\n", + " "jc_category",\n", + " "max_score_any",\n", + " "_public_bonus",\n", + " "_religion_pen",\n", + " "jc_theme",\n", + " "topic_category"\n", + " ]\n", + ")\n", + "
\n", + " Seed Columns \n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n", + "โ Name โ Value โ\n", + "โกโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฉ\n", + "โ uuid โ 749db6e7c2e245b2ae3b46aa12c4f1e0 โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ professional_persona โ ไธญๅถ โ\n", + "โ โ ไปๅญใฏไฟ้บๅฅ็ดใฎใชในใฏ่ฉไพกใจ้กงๅฎขใใผใบใฎไฝ็ณป็ๅๆใซ้ทๅนดๅพไบใใ้่ทๅพใใกใณใฟใช โฆ โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ sports_persona โ ไธญๅถ โ\n", + "โ โ ไปๅญใฏๅญฃ็ฏใซๅใใใใฆใฉใผใญใณใฐใจใณใใฅใใใฃใฎ่ปฝ้ๅใฏใฉในใงไฝๅ็ถญๆใๅณใใ็ซถ โฆ โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ arts_persona โ ไธญๅถ โ\n", + "โ โ ไปๅญใฏ่ถ้ใจๆธ้ใฎไผ็ตฑ็็จฝๅคใๅบ็คใซใใใธใฟใซๅขจ็ตตใใคใณใฟใฉใฏใใฃใ่ถๅฎคไฝ้จใจใ โฆ โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ travel_persona โ ไธญๅถ โ\n", + "โ โ ไปๅญใฏ่ฟ้ใฎๆญดๅฒ็ๅฏบ้ขใๅญฃ็ฏใฎ่พฒ็ฃ็ฉ็ดๅฃฒๆใธใฎๆฅๅธฐใ่จชๅใ่จ็ปใใๅ่ปไบ็ดใจๅฎฟๆณ โฆ โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ culinary_persona โ ไธญๅถ โ\n", + "โ โ ไปๅญใฏๅญฃ็ฏใฎๆ น่ใจๆตท่ปใไฝฟ็จใใไฝๅกฉๅ้ฃใๅฅฝใฟใๆน่ถใจ็ ่ถใฎๆฝๅบๆ้ใๅพฎ่ชฟๆดใใช โฆ โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ persona โ ไธญๅถ โ\n", + "โ โ ไปๅญใฏ็ต็น็ใชใชในใฏ็ฎก็ใจๅฅๅบทๅฟๅใฎ็ๆดป็ฟๆ ฃใ็ตฑๅใใใชใผใใณใใคใณใใจ่จ็ปๆงใง โฆ โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ 
cultural_background โ ไธ้็ๅบ่บซใง่ฟ็ฟๅฐๆน็นๆใฎๆธฉใใไบบๆ ใจใๅนด้ท่ ใธใฎๆฌๆใ้ใใใไพกๅค่ฆณใๆใกใ็ โฆ โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ skills_and_expertise โ ไฟ้บๅฅ็ดใฎ็ฎก็ใจๆดๆฐใใชในใฏ่ฉไพกใป้กงๅฎขใฎใใผใบๆๆกใๆณ่ฆๅถใฎ้ตๅฎใซๅ ใใฆใExcelใโฆ โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ skills_and_expertise_list โ ['ไฟ้บๅฅ็ด็ฎก็', 'ใชในใฏ่ฉไพก', '้กงๅฎขๅฏพๅฟ', 'ๆณ่ฆๅถ้ตๅฎ', 'Excelๆฅญๅ'] โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ hobbies_and_interests โ ๆฏๆใฎๆฃๆญฉใจ่ปฝใไฝๆใงๅฅๅบท็ถญๆใซๅชใใๅฐๅใฎๅฅๅบท่ฌๅบงใง่ฌๅธซใๅใใใปใใ่ถ้ใๆธ โฆ โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ hobbies_and_interests_list โ ['ๆฃๆญฉใปไฝๆ', 'ๅฅๅบท่ฌๅบง่ฌๅธซ', '่ถ้', 'ๆธ้', 'ๆฐใใ่กจ็พใธใฎๆข็ดข'] โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ career_goals_and_ambitions โ ้่ทๅพใๅฅๅบทไฟ้บๅๅใ้ใใ้ซ้ฝข่ ใฎ็ๆดปๆฏๆดใซ้ขใใใ็ต้จใๆดปใใใ่ฅๆไปฃ็ๅบใธ โฆ โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ sex โ ๅฅณ โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ age โ 64 โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ marital_status โ ้ขๅฅ (ๅญไพใใ) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ education_level โ ไธญๅญฆๅ โ\n", + 
"โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ occupation โ ไฟ้บๆฅญ ไธญๅฐ โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ region โ ่ฟ็ฟๅฐๆน โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ area โ ่ฅฟๆฅๆฌ โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ prefecture โ ไธ้็ โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ country โ ๆฅๆฌ โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ age_band โ โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ _all_text โ ไฟ้บๆฅญ ไธญๅฐ / โ\n", + "โ โ ๆฏๆใฎๆฃๆญฉใจ่ปฝใไฝๆใงๅฅๅบท็ถญๆใซๅชใใๅฐๅใฎๅฅๅบท่ฌๅบงใง่ฌๅธซใๅใใใปใใ่ถ้ใๆธ โฆ โ\n", + "โ โ / ้ขๅฅ (ๅญไพใใ) / ไธญๅญฆๅ / ไธ้็ / ่ฟ็ฟๅฐๆน / ่ฅฟๆฅๆฌ / ไธญๅถ โ\n", + "โ โ ไปๅญใฏไฟ้บๅฅ็ดใฎใชในใฏ่ฉไพกใจ้กงๅฎขใใผใบใฎไฝ็ณป็ๅๆใซ้ทๅนดๅพไบใใ้่ทๅพใใกใณใฟใช โฆ โ\n", + "โ โ / โ\n", + "โ โ ไธ้็ๅบ่บซใง่ฟ็ฟๅฐๆน็นๆใฎๆธฉใใไบบๆ ใจใๅนด้ท่ ใธใฎๆฌๆใ้ใใใไพกๅค่ฆณใๆใกใ็ โฆ โ\n", + "โ โ / ไธญๅถ โ\n", + "โ โ ไปๅญใฏ่ฟ้ใฎๆญดๅฒ็ๅฏบ้ขใๅญฃ็ฏใฎ่พฒ็ฃ็ฉ็ดๅฃฒๆใธใฎๆฅๅธฐใ่จชๅใ่จ็ปใใๅ่ปไบ็ดใจๅฎฟๆณ โฆ โ\n", + "โ โ / ไธญๅถ โ\n", + "โ โ ไปๅญใฏๅญฃ็ฏใฎๆ น่ใจๆตท่ปใไฝฟ็จใใไฝๅกฉๅ้ฃใๅฅฝใฟใๆน่ถใจ็ ่ถใฎๆฝๅบๆ้ใๅพฎ่ชฟๆดใใช โฆ โ\n", + "โ โ / ไธญๅถ โ\n", + "โ โ ไปๅญใฏ็ต็น็ใชใชในใฏ็ฎก็ใจๅฅๅบทๅฟๅใฎ็ๆดป็ฟๆ ฃใ็ตฑๅใใใชใผใใณใใคใณใใจ่จ็ปๆงใง โฆ โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ _core_text โ ไฟ้บๆฅญ ไธญๅฐ / โ\n", + "โ โ 
ๆฏๆใฎๆฃๆญฉใจ่ปฝใไฝๆใงๅฅๅบท็ถญๆใซๅชใใๅฐๅใฎๅฅๅบท่ฌๅบงใง่ฌๅธซใๅใใใปใใ่ถ้ใๆธ โฆ โ\n", + "โ โ / ้ขๅฅ (ๅญไพใใ) / ไธญๅญฆๅ / ไธ้็ / ่ฟ็ฟๅฐๆน / ่ฅฟๆฅๆฌ โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ _core_len โ 116 โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ _attr_key โ ไธ้็|่ฟ็ฟๅฐๆน|่ฅฟๆฅๆฌ|ไฟ้บๆฅญ ไธญๅฐ|ไธญๅญฆๅ|้ขๅฅ (ๅญไพใใ) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ score_finance โ 1 โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ score_safety โ 0 โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ score_vocab โ 1 โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ score_public โ 0 โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ score_tools โ 0 โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ score_life โ 1 โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ score_geo โ 0 โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ score_culture โ 1 โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ _geo_text โ ไฟ้บๆฅญ ไธญๅฐ / โ\n", + "โ โ ๆฏๆใฎๆฃๆญฉใจ่ปฝใไฝๆใงๅฅๅบท็ถญๆใซๅชใใๅฐๅใฎๅฅๅบท่ฌๅบงใง่ฌๅธซใๅใใใปใใ่ถ้ใๆธ โฆ โ\n", + "โ โ / ้ขๅฅ 
(ๅญไพใใ) / ไธญๅญฆๅ / ไธ้็ / ่ฟ็ฟๅฐๆน / ่ฅฟๆฅๆฌ / ไธญๅถ โ\n", + "โ โ ไปๅญใฏ่ฟ้ใฎๆญดๅฒ็ๅฏบ้ขใๅญฃ็ฏใฎ่พฒ็ฃ็ฉ็ดๅฃฒๆใธใฎๆฅๅธฐใ่จชๅใ่จ็ปใใๅ่ปไบ็ดใจๅฎฟๆณ โฆ โ\n", + "โ โ / ่ฅฟๆฅๆฌ โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ _tools_text โ ไฟ้บๆฅญ ไธญๅฐ / โ\n", + "โ โ ๆฏๆใฎๆฃๆญฉใจ่ปฝใไฝๆใงๅฅๅบท็ถญๆใซๅชใใๅฐๅใฎๅฅๅบท่ฌๅบงใง่ฌๅธซใๅใใใปใใ่ถ้ใๆธ โฆ โ\n", + "โ โ / ้ขๅฅ (ๅญไพใใ) / ไธญๅญฆๅ / ไธ้็ / ่ฟ็ฟๅฐๆน / ่ฅฟๆฅๆฌ / โ\n", + "โ โ ๆฏๆใฎๆฃๆญฉใจ่ปฝใไฝๆใงๅฅๅบท็ถญๆใซๅชใใๅฐๅใฎๅฅๅบท่ฌๅบงใง่ฌๅธซใๅใใใปใใ่ถ้ใๆธ โฆ โ\n", + "โ โ / ['ไฟ้บๅฅ็ด็ฎก็', 'ใชในใฏ่ฉไพก', '้กงๅฎขๅฏพๅฟ', 'ๆณ่ฆๅถ้ตๅฎ', 'Excelๆฅญๅ'] โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ _kw_hits โ 4 โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ jc_category โ finance โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ max_score_any โ 1.0 โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ _public_bonus โ โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ _religion_pen โ โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ jc_theme โ C_ๆฏๆใใปใ้ โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ topic_category โ ๅ ฌๅ ฑใฎๅ ด โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n", + " \n", + " \n", + " Generated Columns \n", + 
"โโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n", + "โ Name โ Value โ\n", + "โกโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฉ\n", + "โ clarity_score โ ๆ็ขบ โ\n", + "โโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ difficulty โ ๆฎ้ โ\n", + "โโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ jcqa_data โ { โ\n", + "โ โ 'answer_index': 0, โ\n", + "โ โ 'choice0': '็พ้๏ผ็ดๅนฃใป็กฌ่ฒจ๏ผ', โ\n", + "โ โ 'choice1': 'ใฏใฌใธใใใซใผใ', โ\n", + "โ โ 'choice2': '้ปๅญใใใผ๏ผSuicaใปICOCA ใชใฉ๏ผ', โ\n", + "โ โ 'choice3': 'ๅฐๅๆ', โ\n", + "โ โ 'choice4': 'ในใใผใใใฉใณๆฑบๆธ๏ผPayPay ใชใฉ๏ผ', โ\n", + "โ โ 'question': โ\n", + "โ โ 'ไธ้็ใฎๅ ฌๅ ฑๅณๆธ้คจใงๆฌใฎ่ฟๅดๆ้ใ้ใใ้ใฎๅปถๆป้ใๆฏๆใใจใใๅไปใงไธ่ฌ็ใซๅใไปใใฆใใ โฆ โ\n", + "โ โ 'reasoning': โ\n", + "โ โ 'ๆฅๆฌใฎๅคใใฎๅ ฌๅ ฑๅณๆธ้คจใงใฏใๅปถๆป้ใฎๆฏๆใใฏ็ชๅฃใงใฎ็พ้ใฎใฟใๆจๆบ็ใซๅใไปใใใใฆใใพใใ โฆ โ\n", + "โ โ } โ\n", + "โโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n", + " \n", + " \n", + " LLM-as-a-Judge: quality_metrics \n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n", + "โ difficulty โ question_clarity โ\n", + "โกโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฉ\n", + "โ score: ๆฎ้ โ score: ๆ็ขบ โ\n", + "โ reasoning: โ reasoning: โ\n", + "โ ใใฎๅ้กใฏๆฅๆฌใฎๅ ฌๅ ฑๅณๆธ้คจใฎไธ่ฌ็ใช้ๅถๅฎๅใซ้ขใใ โฆ โ ่ณชๅใฏใไธ้็ใฎๅ ฌๅ ฑๅณๆธ้คจใงๆฌใฎ่ฟๅดๆ้ใ้ใใ้ใฎ โฆ โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n", + " \n", + " [index: 0] \n", + "\n" + ], + "text/plain": [ + " \n", + "\u001b[3m Seed 
Columns \u001b[0m\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n", + "โ\u001b[1m \u001b[0m\u001b[1mName \u001b[0m\u001b[1m \u001b[0mโ\u001b[1m \u001b[0m\u001b[1mValue \u001b[0m\u001b[1m \u001b[0mโ\n", + "โกโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฉ\n", + "โ uuid โ 749db6e7c2e245b2ae3b46aa12c4f1e0 โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ professional_persona โ ไธญๅถ โ\n", + "โ โ ไปๅญใฏไฟ้บๅฅ็ดใฎใชในใฏ่ฉไพกใจ้กงๅฎขใใผใบใฎไฝ็ณป็ๅๆใซ้ทๅนดๅพไบใใ้่ทๅพใใกใณใฟใช โฆ โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ sports_persona โ ไธญๅถ โ\n", + "โ โ ไปๅญใฏๅญฃ็ฏใซๅใใใใฆใฉใผใญใณใฐใจใณใใฅใใใฃใฎ่ปฝ้ๅใฏใฉในใงไฝๅ็ถญๆใๅณใใ็ซถ โฆ โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ arts_persona โ ไธญๅถ โ\n", + "โ โ ไปๅญใฏ่ถ้ใจๆธ้ใฎไผ็ตฑ็็จฝๅคใๅบ็คใซใใใธใฟใซๅขจ็ตตใใคใณใฟใฉใฏใใฃใ่ถๅฎคไฝ้จใจใ โฆ โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ travel_persona โ ไธญๅถ โ\n", + "โ โ ไปๅญใฏ่ฟ้ใฎๆญดๅฒ็ๅฏบ้ขใๅญฃ็ฏใฎ่พฒ็ฃ็ฉ็ดๅฃฒๆใธใฎๆฅๅธฐใ่จชๅใ่จ็ปใใๅ่ปไบ็ดใจๅฎฟๆณ โฆ โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ culinary_persona โ ไธญๅถ โ\n", + "โ โ ไปๅญใฏๅญฃ็ฏใฎๆ น่ใจๆตท่ปใไฝฟ็จใใไฝๅกฉๅ้ฃใๅฅฝใฟใๆน่ถใจ็ ่ถใฎๆฝๅบๆ้ใๅพฎ่ชฟๆดใใช โฆ โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ persona โ ไธญๅถ โ\n", + "โ โ ไปๅญใฏ็ต็น็ใชใชในใฏ็ฎก็ใจๅฅๅบทๅฟๅใฎ็ๆดป็ฟๆ ฃใ็ตฑๅใใใชใผใใณใใคใณใใจ่จ็ปๆงใง โฆ โ\n", + 
"โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ cultural_background โ ไธ้็ๅบ่บซใง่ฟ็ฟๅฐๆน็นๆใฎๆธฉใใไบบๆ ใจใๅนด้ท่ ใธใฎๆฌๆใ้ใใใไพกๅค่ฆณใๆใกใ็ โฆ โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ skills_and_expertise โ ไฟ้บๅฅ็ดใฎ็ฎก็ใจๆดๆฐใใชในใฏ่ฉไพกใป้กงๅฎขใฎใใผใบๆๆกใๆณ่ฆๅถใฎ้ตๅฎใซๅ ใใฆใExcelใโฆ โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ skills_and_expertise_list โ ['ไฟ้บๅฅ็ด็ฎก็', 'ใชในใฏ่ฉไพก', '้กงๅฎขๅฏพๅฟ', 'ๆณ่ฆๅถ้ตๅฎ', 'Excelๆฅญๅ'] โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ hobbies_and_interests โ ๆฏๆใฎๆฃๆญฉใจ่ปฝใไฝๆใงๅฅๅบท็ถญๆใซๅชใใๅฐๅใฎๅฅๅบท่ฌๅบงใง่ฌๅธซใๅใใใปใใ่ถ้ใๆธ โฆ โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ hobbies_and_interests_list โ ['ๆฃๆญฉใปไฝๆ', 'ๅฅๅบท่ฌๅบง่ฌๅธซ', '่ถ้', 'ๆธ้', 'ๆฐใใ่กจ็พใธใฎๆข็ดข'] โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ career_goals_and_ambitions โ ้่ทๅพใๅฅๅบทไฟ้บๅๅใ้ใใ้ซ้ฝข่ ใฎ็ๆดปๆฏๆดใซ้ขใใใ็ต้จใๆดปใใใ่ฅๆไปฃ็ๅบใธ โฆ โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ sex โ ๅฅณ โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ age โ 64 โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ marital_status โ ้ขๅฅ (ๅญไพใใ) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + 
"โ education_level โ ไธญๅญฆๅ โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ occupation โ ไฟ้บๆฅญ ไธญๅฐ โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ region โ ่ฟ็ฟๅฐๆน โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ area โ ่ฅฟๆฅๆฌ โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ prefecture โ ไธ้็ โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ country โ ๆฅๆฌ โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ age_band โ โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ _all_text โ ไฟ้บๆฅญ ไธญๅฐ / โ\n", + "โ โ ๆฏๆใฎๆฃๆญฉใจ่ปฝใไฝๆใงๅฅๅบท็ถญๆใซๅชใใๅฐๅใฎๅฅๅบท่ฌๅบงใง่ฌๅธซใๅใใใปใใ่ถ้ใๆธ โฆ โ\n", + "โ โ / ้ขๅฅ (ๅญไพใใ) / ไธญๅญฆๅ / ไธ้็ / ่ฟ็ฟๅฐๆน / ่ฅฟๆฅๆฌ / ไธญๅถ โ\n", + "โ โ ไปๅญใฏไฟ้บๅฅ็ดใฎใชในใฏ่ฉไพกใจ้กงๅฎขใใผใบใฎไฝ็ณป็ๅๆใซ้ทๅนดๅพไบใใ้่ทๅพใใกใณใฟใช โฆ โ\n", + "โ โ / โ\n", + "โ โ ไธ้็ๅบ่บซใง่ฟ็ฟๅฐๆน็นๆใฎๆธฉใใไบบๆ ใจใๅนด้ท่ ใธใฎๆฌๆใ้ใใใไพกๅค่ฆณใๆใกใ็ โฆ โ\n", + "โ โ / ไธญๅถ โ\n", + "โ โ ไปๅญใฏ่ฟ้ใฎๆญดๅฒ็ๅฏบ้ขใๅญฃ็ฏใฎ่พฒ็ฃ็ฉ็ดๅฃฒๆใธใฎๆฅๅธฐใ่จชๅใ่จ็ปใใๅ่ปไบ็ดใจๅฎฟๆณ โฆ โ\n", + "โ โ / ไธญๅถ โ\n", + "โ โ ไปๅญใฏๅญฃ็ฏใฎๆ น่ใจๆตท่ปใไฝฟ็จใใไฝๅกฉๅ้ฃใๅฅฝใฟใๆน่ถใจ็ ่ถใฎๆฝๅบๆ้ใๅพฎ่ชฟๆดใใช โฆ โ\n", + "โ โ / ไธญๅถ โ\n", + "โ โ ไปๅญใฏ็ต็น็ใชใชในใฏ็ฎก็ใจๅฅๅบทๅฟๅใฎ็ๆดป็ฟๆ ฃใ็ตฑๅใใใชใผใใณใใคใณใใจ่จ็ปๆงใง โฆ โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ _core_text โ ไฟ้บๆฅญ ไธญๅฐ / โ\n", + "โ โ 
ๆฏๆใฎๆฃๆญฉใจ่ปฝใไฝๆใงๅฅๅบท็ถญๆใซๅชใใๅฐๅใฎๅฅๅบท่ฌๅบงใง่ฌๅธซใๅใใใปใใ่ถ้ใๆธ โฆ โ\n", + "โ โ / ้ขๅฅ (ๅญไพใใ) / ไธญๅญฆๅ / ไธ้็ / ่ฟ็ฟๅฐๆน / ่ฅฟๆฅๆฌ โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ _core_len โ 116 โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ _attr_key โ ไธ้็|่ฟ็ฟๅฐๆน|่ฅฟๆฅๆฌ|ไฟ้บๆฅญ ไธญๅฐ|ไธญๅญฆๅ|้ขๅฅ (ๅญไพใใ) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ score_finance โ 1 โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ score_safety โ 0 โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ score_vocab โ 1 โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ score_public โ 0 โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ score_tools โ 0 โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ score_life โ 1 โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ score_geo โ 0 โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ score_culture โ 1 โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ _geo_text โ ไฟ้บๆฅญ ไธญๅฐ / โ\n", + "โ โ ๆฏๆใฎๆฃๆญฉใจ่ปฝใไฝๆใงๅฅๅบท็ถญๆใซๅชใใๅฐๅใฎๅฅๅบท่ฌๅบงใง่ฌๅธซใๅใใใปใใ่ถ้ใๆธ โฆ โ\n", + "โ โ / ้ขๅฅ 
(ๅญไพใใ) / ไธญๅญฆๅ / ไธ้็ / ่ฟ็ฟๅฐๆน / ่ฅฟๆฅๆฌ / ไธญๅถ โ\n", + "โ โ ไปๅญใฏ่ฟ้ใฎๆญดๅฒ็ๅฏบ้ขใๅญฃ็ฏใฎ่พฒ็ฃ็ฉ็ดๅฃฒๆใธใฎๆฅๅธฐใ่จชๅใ่จ็ปใใๅ่ปไบ็ดใจๅฎฟๆณ โฆ โ\n", + "โ โ / ่ฅฟๆฅๆฌ โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ _tools_text โ ไฟ้บๆฅญ ไธญๅฐ / โ\n", + "โ โ ๆฏๆใฎๆฃๆญฉใจ่ปฝใไฝๆใงๅฅๅบท็ถญๆใซๅชใใๅฐๅใฎๅฅๅบท่ฌๅบงใง่ฌๅธซใๅใใใปใใ่ถ้ใๆธ โฆ โ\n", + "โ โ / ้ขๅฅ (ๅญไพใใ) / ไธญๅญฆๅ / ไธ้็ / ่ฟ็ฟๅฐๆน / ่ฅฟๆฅๆฌ / โ\n", + "โ โ ๆฏๆใฎๆฃๆญฉใจ่ปฝใไฝๆใงๅฅๅบท็ถญๆใซๅชใใๅฐๅใฎๅฅๅบท่ฌๅบงใง่ฌๅธซใๅใใใปใใ่ถ้ใๆธ โฆ โ\n", + "โ โ / ['ไฟ้บๅฅ็ด็ฎก็', 'ใชในใฏ่ฉไพก', '้กงๅฎขๅฏพๅฟ', 'ๆณ่ฆๅถ้ตๅฎ', 'Excelๆฅญๅ'] โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ _kw_hits โ 4 โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ jc_category โ finance โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ max_score_any โ 1.0 โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ _public_bonus โ โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ _religion_pen โ โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ jc_theme โ C_ๆฏๆใใปใ้ โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ topic_category โ ๅ ฌๅ ฑใฎๅ ด โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n", + " \n", + " \n", + "\u001b[3m Generated Columns \u001b[0m\n", + 
"โโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n", + "โ\u001b[1m \u001b[0m\u001b[1mName \u001b[0m\u001b[1m \u001b[0mโ\u001b[1m \u001b[0m\u001b[1mValue \u001b[0m\u001b[1m \u001b[0mโ\n", + "โกโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฉ\n", + "โ clarity_score โ ๆ็ขบ โ\n", + "โโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ difficulty โ ๆฎ้ โ\n", + "โโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ jcqa_data โ \u001b[1m{\u001b[0m โ\n", + "โ โ \u001b[32m'answer_index'\u001b[0m: \u001b[1;36m0\u001b[0m, โ\n", + "โ โ \u001b[32m'choice0'\u001b[0m: \u001b[32m'็พ้๏ผ็ดๅนฃใป็กฌ่ฒจ๏ผ'\u001b[0m, โ\n", + "โ โ \u001b[32m'choice1'\u001b[0m: \u001b[32m'ใฏใฌใธใใใซใผใ'\u001b[0m, โ\n", + "โ โ \u001b[32m'choice2'\u001b[0m: \u001b[32m'้ปๅญใใใผ๏ผSuicaใปICOCA ใชใฉ๏ผ'\u001b[0m, โ\n", + "โ โ \u001b[32m'choice3'\u001b[0m: \u001b[32m'ๅฐๅๆ'\u001b[0m, โ\n", + "โ โ \u001b[32m'choice4'\u001b[0m: \u001b[32m'ในใใผใใใฉใณๆฑบๆธ๏ผPayPay ใชใฉ๏ผ'\u001b[0m, โ\n", + "โ โ \u001b[32m'question'\u001b[0m: โ\n", + "โ โ \u001b[32m'ไธ้็ใฎๅ ฌๅ ฑๅณๆธ้คจใงๆฌใฎ่ฟๅดๆ้ใ้ใใ้ใฎๅปถๆป้ใๆฏๆใใจใใๅไปใงไธ่ฌ็ใซๅใไปใใฆใใ โฆ\u001b[0m โ\n", + "โ โ \u001b[32m'reasoning'\u001b[0m: โ\n", + "โ โ \u001b[32m'ๆฅๆฌใฎๅคใใฎๅ ฌๅ ฑๅณๆธ้คจใงใฏใๅปถๆป้ใฎๆฏๆใใฏ็ชๅฃใงใฎ็พ้ใฎใฟใๆจๆบ็ใซๅใไปใใใใฆใใพใใ โฆ\u001b[0m โ\n", + "โ โ \u001b[1m}\u001b[0m โ\n", + "โโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n", + " \n", + " \n", + "\u001b[3m LLM-as-a-Judge: quality_metrics \u001b[0m\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n", + "โ\u001b[1m \u001b[0m\u001b[1mdifficulty \u001b[0m\u001b[1m \u001b[0mโ\u001b[1m 
\u001b[0m\u001b[1mquestion_clarity \u001b[0m\u001b[1m \u001b[0mโ\n", + "โกโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฉ\n", + "โ score: ๆฎ้ โ score: ๆ็ขบ โ\n", + "โ reasoning: โ reasoning: โ\n", + "โ ใใฎๅ้กใฏๆฅๆฌใฎๅ ฌๅ ฑๅณๆธ้คจใฎไธ่ฌ็ใช้ๅถๅฎๅใซ้ขใใ โฆ โ ่ณชๅใฏใไธ้็ใฎๅ ฌๅ ฑๅณๆธ้คจใงๆฌใฎ่ฟๅดๆ้ใ้ใใ้ใฎ โฆ โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n", + " \n", + " [index: 0] \n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Seedใใ็ใฎใใฌใใฅใผ\n", + "print(\"\\n\" + \"=\"*0)\n", + "print(\"Seedใใผใฟใใ็ใฎใใฌใใฅใผใ็ๆไธญ...\")\n", + "print(\"=\"*10)\n", + "\n", + "preview_with_seed_jcommonsenseqa = data_designer_client.preview(\n", + " config_builder_with_seed_jcommonsenseqa,\n", + " num_records=1,\n", + ")\n", + "\n", + "print(\"\\nใใฌใใฅใผ็ๆๅฎไบ!\")\n", + "preview_with_seed_jcommonsenseqa.display_sample_record()" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "id": "preview_analysis", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "ใใฌใใฅใผใใผใฟใฎๅๆ:\n" + ] + }, + { + "data": { + "text/html": [ + "
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ ๐จ Data Designer Dataset Profile โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n", + "\n", + " \n", + " Dataset Overview \n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n", + "โ number of records โ number of columns โ percent complete records โ\n", + "โกโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฉ\n", + "โ 1 โ 48 โ 100.0% โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n", + " \n", + " \n", + " ๐ฑ Seed-Dataset Columns \n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n", + "โ column name โ data type โ number unique values โ\n", + "โกโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฉ\n", + "โ uuid โ string โ 1 (100.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ professional_persona โ string โ 1 (100.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ sports_persona โ string โ 1 (100.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ arts_persona โ string โ 1 (100.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ travel_persona โ string โ 1 (100.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ culinary_persona โ string โ 1 (100.0%) โ\n", + 
"โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ persona โ string โ 1 (100.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ cultural_background โ string โ 1 (100.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ skills_and_expertise โ string โ 1 (100.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ skills_and_expertise_list โ string โ 1 (100.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ hobbies_and_interests โ string โ 1 (100.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ hobbies_and_interests_list โ string โ 1 (100.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ career_goals_and_ambitions โ string โ 1 (100.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ sex โ string โ 1 (100.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ age โ int โ 1 (100.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ marital_status โ string โ 1 (100.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ education_level โ string โ 1 (100.0%) โ\n", + 
"โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ occupation โ string โ 1 (100.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ region โ string โ 1 (100.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ area โ string โ 1 (100.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ prefecture โ string โ 1 (100.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ country โ string โ 1 (100.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ age_band โ None โ 0 (0.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ _all_text โ string โ 1 (100.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ _core_text โ string โ 1 (100.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ _core_len โ int โ 1 (100.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ _attr_key โ string โ 1 (100.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ score_finance โ int โ 1 (100.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ score_safety โ int โ 1 
(100.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ score_vocab โ int โ 1 (100.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ score_public โ int โ 1 (100.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ score_tools โ int โ 1 (100.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ score_life โ int โ 1 (100.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ score_geo โ int โ 1 (100.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ score_culture โ int โ 1 (100.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ _geo_text โ string โ 1 (100.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ _tools_text โ string โ 1 (100.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ _kw_hits โ int โ 1 (100.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ jc_category โ string โ 1 (100.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ max_score_any โ float โ 1 (100.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + 
"โ _public_bonus โ None โ 0 (0.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ _religion_pen โ None โ 0 (0.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ jc_theme โ string โ 1 (100.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ topic_category โ string โ 1 (100.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n", + " \n", + " \n", + " ๐๏ธ LLM-Structured Columns \n", + "โโโโโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโโโโโโโโโโ\n", + "โ โ โ โ prompt tokens โ completion tokens โ\n", + "โ column name โ data type โ number unique values โ per record โ per record โ\n", + "โกโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฉ\n", + "โ jcqa_data โ dict โ 1 (100.0%) โ 1319.0 +/- 0.0 โ 396.0 +/- nan โ\n", + "โโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโโโ\n", + " \n", + " \n", + " โ๏ธ LLM-Judge Columns \n", + "โโโโโโโโโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโโโโโโโโโ\n", + "โ โ โ โ prompt tokens โ completion tokens โ\n", + "โ column name โ data type โ number unique values โ per record โ per record โ\n", + "โกโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฉ\n", + "โ quality_metrics โ dict โ 1 (100.0%) โ 1746.0 +/- 0.0 โ 272.0 +/- nan โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโโ\n", + " \n", + " \n", + " ๐งฉ Expression 
Columns \n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n", + "โ column name โ data type โ number unique values โ\n", + "โกโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฉ\n", + "โ clarity_score โ string โ 1 (100.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ difficulty โ string โ 1 (100.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n", + " \n", + " \n", + "โญโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ Table Notes โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฎ\n", + "โ โ\n", + "โ 1. All token statistics are based on a sample of max(1000, len(dataset)) records. โ\n", + "โ 2. Tokens are calculated using tiktoken's cl100k_base tokenizer. โ\n", + "โ โ\n", + "โฐโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฏ\n", + " \n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n", + "\n" + ], + "text/plain": [ + "\u001b[1;38;2;118;185;0mโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ \u001b[0m๐จ Data Designer Dataset Profile\u001b[1;38;2;118;185;0m โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\u001b[0m\n", + "\n", + " \n", + "\u001b[1;38;2;118;185;0m Dataset Overview \u001b[0m\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n", + "โ\u001b[1;2m \u001b[0m\u001b[1;2mnumber of records \u001b[0m\u001b[1;2m \u001b[0mโ\u001b[1;2m \u001b[0m\u001b[1;2mnumber of columns \u001b[0m\u001b[1;2m \u001b[0mโ\u001b[1;2m \u001b[0m\u001b[1;2mpercent complete records \u001b[0m\u001b[1;2m \u001b[0mโ\n", + 
"โกโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฉ\n", + "โ 1 โ 48 โ 100.0% โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n", + " \n", + " \n", + "\u001b[1;38;2;118;185;0m ๐ฑ Seed-Dataset Columns \u001b[0m\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n", + "โ\u001b[1;2m \u001b[0m\u001b[1;2mcolumn name \u001b[0m\u001b[1;2m \u001b[0mโ\u001b[1;2m \u001b[0m\u001b[1;2m data type\u001b[0m\u001b[1;2m \u001b[0mโ\u001b[1;2m \u001b[0m\u001b[1;2m number unique values\u001b[0m\u001b[1;2m \u001b[0mโ\n", + "โกโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฉ\n", + "โ uuid โ string โ 1 (100.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ professional_persona โ string โ 1 (100.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ sports_persona โ string โ 1 (100.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ arts_persona โ string โ 1 (100.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ travel_persona โ string โ 1 (100.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ culinary_persona โ string โ 1 (100.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ persona โ string โ 1 (100.0%) โ\n", + 
"โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ cultural_background โ string โ 1 (100.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ skills_and_expertise โ string โ 1 (100.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ skills_and_expertise_list โ string โ 1 (100.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ hobbies_and_interests โ string โ 1 (100.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ hobbies_and_interests_list โ string โ 1 (100.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ career_goals_and_ambitions โ string โ 1 (100.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ sex โ string โ 1 (100.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ age โ int โ 1 (100.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ marital_status โ string โ 1 (100.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ education_level โ string โ 1 (100.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ occupation โ string โ 1 (100.0%) โ\n", + 
"โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ region โ string โ 1 (100.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ area โ string โ 1 (100.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ prefecture โ string โ 1 (100.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ country โ string โ 1 (100.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ age_band โ \u001b[3;35mNone\u001b[0m โ 0 (0.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ _all_text โ string โ 1 (100.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ _core_text โ string โ 1 (100.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ _core_len โ int โ 1 (100.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ _attr_key โ string โ 1 (100.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ score_finance โ int โ 1 (100.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ score_safety โ int โ 1 (100.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ 
score_vocab โ int โ 1 (100.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ score_public โ int โ 1 (100.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ score_tools โ int โ 1 (100.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ score_life โ int โ 1 (100.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ score_geo โ int โ 1 (100.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ score_culture โ int โ 1 (100.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ _geo_text โ string โ 1 (100.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ _tools_text โ string โ 1 (100.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ _kw_hits โ int โ 1 (100.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ jc_category โ string โ 1 (100.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ max_score_any โ float โ 1 (100.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ _public_bonus โ \u001b[3;35mNone\u001b[0m โ 0 (0.0%) โ\n", + 
"โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ _religion_pen โ \u001b[3;35mNone\u001b[0m โ 0 (0.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ jc_theme โ string โ 1 (100.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ topic_category โ string โ 1 (100.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n", + " \n", + " \n", + "\u001b[1;38;2;118;185;0m ๐๏ธ LLM-Structured Columns \u001b[0m\n", + "โโโโโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโโโโโโโโโโ\n", + "โ\u001b[1;2m \u001b[0mโ\u001b[1;2m \u001b[0mโ\u001b[1;2m \u001b[0mโ\u001b[1;2m \u001b[0m\u001b[1;2m prompt tokens\u001b[0m\u001b[1;2m \u001b[0mโ\u001b[1;2m \u001b[0m\u001b[1;2m completion tokens\u001b[0m\u001b[1;2m \u001b[0mโ\n", + "โ\u001b[1;2m \u001b[0m\u001b[1;2mcolumn name \u001b[0m\u001b[1;2m \u001b[0mโ\u001b[1;2m \u001b[0m\u001b[1;2m data type\u001b[0m\u001b[1;2m \u001b[0mโ\u001b[1;2m \u001b[0m\u001b[1;2m number unique values\u001b[0m\u001b[1;2m \u001b[0mโ\u001b[1;2m \u001b[0m\u001b[1;2m per record\u001b[0m\u001b[1;2m \u001b[0mโ\u001b[1;2m \u001b[0m\u001b[1;2m per record\u001b[0m\u001b[1;2m \u001b[0mโ\n", + "โกโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฉ\n", + "โ jcqa_data โ dict โ 1 (100.0%) โ 1319.0 +/- 0.0 โ 396.0 +/- nan โ\n", + "โโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโโโ\n", + " \n", + " \n", + "\u001b[1;38;2;118;185;0m โ๏ธ LLM-Judge Columns \u001b[0m\n", + 
"โโโโโโโโโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโโโโโโโโโ\n", + "โ\u001b[1;2m \u001b[0mโ\u001b[1;2m \u001b[0mโ\u001b[1;2m \u001b[0mโ\u001b[1;2m \u001b[0m\u001b[1;2m prompt tokens\u001b[0m\u001b[1;2m \u001b[0mโ\u001b[1;2m \u001b[0m\u001b[1;2m completion tokens\u001b[0m\u001b[1;2m \u001b[0mโ\n", + "โ\u001b[1;2m \u001b[0m\u001b[1;2mcolumn name \u001b[0m\u001b[1;2m \u001b[0mโ\u001b[1;2m \u001b[0m\u001b[1;2m data type\u001b[0m\u001b[1;2m \u001b[0mโ\u001b[1;2m \u001b[0m\u001b[1;2m number unique values\u001b[0m\u001b[1;2m \u001b[0mโ\u001b[1;2m \u001b[0m\u001b[1;2m per record\u001b[0m\u001b[1;2m \u001b[0mโ\u001b[1;2m \u001b[0m\u001b[1;2m per record\u001b[0m\u001b[1;2m \u001b[0mโ\n", + "โกโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฉ\n", + "โ quality_metrics โ dict โ 1 (100.0%) โ 1746.0 +/- 0.0 โ 272.0 +/- nan โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโโ\n", + " \n", + " \n", + "\u001b[1;38;2;118;185;0m ๐งฉ Expression Columns \u001b[0m\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n", + "โ\u001b[1;2m \u001b[0m\u001b[1;2mcolumn name \u001b[0m\u001b[1;2m \u001b[0mโ\u001b[1;2m \u001b[0m\u001b[1;2m data type\u001b[0m\u001b[1;2m \u001b[0mโ\u001b[1;2m \u001b[0m\u001b[1;2m number unique values\u001b[0m\u001b[1;2m \u001b[0mโ\n", + "โกโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฉ\n", + "โ clarity_score โ string โ 1 (100.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ difficulty โ string โ 1 (100.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n", + " 
\n", + " \n", + "\u001b[2mโญโ\u001b[0m\u001b[2mโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\u001b[0m\u001b[2m Table Notes \u001b[0m\u001b[2mโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\u001b[0m\u001b[2mโโฎ\u001b[0m\n", + "\u001b[2mโ\u001b[0m \u001b[2mโ\u001b[0m\n", + "\u001b[2mโ\u001b[0m 1. All token statistics are based on a sample of max(1000, len(dataset)) records. \u001b[2mโ\u001b[0m\n", + "\u001b[2mโ\u001b[0m 2. Tokens are calculated using tiktoken's cl100k_base tokenizer. \u001b[2mโ\u001b[0m\n", + "\u001b[2mโ\u001b[0m \u001b[2mโ\u001b[0m\n", + "\u001b[2mโฐโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฏ\u001b[0m\n", + " \n", + "\u001b[1;38;2;118;185;0mโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# ใใฌใใฅใผใฎๅๆใ่กจ็คบ\n", + "print(\"\\nใใฌใใฅใผใใผใฟใฎๅๆ:\")\n", + "preview_with_seed_jcommonsenseqa.analysis.to_report()" + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "id": "preview_dataset", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "ใใฌใใฅใผใใผใฟใฎๆๅใฎๆฐไปถ:\n", + " uuid \\\n", + "0 749db6e7c2e245b2ae3b46aa12c4f1e0 \n", + "\n", + " professional_persona \\\n", + "0 ไธญๅถ ไปๅญใฏไฟ้บๅฅ็ดใฎใชในใฏ่ฉไพกใจ้กงๅฎขใใผใบใฎไฝ็ณป็ๅๆใซ้ทๅนดๅพไบใใ้่ทๅพใใกใณใฟใชใณใฐใจ... \n", + "\n", + " sports_persona \\\n", + "0 ไธญๅถ ไปๅญใฏๅญฃ็ฏใซๅใใใใฆใฉใผใญใณใฐใจใณใใฅใใใฃใฎ่ปฝ้ๅใฏใฉในใงไฝๅ็ถญๆใๅณใใ็ซถไบ็ใช... \n", + "\n", + " arts_persona \\\n", + "0 ไธญๅถ ไปๅญใฏ่ถ้ใจๆธ้ใฎไผ็ตฑ็็จฝๅคใๅบ็คใซใใใธใฟใซๅขจ็ตตใใคใณใฟใฉใฏใใฃใ่ถๅฎคไฝ้จใจใใฃใ้... \n", + "\n", + " travel_persona \\\n", + "0 ไธญๅถ ไปๅญใฏ่ฟ้ใฎๆญดๅฒ็ๅฏบ้ขใๅญฃ็ฏใฎ่พฒ็ฃ็ฉ็ดๅฃฒๆใธใฎๆฅๅธฐใ่จชๅใ่จ็ปใใๅ่ปไบ็ดใจๅฎฟๆณๅ ใฎใญ... 
\n", + "\n", + " culinary_persona \\\n", + "0 ไธญๅถ ไปๅญใฏๅญฃ็ฏใฎๆ น่ใจๆตท่ปใไฝฟ็จใใไฝๅกฉๅ้ฃใๅฅฝใฟใๆน่ถใจ็ ่ถใฎๆฝๅบๆ้ใๅพฎ่ชฟๆดใใชใใใ... \n", + "\n", + " persona \\\n", + "0 ไธญๅถ ไปๅญใฏ็ต็น็ใชใชในใฏ็ฎก็ใจๅฅๅบทๅฟๅใฎ็ๆดป็ฟๆ ฃใ็ตฑๅใใใชใผใใณใใคใณใใจ่จ็ปๆงใง้ซ้ฝข่ ... \n", + "\n", + " cultural_background \\\n", + "0 ไธ้็ๅบ่บซใง่ฟ็ฟๅฐๆน็นๆใฎๆธฉใใไบบๆ ใจใๅนด้ท่ ใธใฎๆฌๆใ้ใใใไพกๅค่ฆณใๆใกใ็้ข็ฎใใจๅฅๅบท... \n", + "\n", + " skills_and_expertise \\\n", + "0 ไฟ้บๅฅ็ดใฎ็ฎก็ใจๆดๆฐใใชในใฏ่ฉไพกใป้กงๅฎขใฎใใผใบๆๆกใๆณ่ฆๅถใฎ้ตๅฎใซๅ ใใฆใExcelใWor... \n", + "\n", + " skills_and_expertise_list ... _public_bonus \\\n", + "0 ['ไฟ้บๅฅ็ด็ฎก็', 'ใชในใฏ่ฉไพก', '้กงๅฎขๅฏพๅฟ', 'ๆณ่ฆๅถ้ตๅฎ', 'Excelๆฅญๅ'] ... None \n", + "\n", + " _religion_pen jc_theme topic_category \\\n", + "0 None C_ๆฏๆใใปใ้ ๅ ฌๅ ฑใฎๅ ด \n", + "\n", + " jcqa_data \\\n", + "0 {'answer_index': 0, 'choice0': '็พ้๏ผ็ดๅนฃใป็กฌ่ฒจ๏ผ', 'c... \n", + "\n", + " jcqa_data__reasoning_trace \\\n", + "0 We need to output JSON with fields: question, ... \n", + "\n", + " quality_metrics \\\n", + "0 {'difficulty': {'reasoning': 'ใใฎๅ้กใฏๆฅๆฌใฎๅ ฌๅ ฑๅณๆธ้คจใฎไธ่ฌ... \n", + "\n", + " quality_metrics__reasoning_trace clarity_score difficulty \n", + "0 We need to evaluate the generated data's quali... 
ๆ็ขบ ๆฎ้ \n", + "\n", + "[1 rows x 50 columns]\n" + ] + } + ], + "source": [ + "# ใใฌใใฅใผใใผใฟใDataFrameใจใใฆ็ขบ่ช\n", + "preview_df = preview_with_seed_jcommonsenseqa.dataset\n", + "print(\"\\nใใฌใใฅใผใใผใฟใฎๆๅใฎๆฐไปถ:\")\n", + "print(preview_df.head())" + ] + }, + { + "cell_type": "markdown", + "id": "full_generation", + "metadata": {}, + "source": [ + "## ๐ Generate Production Data\n", + "\n", + "If no issues in preview, generate a large-scale dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "724f1033-05ed-4bbb-bb94-594657c08893", + "metadata": {}, + "outputs": [], + "source": [ + "# Seedใใ็ใฎๆฌ็ช็ๆ\n", + "NUM_RECORDS = 8000 # ๅฟ ่ฆใซๅฟใใฆ่ชฟๆด\n", + "\n", + "print(\"\\n\" + \"=\"*80)\n", + "print(f\"Seedใใผใฟใใ็ {NUM_RECORDS}ไปถใฎใใผใฟใ็ๆไธญ...\")\n", + "print(\"=\"*80)\n", + "\n", + "job_with_seed = data_designer_client.create(\n", + " config_builder_with_seed_jcommonsenseqa,\n", + " num_records=NUM_RECORDS,\n", + ")\n", + "\n", + "print(\"ใธใงใใๅฎ่กไธญ... ๅฎไบใๅพ ๆฉใใฆใใพใ\")\n", + "\n", + "# ใธใงใๅฎไบใๅพ ๆฉ\n", + "results_with_seed = job_with_seed.wait_until_done()\n", + "print(\"\\nSeedใใ็ใฎ็ๆๅฎไบ!\")" + ] + }, + { + "cell_type": "markdown", + "id": "analysis", + "metadata": {}, + "source": [ + "## ๐ Analyze Results" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "analysis_with_seed", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "================================================================================\n", + "Seedใใผใฟใใ็ใฎๅๆ\n", + "================================================================================\n" + ] + }, + { + "data": { + "text/html": [ + "
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ ๐จ Data Designer Dataset Profile โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n", + "\n", + " \n", + " Dataset Overview \n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n", + "โ number of records โ number of columns โ percent complete records โ\n", + "โกโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฉ\n", + "โ 7,992 โ 48 โ 99.9% โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n", + " \n", + " \n", + " ๐ฑ Seed-Dataset Columns \n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n", + "โ column name โ data type โ number unique values โ\n", + "โกโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฉ\n", + "โ uuid โ string โ 2000 (25.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ professional_persona โ string โ 2000 (25.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ sports_persona โ string โ 2000 (25.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ arts_persona โ string โ 2000 (25.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ travel_persona โ string โ 2000 (25.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ culinary_persona โ string โ 2000 (25.0%) โ\n", + 
"โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ persona โ string โ 2000 (25.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ cultural_background โ string โ 2000 (25.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ skills_and_expertise โ string โ 2000 (25.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ skills_and_expertise_list โ string โ 2000 (25.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ hobbies_and_interests โ string โ 2000 (25.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ hobbies_and_interests_list โ string โ 2000 (25.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ career_goals_and_ambitions โ string โ 2000 (25.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ sex โ string โ 2 (0.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ age โ int โ 82 (1.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ marital_status โ string โ 8 (0.1%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ education_level โ string โ 10 (0.1%) โ\n", + 
"โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ occupation โ string โ 615 (7.7%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ region โ string โ 8 (0.1%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ area โ string โ 2 (0.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ prefecture โ string โ 47 (0.6%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ country โ string โ 1 (0.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ age_band โ None โ 0 (0.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ _all_text โ string โ 2000 (25.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ _core_text โ string โ 2000 (25.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ _core_len โ int โ 130 (1.6%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ _attr_key โ string โ 2000 (25.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ score_finance โ int โ 4 (0.1%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ score_safety โ int โ 3 
(0.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ score_vocab โ int โ 3 (0.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ score_public โ int โ 3 (0.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ score_tools โ int โ 2 (0.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ score_life โ int โ 3 (0.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ score_geo โ int โ 7 (0.1%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ score_culture โ int โ 3 (0.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ _geo_text โ string โ 2000 (25.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ _tools_text โ string โ 2000 (25.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ _kw_hits โ int โ 7 (0.1%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ jc_category โ string โ 8 (0.1%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ max_score_any โ float โ 6 (0.1%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ _public_bonus 
โ None โ 0 (0.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ _religion_pen โ None โ 0 (0.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ jc_theme โ string โ 6 (0.1%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ topic_category โ string โ 7 (0.1%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n", + " \n", + " \n", + " ๐๏ธ LLM-Structured Columns \n", + "โโโโโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโโโโโโโโโโ\n", + "โ โ โ โ prompt tokens โ completion tokens โ\n", + "โ column name โ data type โ number unique values โ per record โ per record โ\n", + "โกโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฉ\n", + "โ jcqa_data โ dict โ 7992 (100.0%) โ 1322.0 +/- 7.7 โ 318.0 +/- 64.9 โ\n", + "โโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโโโ\n", + " \n", + " \n", + " โ๏ธ LLM-Judge Columns \n", + "โโโโโโโโโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโโโโโโโโโ\n", + "โ โ โ โ prompt tokens โ completion tokens โ\n", + "โ column name โ data type โ number unique values โ per record โ per record โ\n", + "โกโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฉ\n", + "โ quality_metrics โ dict โ 7992 (100.0%) โ 1667.0 +/- 66.5 โ 281.0 +/- 42.2 โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโโ\n", + " \n", + " \n", + " ๐งฉ Expression Columns \n", 
+ "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n", + "โ column name โ data type โ number unique values โ\n", + "โกโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฉ\n", + "โ clarity_score โ string โ 2 (0.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ difficulty โ string โ 3 (0.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n", + " \n", + " \n", + "โญโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ Table Notes โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฎ\n", + "โ โ\n", + "โ 1. All token statistics are based on a sample of max(1000, len(dataset)) records. โ\n", + "โ 2. Tokens are calculated using tiktoken's cl100k_base tokenizer. โ\n", + "โ โ\n", + "โฐโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฏ\n", + " \n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n", + "\n" + ], + "text/plain": [ + "\u001b[1;38;2;118;185;0mโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ \u001b[0m๐จ Data Designer Dataset Profile\u001b[1;38;2;118;185;0m โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\u001b[0m\n", + "\n", + " \n", + "\u001b[1;38;2;118;185;0m Dataset Overview \u001b[0m\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n", + "โ\u001b[1;2m \u001b[0m\u001b[1;2mnumber of records \u001b[0m\u001b[1;2m \u001b[0mโ\u001b[1;2m \u001b[0m\u001b[1;2mnumber of columns \u001b[0m\u001b[1;2m \u001b[0mโ\u001b[1;2m \u001b[0m\u001b[1;2mpercent complete records \u001b[0m\u001b[1;2m \u001b[0mโ\n", + 
"โกโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฉ\n", + "โ 7,992 โ 48 โ 99.9% โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n", + " \n", + " \n", + "\u001b[1;38;2;118;185;0m ๐ฑ Seed-Dataset Columns \u001b[0m\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n", + "โ\u001b[1;2m \u001b[0m\u001b[1;2mcolumn name \u001b[0m\u001b[1;2m \u001b[0mโ\u001b[1;2m \u001b[0m\u001b[1;2m data type\u001b[0m\u001b[1;2m \u001b[0mโ\u001b[1;2m \u001b[0m\u001b[1;2m number unique values\u001b[0m\u001b[1;2m \u001b[0mโ\n", + "โกโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฉ\n", + "โ uuid โ string โ 2000 (25.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ professional_persona โ string โ 2000 (25.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ sports_persona โ string โ 2000 (25.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ arts_persona โ string โ 2000 (25.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ travel_persona โ string โ 2000 (25.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ culinary_persona โ string โ 2000 (25.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ persona โ string โ 2000 (25.0%) โ\n", + 
"โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ cultural_background โ string โ 2000 (25.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ skills_and_expertise โ string โ 2000 (25.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ skills_and_expertise_list โ string โ 2000 (25.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ hobbies_and_interests โ string โ 2000 (25.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ hobbies_and_interests_list โ string โ 2000 (25.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ career_goals_and_ambitions โ string โ 2000 (25.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ sex โ string โ 2 (0.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ age โ int โ 82 (1.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ marital_status โ string โ 8 (0.1%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ education_level โ string โ 10 (0.1%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ occupation โ string โ 615 (7.7%) โ\n", + 
"โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ region โ string โ 8 (0.1%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ area โ string โ 2 (0.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ prefecture โ string โ 47 (0.6%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ country โ string โ 1 (0.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ age_band โ \u001b[3;35mNone\u001b[0m โ 0 (0.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ _all_text โ string โ 2000 (25.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ _core_text โ string โ 2000 (25.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ _core_len โ int โ 130 (1.6%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ _attr_key โ string โ 2000 (25.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ score_finance โ int โ 4 (0.1%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ score_safety โ int โ 3 (0.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ 
score_vocab โ int โ 3 (0.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ score_public โ int โ 3 (0.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ score_tools โ int โ 2 (0.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ score_life โ int โ 3 (0.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ score_geo โ int โ 7 (0.1%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ score_culture โ int โ 3 (0.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ _geo_text โ string โ 2000 (25.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ _tools_text โ string โ 2000 (25.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ _kw_hits โ int โ 7 (0.1%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ jc_category โ string โ 8 (0.1%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ max_score_any โ float โ 6 (0.1%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ _public_bonus โ \u001b[3;35mNone\u001b[0m โ 0 (0.0%) โ\n", + 
"โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ _religion_pen โ \u001b[3;35mNone\u001b[0m โ 0 (0.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ jc_theme โ string โ 6 (0.1%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ topic_category โ string โ 7 (0.1%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n", + " \n", + " \n", + "\u001b[1;38;2;118;185;0m ๐๏ธ LLM-Structured Columns \u001b[0m\n", + "โโโโโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโโโโโโโโโโ\n", + "โ\u001b[1;2m \u001b[0mโ\u001b[1;2m \u001b[0mโ\u001b[1;2m \u001b[0mโ\u001b[1;2m \u001b[0m\u001b[1;2m prompt tokens\u001b[0m\u001b[1;2m \u001b[0mโ\u001b[1;2m \u001b[0m\u001b[1;2m completion tokens\u001b[0m\u001b[1;2m \u001b[0mโ\n", + "โ\u001b[1;2m \u001b[0m\u001b[1;2mcolumn name \u001b[0m\u001b[1;2m \u001b[0mโ\u001b[1;2m \u001b[0m\u001b[1;2m data type\u001b[0m\u001b[1;2m \u001b[0mโ\u001b[1;2m \u001b[0m\u001b[1;2m number unique values\u001b[0m\u001b[1;2m \u001b[0mโ\u001b[1;2m \u001b[0m\u001b[1;2m per record\u001b[0m\u001b[1;2m \u001b[0mโ\u001b[1;2m \u001b[0m\u001b[1;2m per record\u001b[0m\u001b[1;2m \u001b[0mโ\n", + "โกโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฉ\n", + "โ jcqa_data โ dict โ 7992 (100.0%) โ 1322.0 +/- 7.7 โ 318.0 +/- 64.9 โ\n", + "โโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโโโ\n", + " \n", + " \n", + "\u001b[1;38;2;118;185;0m โ๏ธ LLM-Judge Columns \u001b[0m\n", + 
"โโโโโโโโโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโโโโโโโโโ\n", + "โ\u001b[1;2m \u001b[0mโ\u001b[1;2m \u001b[0mโ\u001b[1;2m \u001b[0mโ\u001b[1;2m \u001b[0m\u001b[1;2m prompt tokens\u001b[0m\u001b[1;2m \u001b[0mโ\u001b[1;2m \u001b[0m\u001b[1;2m completion tokens\u001b[0m\u001b[1;2m \u001b[0mโ\n", + "โ\u001b[1;2m \u001b[0m\u001b[1;2mcolumn name \u001b[0m\u001b[1;2m \u001b[0mโ\u001b[1;2m \u001b[0m\u001b[1;2m data type\u001b[0m\u001b[1;2m \u001b[0mโ\u001b[1;2m \u001b[0m\u001b[1;2m number unique values\u001b[0m\u001b[1;2m \u001b[0mโ\u001b[1;2m \u001b[0m\u001b[1;2m per record\u001b[0m\u001b[1;2m \u001b[0mโ\u001b[1;2m \u001b[0m\u001b[1;2m per record\u001b[0m\u001b[1;2m \u001b[0mโ\n", + "โกโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฉ\n", + "โ quality_metrics โ dict โ 7992 (100.0%) โ 1667.0 +/- 66.5 โ 281.0 +/- 42.2 โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโโ\n", + " \n", + " \n", + "\u001b[1;38;2;118;185;0m ๐งฉ Expression Columns \u001b[0m\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n", + "โ\u001b[1;2m \u001b[0m\u001b[1;2mcolumn name \u001b[0m\u001b[1;2m \u001b[0mโ\u001b[1;2m \u001b[0m\u001b[1;2m data type\u001b[0m\u001b[1;2m \u001b[0mโ\u001b[1;2m \u001b[0m\u001b[1;2m number unique values\u001b[0m\u001b[1;2m \u001b[0mโ\n", + "โกโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฉ\n", + "โ clarity_score โ string โ 2 (0.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", + "โ difficulty โ string โ 3 (0.0%) โ\n", + "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n", + " 
\n", + " \n", + "\u001b[2mโญโ\u001b[0m\u001b[2mโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\u001b[0m\u001b[2m Table Notes \u001b[0m\u001b[2mโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\u001b[0m\u001b[2mโโฎ\u001b[0m\n", + "\u001b[2mโ\u001b[0m \u001b[2mโ\u001b[0m\n", + "\u001b[2mโ\u001b[0m 1. All token statistics are based on a sample of max(1000, len(dataset)) records. \u001b[2mโ\u001b[0m\n", + "\u001b[2mโ\u001b[0m 2. Tokens are calculated using tiktoken's cl100k_base tokenizer. \u001b[2mโ\u001b[0m\n", + "\u001b[2mโ\u001b[0m \u001b[2mโ\u001b[0m\n", + "\u001b[2mโฐโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฏ\u001b[0m\n", + " \n", + "\u001b[1;38;2;118;185;0mโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Seedใใ็ใฎๅๆ\n", + "print(\"\\n\" + \"=\"*80)\n", + "print(\"Seedใใผใฟใใ็ใฎๅๆ\")\n", + "print(\"=\"*80)\n", + "\n", + "analysis_with_seed = job_with_seed.load_analysis()\n", + "analysis_with_seed.to_report()" + ] + }, + { + "cell_type": "markdown", + "id": "comparison", + "metadata": {}, + "source": [ + "## ๐ Quality Comparison\n", + "\n", + "Compare quality with and without seed data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "269b2e97-4244-462b-824a-21a231eaac96", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Seedใใ็ใฎใใผใฟๆฐ: 7992\n", + "\n", + "Seedใใ็ใฎๅ่ณชในใณใขๅๅธ:\n", + "\n", + "clarity_score:\n", + "clarity_score\n", + "ๆ็ขบ 7974\n", + "ใใไธๆ็ขบ 18\n", + "Name: count, dtype: int64[pyarrow]\n", + "\n", + "difficulty:\n", + "difficulty\n", + "ๆใใ 6984\n", + "ๆฎ้ 1007\n", + "้ฃใใ 1\n", + "Name: count, dtype: int64[pyarrow]\n" + ] + } + ], + "source": [ + "# ใใผใฟใฎ่ชญใฟ่พผใฟ\n", + "df_with_seed = job_with_seed.load_dataset()\n", + "\n", + 
"print(f\"Seedใใ็ใฎใใผใฟๆฐ: {len(df_with_seed)}\")\n", + "\n", + "# ๅ่ณชในใณใขใฎ้่จ\n", + "def count_scores(df, name):\n", + " print(f\"\\n{name}ใฎๅ่ณชในใณใขๅๅธ:\")\n", + " \n", + " for metric in ['clarity_score', 'difficulty']:\n", + " if metric in df.columns:\n", + " counts = df[metric].value_counts()\n", + " print(f\"\\n{metric}:\")\n", + " print(counts)\n", + " \n", + " scores = {\n", + " 'clarity': df['clarity_score'].value_counts().to_dict() if 'clarity_score' in df.columns else {},\n", + " 'difficulty': df['difficulty'].value_counts().to_dict() if 'difficulty' in df.columns else {},\n", + " }\n", + " return scores\n", + "\n", + "scores_with_seed = count_scores(df_with_seed, \"Seedใใ็\")" + ] + }, + { + "cell_type": "markdown", + "id": "save_data", + "metadata": {}, + "source": [ + "## ๐พ Save jcommonsenseqa Data" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "4d1023ea-dd88-4301-85ed-40b60e481024", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[17:52:08] [INFO] ๐บ Downloading artifacts from Job with ID 'job-2qxnztmlrfrkdkjzuxv2kd'\n", + "[17:52:10] [INFO] โ Artifacts downloaded to jcommonsenseqa_data_output_filter_jcommonsenseqa_seed_adjust_2000_temperature_0_9_remake_metric_8000_blog_check_en/with_seed_data\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "ใใผใฟใ 'jcommonsenseqa_data_output_filter_jcommonsenseqa_seed_adjust_2000_temperature_0_9_remake_metric_8000_blog_check_en' ใใฃใฌใฏใใชใซไฟๅญใใพใใใ\n", + "\n", + "ไฟๅญใใใใใกใคใซ:\n", + " - with_seed_data.jsonl (103554.34 KB)\n" + ] + } + ], + "source": [ + "OUTPUT_DIR = \"jcommonsenseqa_8000_filter_jcommonsenseqa_seed_2000_temperature_0_9\"\n", + "import os\n", + "\n", + "os.makedirs(OUTPUT_DIR, exist_ok=True)\n", + "\n", + "# Seedใใ็ใฎไฟๅญ\n", + "job_with_seed.download_artifacts(\n", + " output_path=OUTPUT_DIR,\n", + " artifacts_folder_name=\"with_seed_data\",\n", + ")\n", + "\n", + "df_with_seed 
import os

# Destination directory for everything this cell writes.
OUTPUT_DIR = "jcommonsenseqa_8000_filter_jcommonsenseqa_seed_2000_temperature_0_9"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Download the job's artifacts into OUTPUT_DIR under the "with_seed_data" folder.
job_with_seed.download_artifacts(output_path=OUTPUT_DIR, artifacts_folder_name="with_seed_data")

# Reload the generated dataset and export it as JSON Lines
# (one record per line; force_ascii=False keeps non-ASCII text unescaped).
df_with_seed = job_with_seed.load_dataset()
df_with_seed.to_json(f"{OUTPUT_DIR}/with_seed_data.jsonl", orient='records', lines=True, force_ascii=False)

print(f"\nใใผใฟใ '{OUTPUT_DIR}' ใใฃใฌใฏใใชใซไฟๅญใใพใใใ")
print("\nไฟๅญใใใใใกใคใซ:")
# Report only regular files directly inside OUTPUT_DIR; sub-directories are skipped.
with os.scandir(OUTPUT_DIR) as entries:
    for entry in entries:
        if not entry.is_file():
            continue
        filename = entry.name
        size = entry.stat().st_size / 1024  # bytes -> KB
        print(f" - {filename} ({size:.2f} KB)")