{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "# Logging with a schema\n",
    "\n",
    "Create a ``rubicon_ml`` project"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<rubicon_ml.client.project.Project at 0x11c99e890>"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from rubicon_ml import Rubicon\n",
    "\n",
    "# `persistence=\"memory\"` keeps the logged data in memory for this example;\n",
    "# `auto_git_enabled=True` lets rubicon_ml record git info (see `branch_name`\n",
    "# and `commit_hash` in the experiment metadata shown later)\n",
    "rubicon = Rubicon(\n",
    "    persistence=\"memory\",\n",
    "    auto_git_enabled=True,\n",
    ")\n",
    "project = rubicon.create_project(name=\"apply schema\")\n",
    "project"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Train a ``RandomForestClassifier``\n",
    "\n",
    "Load a training dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "from sklearn.datasets import load_wine\n",
    "\n",
    "# `as_frame=True` returns pandas objects, so the feature (column) names are kept\n",
    "wine = load_wine(as_frame=True)\n",
    "X, y = wine.data, wine.target"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Train an instance of the model the schema represents"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "RandomForestClassifier(ccp_alpha=0.005, criterion='log_loss',\n",
      "                       max_features='log2', n_estimators=24, oob_score=True,\n",
      "                       random_state=121)\n"
     ]
    }
   ],
   "source": [
    "from sklearn.ensemble import RandomForestClassifier\n",
    "\n",
    "# `oob_score=True` makes the fitted model expose `oob_score_`, one of the\n",
    "# optional metrics listed in the schema shown later in this notebook\n",
    "rfc = RandomForestClassifier(\n",
    "    n_estimators=24,\n",
    "    criterion=\"log_loss\",\n",
    "    max_features=\"log2\",\n",
    "    ccp_alpha=0.005,\n",
    "    oob_score=True,\n",
    "    random_state=121,\n",
    ").fit(X, y)  # `fit` returns the fitted estimator itself\n",
    "\n",
    "print(rfc)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Infer schema and log model metadata\n",
    "\n",
    "Log the model metadata defined in the schema to a new experiment in ``project`` with ``project.log_with_schema``.\n",
    "\n",
    "**Note:** ``project.log_with_schema`` infers the schema to apply from the object it is given, so no schema needs to be set beforehand."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "inferred schema name: sklearn__RandomForestClassifier\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<rubicon_ml.client.experiment.Experiment at 0x16d392b10>"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# additional kwargs to be passed to `project.log_experiment`\n",
    "experiment_kwargs = {\n",
    "    \"name\": \"log with schema\",\n",
    "    \"model_name\": \"RandomForestClassifier\",\n",
    "    \"description\": \"logged with the `RandomForestClassifier` `rubicon_schema`\",\n",
    "}\n",
    "experiment = project.log_with_schema(rfc, experiment_kwargs=experiment_kwargs)\n",
    "\n",
    "# `project.schema_` holds the schema that was inferred for `rfc`\n",
    "print(f\"inferred schema name: {project.schema_['name']}\")\n",
    "experiment"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## View the experiment's logged metadata\n",
    "\n",
    "An experiment logged with a schema contains all the data the schema represents. More information on the data captured by\n",
    "a ``rubicon_schema`` can be found in the \"Representing model metadata with a ``rubicon_schema``\" section."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'project_name': 'apply schema',\n",
       " 'id': 'ec4c3ead-3337-4623-9a97-c61f48e8de3d',\n",
       " 'name': 'log with schema',\n",
       " 'description': 'logged with the `RandomForestClassifier` `rubicon_schema`',\n",
       " 'model_name': 'RandomForestClassifier',\n",
       " 'branch_name': 'schema',\n",
       " 'commit_hash': 'c9f696408a03c6a6fbf2fbff39fa48bbf722bae1',\n",
       " 'training_metadata': None,\n",
       " 'tags': [],\n",
       " 'created_at': datetime.datetime(2023, 9, 25, 15, 47, 37, 552091)}"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# NOTE: `_domain` is a private attribute; it is read here only to show the raw\n",
    "# fields stored for the experiment\n",
    "vars(experiment._domain)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The features and their importances are logged as defined in the schema's \"features\" section"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'names_attr': 'feature_names_in_',\n",
       "  'importances_attr': 'feature_importances_',\n",
       "  'optional': True}]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# schema entries naming the model attributes that hold feature names and importances\n",
    "project.schema_[\"features\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "alcohol (0.1276831830349219)\n",
      "malic_acid (0.03863837532736449)\n",
      "ash (0.006168227239831861)\n",
      "alcalinity_of_ash (0.025490751927615605)\n",
      "magnesium (0.02935763050777937)\n",
      "total_phenols (0.058427899304369986)\n",
      "flavanoids (0.15309812550131274)\n",
      "nonflavanoid_phenols (0.007414542189797497)\n",
      "proanthocyanins (0.012615187741781065)\n",
      "color_intensity (0.13608806341133572)\n",
      "hue (0.0892558912217226)\n",
      "od280/od315_of_diluted_wines (0.15604181694153108)\n",
      "proline (0.15972030565063608)\n"
     ]
    }
   ],
   "source": [
    "for logged_feature in experiment.features():\n",
    "    name, importance = logged_feature.name, logged_feature.importance\n",
    "    print(f\"{name} ({importance})\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Each parameter and its value are logged as defined in the schema's \"parameters\" section"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'name': 'bootstrap', 'value_attr': 'bootstrap'},\n",
       " {'name': 'ccp_alpha', 'value_attr': 'ccp_alpha'},\n",
       " {'name': 'class_weight', 'value_attr': 'class_weight'},\n",
       " {'name': 'criterion', 'value_attr': 'criterion'},\n",
       " {'name': 'max_depth', 'value_attr': 'max_depth'},\n",
       " {'name': 'max_features', 'value_attr': 'max_features'},\n",
       " {'name': 'min_impurity_decrease', 'value_attr': 'min_impurity_decrease'},\n",
       " {'name': 'max_leaf_nodes', 'value_attr': 'max_leaf_nodes'},\n",
       " {'name': 'max_samples', 'value_attr': 'max_samples'},\n",
       " {'name': 'min_samples_split', 'value_attr': 'min_samples_split'},\n",
       " {'name': 'min_samples_leaf', 'value_attr': 'min_samples_leaf'},\n",
       " {'name': 'min_weight_fraction_leaf',\n",
       "  'value_attr': 'min_weight_fraction_leaf'},\n",
       " {'name': 'n_estimators', 'value_attr': 'n_estimators'},\n",
       " {'name': 'oob_score', 'value_attr': 'oob_score'},\n",
       " {'name': 'random_state', 'value_attr': 'random_state'}]"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# each entry pairs a logged parameter `name` with a model attribute (`value_attr`)\n",
    "project.schema_[\"parameters\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "bootstrap: True\n",
      "ccp_alpha: 0.005\n",
      "class_weight: None\n",
      "criterion: log_loss\n",
      "max_depth: None\n",
      "max_features: log2\n",
      "min_impurity_decrease: 0.0\n",
      "max_leaf_nodes: None\n",
      "max_samples: None\n",
      "min_samples_split: 2\n",
      "min_samples_leaf: 1\n",
      "min_weight_fraction_leaf: 0.0\n",
      "n_estimators: 24\n",
      "oob_score: True\n",
      "random_state: 121\n"
     ]
    }
   ],
   "source": [
    "for logged_parameter in experiment.parameters():\n",
    "    name, value = logged_parameter.name, logged_parameter.value\n",
    "    print(f\"{name}: {value}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Each metric and its value are logged as defined in the schema's \"metrics\" section"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'name': 'classes', 'value_attr': 'classes_'},\n",
       " {'name': 'n_classes', 'value_attr': 'n_classes_'},\n",
       " {'name': 'n_features_in', 'value_attr': 'n_features_in_'},\n",
       " {'name': 'n_outputs', 'value_attr': 'n_outputs_'},\n",
       " {'name': 'oob_decision_function',\n",
       "  'value_attr': 'oob_decision_function_',\n",
       "  'optional': True},\n",
       " {'name': 'oob_score', 'value_attr': 'oob_score_', 'optional': True}]"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# each entry pairs a logged metric `name` with a model attribute (`value_attr`);\n",
    "# some entries are additionally marked `optional`\n",
    "project.schema_[\"metrics\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "classes: ...\n",
      "n_classes: 3\n",
      "n_features_in: 13\n",
      "n_outputs: 1\n",
      "oob_decision_function: ...\n",
      "oob_score: 0.9775280898876404\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "\n",
    "for metric in experiment.metrics():\n",
    "    # non-scalar metric values are long, so they are elided as \"...\"\n",
    "    shown = metric.value if np.isscalar(metric.value) else \"...\"\n",
    "    print(f\"{metric.name}: {shown}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A copy of the trained model is logged as defined in the schema's \"artifacts\" section"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['self']"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 'self' presumably refers to the logged object itself, i.e. the trained model;\n",
    "# see the artifact printed in the next cell\n",
    "project.schema_[\"artifacts\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "RandomForestClassifier:\n",
      "RandomForestClassifier(ccp_alpha=0.005, criterion='log_loss',\n",
      "                       max_features='log2', n_estimators=24, oob_score=True,\n",
      "                       random_state=121)\n"
     ]
    }
   ],
   "source": [
    "for logged_artifact in experiment.artifacts():\n",
    "    name = logged_artifact.name\n",
    "    model = logged_artifact.get_data(unpickle=True)  # unpickle the stored data\n",
    "    print(f\"{name}:\\n{model}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}