{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": { "tags": [] },
   "source": [
    "# Logging with a schema\n",
    "\n",
    "Create a ``rubicon_ml`` project"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": { "tags": [] },
   "outputs": [
    {
     "data": {
      "text/plain": [
       ""
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from rubicon_ml import Rubicon\n",
    "\n",
    "rubicon = Rubicon(persistence=\"memory\", auto_git_enabled=True)\n",
    "project = rubicon.create_project(name=\"apply schema\")\n",
    "project"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Train a ``RandomForestClassifier``\n",
    "\n",
    "Load a training dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": { "tags": [] },
   "outputs": [],
   "source": [
    "from sklearn.datasets import load_wine\n",
    "\n",
    "X, y = load_wine(return_X_y=True, as_frame=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Train an instance of the model the schema represents"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": { "tags": [] },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "RandomForestClassifier(ccp_alpha=0.005, criterion='log_loss',\n",
      " max_features='log2', n_estimators=24, oob_score=True,\n",
      " random_state=121)\n"
     ]
    }
   ],
   "source": [
    "from sklearn.ensemble import RandomForestClassifier\n",
    "\n",
    "rfc = RandomForestClassifier(\n",
    "    ccp_alpha=5e-3,\n",
    "    criterion=\"log_loss\",\n",
    "    max_features=\"log2\",\n",
    "    n_estimators=24,\n",
    "    oob_score=True,\n",
    "    random_state=121,\n",
    ")\n",
    "rfc.fit(X, y)\n",
    "\n",
    "print(rfc)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Infer schema and log model metadata\n",
    "\n",
    "Log the model metadata defined in the applied schema to a new experiment in ``project`` with ``project.log_with_schema``\n",
    "\n",
    "**Note:** ``project.log_with_schema`` will infer the correct schema based on the given object to log"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": { "tags": [] },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "inferred schema name: sklearn__RandomForestClassifier\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       ""
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "experiment = project.log_with_schema(\n",
    "    rfc,\n",
    "    experiment_kwargs={  # additional kwargs to be passed to `project.log_experiment`\n",
    "        \"name\": \"log with schema\",\n",
    "        \"model_name\": \"RandomForestClassifier\",\n",
    "        \"description\": \"logged with the `RandomForestClassifier` `rubicon_schema`\",\n",
    "    },\n",
    ")\n",
    "\n",
    "print(f\"inferred schema name: {project.schema_['name']}\")\n",
    "experiment"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## View the experiment's logged metadata\n",
    "\n",
    "Each experiment contains all the data represented in the schema - more information on the data captured by\n",
    "a ``rubicon_schema`` can be found in the \"Representing model metadata with a ``rubicon_schema``\" section"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": { "tags": [] },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'project_name': 'apply schema',\n",
       " 'id': 'ec4c3ead-3337-4623-9a97-c61f48e8de3d',\n",
       " 'name': 'log with schema',\n",
       " 'description': 'logged with the `RandomForestClassifier` `rubicon_schema`',\n",
       " 'model_name': 'RandomForestClassifier',\n",
       " 'branch_name': 'schema',\n",
       " 'commit_hash': 'c9f696408a03c6a6fbf2fbff39fa48bbf722bae1',\n",
       " 'training_metadata': None,\n",
       " 'tags': [],\n",
       " 'created_at': datetime.datetime(2023, 9, 25, 15, 47, 37, 552091)}"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# `_domain` is private by naming convention; `vars` is used here only to display the logged fields\n",
    "vars(experiment._domain)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The features and their importances are logged as defined in the schema's \"features\" section"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": { "tags": [] },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'names_attr': 'feature_names_in_',\n",
       " 'importances_attr': 'feature_importances_',\n",
       " 'optional': True}]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "project.schema_[\"features\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": { "tags": [] },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "alcohol (0.1276831830349219)\n",
      "malic_acid (0.03863837532736449)\n",
      "ash (0.006168227239831861)\n",
      "alcalinity_of_ash (0.025490751927615605)\n",
      "magnesium (0.02935763050777937)\n",
      "total_phenols (0.058427899304369986)\n",
      "flavanoids (0.15309812550131274)\n",
      "nonflavanoid_phenols (0.007414542189797497)\n",
      "proanthocyanins (0.012615187741781065)\n",
      "color_intensity (0.13608806341133572)\n",
      "hue (0.0892558912217226)\n",
      "od280/od315_of_diluted_wines (0.15604181694153108)\n",
      "proline (0.15972030565063608)\n"
     ]
    }
   ],
   "source": [
    "for feature in experiment.features():\n",
    "    print(f\"{feature.name} ({feature.importance})\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Each parameter and its value are logged as defined in the schema's \"parameters\" section"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": { "tags": [] },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'name': 'bootstrap', 'value_attr': 'bootstrap'},\n",
       " {'name': 'ccp_alpha', 'value_attr': 'ccp_alpha'},\n",
       " {'name': 'class_weight', 'value_attr': 'class_weight'},\n",
       " {'name': 'criterion', 'value_attr': 'criterion'},\n",
       " {'name': 'max_depth', 'value_attr': 'max_depth'},\n",
       " {'name': 'max_features', 'value_attr': 'max_features'},\n",
       " {'name': 'min_impurity_decrease', 'value_attr': 'min_impurity_decrease'},\n",
       " {'name': 'max_leaf_nodes', 'value_attr': 'max_leaf_nodes'},\n",
       " {'name': 'max_samples', 'value_attr': 'max_samples'},\n",
       " {'name': 'min_samples_split', 'value_attr': 'min_samples_split'},\n",
       " {'name': 'min_samples_leaf', 'value_attr': 'min_samples_leaf'},\n",
       " {'name': 'min_weight_fraction_leaf',\n",
       " 'value_attr': 'min_weight_fraction_leaf'},\n",
       " {'name': 'n_estimators', 'value_attr': 'n_estimators'},\n",
       " {'name': 'oob_score', 'value_attr': 'oob_score'},\n",
       " {'name': 'random_state', 'value_attr': 'random_state'}]"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "project.schema_[\"parameters\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": { "tags": [] },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "bootstrap: True\n",
      "ccp_alpha: 0.005\n",
      "class_weight: None\n",
      "criterion: log_loss\n",
      "max_depth: None\n",
      "max_features: log2\n",
      "min_impurity_decrease: 0.0\n",
      "max_leaf_nodes: None\n",
      "max_samples: None\n",
      "min_samples_split: 2\n",
      "min_samples_leaf: 1\n",
      "min_weight_fraction_leaf: 0.0\n",
      "n_estimators: 24\n",
      "oob_score: True\n",
      "random_state: 121\n"
     ]
    }
   ],
   "source": [
    "for parameter in experiment.parameters():\n",
    "    print(f\"{parameter.name}: {parameter.value}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Each metric and its value are logged as defined in the schema's \"metrics\" section"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": { "tags": [] },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'name': 'classes', 'value_attr': 'classes_'},\n",
       " {'name': 'n_classes', 'value_attr': 'n_classes_'},\n",
       " {'name': 'n_features_in', 'value_attr': 'n_features_in_'},\n",
       " {'name': 'n_outputs', 'value_attr': 'n_outputs_'},\n",
       " {'name': 'oob_decision_function',\n",
       " 'value_attr': 'oob_decision_function_',\n",
       " 'optional': True},\n",
       " {'name': 'oob_score', 'value_attr': 'oob_score_', 'optional': True}]"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "project.schema_[\"metrics\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": { "tags": [] },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "classes: ...\n",
      "n_classes: 3\n",
      "n_features_in: 13\n",
      "n_outputs: 1\n",
      "oob_decision_function: ...\n",
      "oob_score: 0.9775280898876404\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "\n",
    "for metric in experiment.metrics():\n",
    "    if np.isscalar(metric.value):\n",
    "        print(f\"{metric.name}: {metric.value}\")\n",
    "    else:  # don't print long metrics\n",
    "        print(f\"{metric.name}: ...\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A copy of the trained model is logged as defined in the schema's \"artifacts\" section"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": { "tags": [] },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['self']"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "project.schema_[\"artifacts\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": { "tags": [] },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "RandomForestClassifier:\n",
      "RandomForestClassifier(ccp_alpha=0.005, criterion='log_loss',\n",
      " max_features='log2', n_estimators=24, oob_score=True,\n",
      " random_state=121)\n"
     ]
    }
   ],
   "source": [
    "for artifact in experiment.artifacts():\n",
    "    print(f\"{artifact.name}:\\n{artifact.get_data(unpickle=True)}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}