wiki-llm/LLM-gpt.ipynb

{
 "metadata": {
  "kernelspec": {
   "name": "python3",
   "display_name": "Python 3",
   "language": "python"
  },
  "language_info": {
   "name": "python",
   "version": "3.12.12",
   "mimetype": "text/x-python",
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "pygments_lexer": "ipython3",
   "nbconvert_exporter": "python",
   "file_extension": ".py"
  },
  "colab": {
   "provenance": [],
   "gpuType": "T4"
  },
  "accelerator": "GPU",
  "widgets": {
   "application/vnd.jupyter.widget-state+json": {
    "8e25812455fd4680b9d664aaecfe47f2": {
     "model_module": "@jupyter-widgets/controls",
     "model_name": "HBoxModel",
     "model_module_version": "1.5.0",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HBoxModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HBoxView",
      "box_style": "",
      "children": [
       "IPY_MODEL_54c2e9c7137049ff8beb05bb8c972658",
       "IPY_MODEL_7b1dc4f98c654745a1a6ce607a4a9be0",
       "IPY_MODEL_f2157862dc9d4ef294741a2e8a36d97d"
      ],
      "layout": "IPY_MODEL_ce23d7097c2a4d189f0e7d9a2c22c9be"
     }
    },
    "54c2e9c7137049ff8beb05bb8c972658": {
     "model_module": "@jupyter-widgets/controls",
     "model_name": "HTMLModel",
     "model_module_version": "1.5.0",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HTMLModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HTMLView",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_0741c9735ba24cd8b2e574780ed9bb2c",
      "placeholder": "",
      "style": "IPY_MODEL_5bdc83d116af40c88bbb879e5f4df01f",
      "value": "Map: 100%"
     }
    },
    "7b1dc4f98c654745a1a6ce607a4a9be0": {
     "model_module": "@jupyter-widgets/controls",
     "model_name": "FloatProgressModel",
     "model_module_version": "1.5.0",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "FloatProgressModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "ProgressView",
      "bar_style": "success",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_b26f87cd94374c4985e9d8ea029fcb19",
      "max": 23767,
      "min": 0,
      "orientation": "horizontal",
      "style": "IPY_MODEL_4b60e0df52a7481d8e4b9df0f3171a2d",
      "value": 23767
     }
    },
    "f2157862dc9d4ef294741a2e8a36d97d": {
     "model_module": "@jupyter-widgets/controls",
     "model_name": "HTMLModel",
     "model_module_version": "1.5.0",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HTMLModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HTMLView",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_d5c1213d8f1d420c8835d9d8740aad4d",
      "placeholder": "",
      "style": "IPY_MODEL_0e45bcff086342129fdb48114477f777",
      "value": " 23767/23767 [00:08&lt;00:00, 3680.56 examples/s]"
     }
    },
    "ce23d7097c2a4d189f0e7d9a2c22c9be": {
     "model_module": "@jupyter-widgets/base",
     "model_name": "LayoutModel",
     "model_module_version": "1.2.0",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "0741c9735ba24cd8b2e574780ed9bb2c": {
     "model_module": "@jupyter-widgets/base",
     "model_name": "LayoutModel",
     "model_module_version": "1.2.0",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "5bdc83d116af40c88bbb879e5f4df01f": {
     "model_module": "@jupyter-widgets/controls",
     "model_name": "DescriptionStyleModel",
     "model_module_version": "1.5.0",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "DescriptionStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "description_width": ""
     }
    },
    "b26f87cd94374c4985e9d8ea029fcb19": {
     "model_module": "@jupyter-widgets/base",
     "model_name": "LayoutModel",
     "model_module_version": "1.2.0",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "4b60e0df52a7481d8e4b9df0f3171a2d": {
     "model_module": "@jupyter-widgets/controls",
     "model_name": "ProgressStyleModel",
     "model_module_version": "1.5.0",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "ProgressStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "bar_color": null,
      "description_width": ""
     }
    },
    "d5c1213d8f1d420c8835d9d8740aad4d": {
     "model_module": "@jupyter-widgets/base",
     "model_name": "LayoutModel",
     "model_module_version": "1.2.0",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "0e45bcff086342129fdb48114477f777": {
     "model_module": "@jupyter-widgets/controls",
     "model_name": "DescriptionStyleModel",
     "model_module_version": "1.5.0",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "DescriptionStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "description_width": ""
     }
    },
    "09c6543498e34a368543a953aa4a83ba": {
     "model_module": "@jupyter-widgets/controls",
     "model_name": "HBoxModel",
     "model_module_version": "1.5.0",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HBoxModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HBoxView",
      "box_style": "",
      "children": [
       "IPY_MODEL_1c0993d123a74bb0852197162b2e2d3e",
       "IPY_MODEL_6cd5ddd66dbb4bf1ac3066f268b1b363",
       "IPY_MODEL_6f265574427b4a73b774de42d12659c2"
      ],
      "layout": "IPY_MODEL_5680619f39e94c419be50e40bd1a775a"
     }
    },
    "1c0993d123a74bb0852197162b2e2d3e": {
     "model_module": "@jupyter-widgets/controls",
     "model_name": "HTMLModel",
     "model_module_version": "1.5.0",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HTMLModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HTMLView",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_f5b01ae92a974970a4f7b191951840bf",
      "placeholder": "",
      "style": "IPY_MODEL_3fd65bd7c9c64480a99356bc7836640e",
      "value": "Map: 100%"
     }
    },
    "6cd5ddd66dbb4bf1ac3066f268b1b363": {
     "model_module": "@jupyter-widgets/controls",
     "model_name": "FloatProgressModel",
     "model_module_version": "1.5.0",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "FloatProgressModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "ProgressView",
      "bar_style": "success",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_8215a42be5c64fe9a25d92fc8008fa0b",
      "max": 23767,
      "min": 0,
      "orientation": "horizontal",
      "style": "IPY_MODEL_2349ef76bed847b79b5e7da6540aedef",
      "value": 23767
     }
    },
    "6f265574427b4a73b774de42d12659c2": {
     "model_module": "@jupyter-widgets/controls",
     "model_name": "HTMLModel",
     "model_module_version": "1.5.0",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HTMLModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HTMLView",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_aec637e7513b442c9326632354961ada",
      "placeholder": "",
      "style": "IPY_MODEL_03e0356919f14fd685bcc1612c9050d6",
      "value": " 23767/23767 [00:02&lt;00:00, 9996.68 examples/s]"
     }
    },
    "5680619f39e94c419be50e40bd1a775a": {
     "model_module": "@jupyter-widgets/base",
     "model_name": "LayoutModel",
     "model_module_version": "1.2.0",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "f5b01ae92a974970a4f7b191951840bf": {
     "model_module": "@jupyter-widgets/base",
     "model_name": "LayoutModel",
     "model_module_version": "1.2.0",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "3fd65bd7c9c64480a99356bc7836640e": {
     "model_module": "@jupyter-widgets/controls",
     "model_name": "DescriptionStyleModel",
     "model_module_version": "1.5.0",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "DescriptionStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "description_width": ""
     }
    },
    "8215a42be5c64fe9a25d92fc8008fa0b": {
     "model_module": "@jupyter-widgets/base",
     "model_name": "LayoutModel",
     "model_module_version": "1.2.0",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "2349ef76bed847b79b5e7da6540aedef": {
     "model_module": "@jupyter-widgets/controls",
     "model_name": "ProgressStyleModel",
     "model_module_version": "1.5.0",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "ProgressStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "bar_color": null,
      "description_width": ""
     }
    },
    "aec637e7513b442c9326632354961ada": {
     "model_module": "@jupyter-widgets/base",
     "model_name": "LayoutModel",
     "model_module_version": "1.2.0",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "03e0356919f14fd685bcc1612c9050d6": {
     "model_module": "@jupyter-widgets/controls",
     "model_name": "DescriptionStyleModel",
     "model_module_version": "1.5.0",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "DescriptionStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "description_width": ""
     }
    },
    "8f05f89d52b047368c4405847015de33": {
     "model_module": "@jupyter-widgets/controls",
     "model_name": "HBoxModel",
     "model_module_version": "1.5.0",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HBoxModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HBoxView",
      "box_style": "",
      "children": [
       "IPY_MODEL_c9173aa946df480d81648bf01808a153",
       "IPY_MODEL_4b279cc9d382415fa1d2cf21a94415f2",
       "IPY_MODEL_0be49291f31545f789176d9bc9e642c4"
      ],
      "layout": "IPY_MODEL_f12fe68a33ff4a699d24ac0f6b3ba320"
     }
    },
    "c9173aa946df480d81648bf01808a153": {
     "model_module": "@jupyter-widgets/controls",
     "model_name": "HTMLModel",
     "model_module_version": "1.5.0",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HTMLModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HTMLView",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_36433e0953664a70981a91977217cf3f",
      "placeholder": "",
      "style": "IPY_MODEL_caaaadd11e8d49559e8278629f6f0b45",
      "value": "Loading weights: 100%"
     }
    },
    "4b279cc9d382415fa1d2cf21a94415f2": {
     "model_module": "@jupyter-widgets/controls",
     "model_name": "FloatProgressModel",
     "model_module_version": "1.5.0",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "FloatProgressModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "ProgressView",
      "bar_style": "success",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_c70ad2b68e9e49649e2bfc71764cb5a0",
      "max": 148,
      "min": 0,
      "orientation": "horizontal",
      "style": "IPY_MODEL_7b6f9b2d74c643ab8fd016d7e8117a53",
      "value": 148
     }
    },
    "0be49291f31545f789176d9bc9e642c4": {
     "model_module": "@jupyter-widgets/controls",
     "model_name": "HTMLModel",
     "model_module_version": "1.5.0",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HTMLModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HTMLView",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_434d9c38ce03441f9b69ca86ddfb3c0b",
      "placeholder": "",
      "style": "IPY_MODEL_b54f1d1380b94261b13997f9eceb423e",
      "value": " 148/148 [00:00&lt;00:00, 614.69it/s, Materializing param=transformer.wte.weight]"
     }
    },
    "f12fe68a33ff4a699d24ac0f6b3ba320": {
     "model_module": "@jupyter-widgets/base",
     "model_name": "LayoutModel",
     "model_module_version": "1.2.0",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "36433e0953664a70981a91977217cf3f": {
     "model_module": "@jupyter-widgets/base",
     "model_name": "LayoutModel",
     "model_module_version": "1.2.0",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "caaaadd11e8d49559e8278629f6f0b45": {
     "model_module": "@jupyter-widgets/controls",
     "model_name": "DescriptionStyleModel",
     "model_module_version": "1.5.0",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "DescriptionStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "description_width": ""
     }
    },
    "c70ad2b68e9e49649e2bfc71764cb5a0": {
     "model_module": "@jupyter-widgets/base",
     "model_name": "LayoutModel",
     "model_module_version": "1.2.0",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "7b6f9b2d74c643ab8fd016d7e8117a53": {
     "model_module": "@jupyter-widgets/controls",
     "model_name": "ProgressStyleModel",
     "model_module_version": "1.5.0",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "ProgressStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "bar_color": null,
      "description_width": ""
     }
    },
    "434d9c38ce03441f9b69ca86ddfb3c0b": {
     "model_module": "@jupyter-widgets/base",
     "model_name": "LayoutModel",
     "model_module_version": "1.2.0",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "b54f1d1380b94261b13997f9eceb423e": {
     "model_module": "@jupyter-widgets/controls",
     "model_name": "DescriptionStyleModel",
     "model_module_version": "1.5.0",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "DescriptionStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "description_width": ""
     }
    }
   }
  },
  "kaggle": {
   "accelerator": "nvidiaTeslaT4",
   "dataSources": [],
   "dockerImageVersionId": 31328,
   "isInternetEnabled": true,
   "language": "python",
   "sourceType": "notebook",
   "isGpuEnabled": true
  }
 },
 "nbformat_minor": 5,
 "nbformat": 4,
 "cells": [
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": [
    "# Training GPT-2 on Wikipedia data to understand how to fine-tune a foundation model\n",
    "\n",
    "## Tokenization of the data\n",
    "\n",
    "We first need to tokenize the data with byte pair encoding (BPE) to get the training data ready. The model cannot consume text directly: GPT-2's tokenizer applies BPE to the raw UTF-8 bytes of the text and produces integer token IDs, and those IDs are what the model takes as input."
   ],
   "id": "b38f8e53b70dd168"
  },
  {
   "id": "50f94f7b-8eaa-4791-8224-fb6df4dffad0",
   "cell_type": "code",
   "source": [
    "import os\n",
    "import shutil\n",
    "import time\n",
    "\n",
    "import torch\n",
    "import torch.optim as optim\n",
    "from datasets import load_dataset\n",
    "from IPython.display import FileLink\n",
    "from peft import LoraConfig, get_peft_model\n",
    "from torch.utils.data import DataLoader\n",
    "from transformers import GPT2Tokenizer, GPT2LMHeadModel"
   ],
   "metadata": {
    "trusted": true,
    "execution": {
     "iopub.status.busy": "2026-04-12T12:53:07.272209Z",
     "iopub.execute_input": "2026-04-12T12:53:07.272563Z",
     "iopub.status.idle": "2026-04-12T12:53:07.277632Z",
     "shell.execute_reply.started": "2026-04-12T12:53:07.272519Z",
     "shell.execute_reply": "2026-04-12T12:53:07.276963Z"
    },
    "ExecuteTime": {
     "end_time": "2026-04-12T15:43:48.697416531Z",
     "start_time": "2026-04-12T15:43:42.476271470Z"
    }
   },
   "outputs": [],
   "execution_count": 1
  },
  {
   "id": "e826153a4c898239",
   "cell_type": "code",
   "source": [
    "# Load every WikiText-2 (raw) split, then keep only the rows that still contain\n",
    "# text once surrounding whitespace is stripped.\n",
    "raw_splits = load_dataset(\"wikitext\", \"wikitext-2-raw-v1\")\n",
    "dataset = raw_splits.filter(lambda row: bool(row[\"text\"].strip()))"
   ],
   "metadata": {
    "ExecuteTime": {
     "end_time": "2026-04-10T18:41:26.391038949Z",
     "start_time": "2026-04-10T18:41:23.168674202Z"
    },
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "e826153a4c898239",
    "outputId": "56e438e4-47ed-49a7-e514-d8a61bd32c94",
    "trusted": true,
    "execution": {
     "iopub.status.busy": "2026-04-12T12:53:07.278856Z",
     "iopub.execute_input": "2026-04-12T12:53:07.279139Z",
     "iopub.status.idle": "2026-04-12T12:53:07.594668Z",
     "shell.execute_reply.started": "2026-04-12T12:53:07.279099Z",
     "shell.execute_reply": "2026-04-12T12:53:07.593931Z"
    }
   },
   "outputs": [],
   "execution_count": 9
  },
  {
   "id": "27c55eaf29e555a2",
   "cell_type": "markdown",
   "source": "Let's see if there's some data for us to use",
   "metadata": {
    "id": "27c55eaf29e555a2"
   }
  },
  {
   "id": "b1efb38c36d2dd42",
   "cell_type": "code",
   "source": [
    "# Peek at the first ten training rows to confirm the text loaded as expected.\n",
    "train_rows = dataset[\"train\"]\n",
    "for row_idx in range(10):\n",
    "    print(train_rows[row_idx])"
   ],
   "metadata": {
    "ExecuteTime": {
     "end_time": "2026-04-10T18:41:26.542937056Z",
     "start_time": "2026-04-10T18:41:26.411012195Z"
    },
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "b1efb38c36d2dd42",
    "outputId": "826d3ede-0e4b-483f-de7d-1fd2f294cb14",
    "trusted": true,
    "execution": {
     "iopub.status.busy": "2026-04-12T12:53:07.595665Z",
     "iopub.execute_input": "2026-04-12T12:53:07.596204Z",
     "iopub.status.idle": "2026-04-12T12:53:07.602292Z",
     "shell.execute_reply.started": "2026-04-12T12:53:07.596177Z",
     "shell.execute_reply": "2026-04-12T12:53:07.601606Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "text": "{'text': ' = Valkyria Chronicles III = \\n'}\n{'text': ' Senjō no Valkyria 3 : Unrecorded Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable . Released in January 2011 in Japan , it is the third game in the Valkyria series . Employing the same fusion of tactical and real @-@ time gameplay as its predecessors , the story runs parallel to the first game and follows the \" Nameless \" , a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit \" Calamaty Raven \" . \\n'}\n{'text': \" The game began development in 2010 , carrying over a large portion of the work done on Valkyria Chronicles II . While it retained the standard features of the series , it also underwent multiple adjustments , such as making the game more forgiving for series newcomers . Character designer Raita Honjou and composer Hitoshi Sakimoto both returned from previous entries , along with Valkyria Chronicles II director Takeshi Ozawa . A large team of writers handled the script . The game 's opening theme was sung by May 'n . \\n\"}\n{'text': \" It met with positive sales in Japan , and was praised by both Japanese and western critics . After release , it received downloadable content , along with an expanded edition in November of that year . It was also adapted into manga and an original video animation series . Due to low sales of Valkyria Chronicles II , Valkyria Chronicles III was not localized , but a fan translation compatible with the game 's expanded edition was released in 2014 . Media.Vision would return to the franchise with the development of Valkyria : Azure Revolution for the PlayStation 4 . 
\\n\"}\n{'text': ' = = Gameplay = = \\n'}\n{'text': \" As with previous Valkyira Chronicles games , Valkyria Chronicles III is a tactical role @-@ playing game where players take control of a military unit and take part in missions against enemy forces . Stories are told through comic book @-@ like panels with animated character portraits , with characters speaking partially through voiced speech bubbles and partially through unvoiced text . The player progresses through a series of linear missions , gradually unlocked as maps that can be freely scanned through and replayed as they are unlocked . The route to each story location on the map varies depending on an individual player 's approach : when one option is selected , the other is sealed off to the player . Outside missions , the player characters rest in a camp , where units can be customized and character growth occurs . Alongside the main story missions are character @-@ specific sub missions relating to different squad members . After the game 's completion , additional episodes are unlocked , some of them having a higher difficulty than those found in the rest of the game . There are also love simulation elements related to the game 's two main heroines , although they take a very minor role . \\n\"}\n{'text': ' The game \\'s battle system , the BliTZ system , is carried over directly from Valkyira Chronicles . During missions , players select each unit using a top @-@ down perspective of the battlefield map : once a character is selected , the player moves the character around the battlefield in third @-@ person . A character can only act once per @-@ turn , but characters can be granted multiple turns at the expense of other characters \\' turns . Each character has a field and distance of movement limited by their Action Gauge . Up to nine characters can be assigned to a single mission . 
During gameplay , characters will call out if something happens to them , such as their health points ( HP ) getting low or being knocked out by enemy attacks . Each character has specific \" Potentials \" , skills unique to each character . They are divided into \" Personal Potential \" , which are innate skills that remain unaltered unless otherwise dictated by the story and can either help or impede a character , and \" Battle Potentials \" , which are grown throughout the game and always grant boons to a character . To learn Battle Potentials , each character has a unique \" Masters Table \" , a grid @-@ based skill table that can be used to acquire and link different skills . Characters also have Special Abilities that grant them temporary boosts on the battlefield : Kurt can activate \" Direct Command \" and move around the battlefield without depleting his Action Point gauge , the character Reila can shift into her \" Valkyria Form \" and become invincible , while Imca can target multiple enemy units with her heavy weapon . \\n'}\n{'text': \" Troops are divided into five classes : Scouts , Shocktroopers , Engineers , Lancers and Armored Soldier . Troopers can switch classes by changing their assigned weapon . Changing class does not greatly affect the stats gained while in a previous class . With victory in battle , experience points are awarded to the squad , which are distributed into five different attributes shared by the entire squad , a feature differing from early games ' method of distributing to different unit types . \\n\"}\n{'text': ' = = Plot = = \\n'}\n{'text': ' The game takes place during the Second Europan War . Gallian Army Squad 422 , also known as \" The Nameless \" , are a penal military unit composed of criminals , foreign deserters , and military offenders whose real names are erased from the records and thereon officially referred to by numbers . 
Ordered by the Gallian military to perform the most dangerous missions that the Regular Army and Militia will not do , they are nevertheless up to the task , exemplified by their motto , Altaha Abilia , meaning \" Always Ready . \" The three main characters are No.7 Kurt Irving , an army officer falsely accused of treason who wishes to redeem himself ; Ace No.1 Imca , a female Darcsen heavy weapons specialist who seeks revenge against the Valkyria who destroyed her home ; and No.13 Riela Marcellis , a seemingly jinxed young woman who is unknowingly a descendant of the Valkyria . Together with their fellow squad members , these three are tasked to fight against a mysterious Imperial unit known as Calamity Raven , consisting of mostly Darcsen soldiers . \\n'}\n",
     "output_type": "stream"
    }
   ],
   "execution_count": 10
  },
  {
   "id": "e8119df124aca910",
   "cell_type": "markdown",
   "source": "## Now it's time to tokenize the data and then chunk it\nWe need to tokenize the data so that it can be understood by the model. This is an important step, as the model doesn't understand plain text.",
   "metadata": {
    "id": "e8119df124aca910"
   }
  },
  {
   "id": "252fb89c3351bf1e",
   "cell_type": "code",
   "source": [
    "BLOCK_SIZE = 128  # number of tokens in every training chunk\n",
    "\n",
    "tokenizer = GPT2Tokenizer.from_pretrained(\"gpt2\")\n",
    "tokenizer.pad_token = tokenizer.eos_token\n",
    "\n",
    "\n",
    "def tokenize(data):\n",
    "    \"\"\"Tokenize a batch of rows, returning the tokenizer output for data[\"text\"].\n",
    "\n",
    "    No max_length is passed: rows are concatenated and re-split into fixed-size\n",
    "    chunks by combine(), so capping each row would only discard text. Long rows may\n",
    "    trigger the tokenizer's sequence-length warning, which is harmless here because\n",
    "    no row reaches the model unchunked.\n",
    "    \"\"\"\n",
    "    return tokenizer(data[\"text\"])\n",
    "\n",
    "\n",
    "def combine(data, block_size=BLOCK_SIZE):\n",
    "    \"\"\"Concatenate each column of a tokenized batch and cut it into equal chunks.\n",
    "\n",
    "    Args:\n",
    "        data: mapping of column name to a list of per-row token lists.\n",
    "        block_size: number of tokens per output chunk.\n",
    "\n",
    "    Returns:\n",
    "        A dict with the same columns, each a list of block_size-long chunks, plus a\n",
    "        \"labels\" column that is a copy of \"input_ids\". Tokens left over at the end of\n",
    "        the batch that do not fill a whole chunk are dropped.\n",
    "    \"\"\"\n",
    "    concatenated = {\n",
    "        column: [token for row in rows for token in row]\n",
    "        for column, rows in data.items()\n",
    "    }\n",
    "\n",
    "    # Round down to a whole number of chunks.\n",
    "    total_length = (len(concatenated[\"input_ids\"]) // block_size) * block_size\n",
    "\n",
    "    result = {\n",
    "        column: [tokens[start:start + block_size] for start in range(0, total_length, block_size)]\n",
    "        for column, tokens in concatenated.items()\n",
    "    }\n",
    "    result[\"labels\"] = result[\"input_ids\"].copy()\n",
    "    return result\n",
    "\n",
    "\n",
    "tokenized_datasets = dataset.map(tokenize, batched=True, remove_columns=[\"text\"])\n",
    "\n",
    "training_dataset = tokenized_datasets.map(combine, batched=True)"
   ],
   "metadata": {
    "ExecuteTime": {
     "end_time": "2026-04-10T18:53:46.503479303Z",
     "start_time": "2026-04-10T18:53:43.641312576Z"
    },
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 81,
     "referenced_widgets": [
      "8e25812455fd4680b9d664aaecfe47f2",
      "54c2e9c7137049ff8beb05bb8c972658",
      "7b1dc4f98c654745a1a6ce607a4a9be0",
      "f2157862dc9d4ef294741a2e8a36d97d",
      "ce23d7097c2a4d189f0e7d9a2c22c9be",
      "0741c9735ba24cd8b2e574780ed9bb2c",
      "5bdc83d116af40c88bbb879e5f4df01f",
      "b26f87cd94374c4985e9d8ea029fcb19",
      "4b60e0df52a7481d8e4b9df0f3171a2d",
      "d5c1213d8f1d420c8835d9d8740aad4d",
      "0e45bcff086342129fdb48114477f777",
      "09c6543498e34a368543a953aa4a83ba",
      "1c0993d123a74bb0852197162b2e2d3e",
      "6cd5ddd66dbb4bf1ac3066f268b1b363",
      "6f265574427b4a73b774de42d12659c2",
      "5680619f39e94c419be50e40bd1a775a",
      "f5b01ae92a974970a4f7b191951840bf",
      "3fd65bd7c9c64480a99356bc7836640e",
      "8215a42be5c64fe9a25d92fc8008fa0b",
      "2349ef76bed847b79b5e7da6540aedef",
      "aec637e7513b442c9326632354961ada",
      "03e0356919f14fd685bcc1612c9050d6"
     ]
    },
    "id": "252fb89c3351bf1e",
    "outputId": "06f9f922-c233-4445-bedd-e616bd871dbb",
    "trusted": true,
    "execution": {
     "iopub.status.busy": "2026-04-12T12:53:07.603151Z",
     "iopub.execute_input": "2026-04-12T12:53:07.603421Z",
     "iopub.status.idle": "2026-04-12T12:53:09.355967Z",
     "shell.execute_reply.started": "2026-04-12T12:53:07.603397Z",
     "shell.execute_reply": "2026-04-12T12:53:09.355402Z"
    }
   },
   "outputs": [],
   "execution_count": 11
  },
  {
   "id": "b47c2da1cef52c2e",
   "cell_type": "code",
   "source": [
    "# Check the token count of the first ten training chunks.\n",
    "train_chunks = training_dataset[\"train\"]\n",
    "for chunk_idx in range(10):\n",
    "    print(len(train_chunks[chunk_idx][\"input_ids\"]))"
   ],
   "metadata": {
    "ExecuteTime": {
     "end_time": "2026-04-10T18:41:33.790641588Z",
     "start_time": "2026-04-10T18:41:33.561714562Z"
    },
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "b47c2da1cef52c2e",
    "outputId": "a5ef3e2d-defa-4a1a-f040-03ccfc7a91df",
    "trusted": true,
    "execution": {
     "iopub.status.busy": "2026-04-12T12:53:09.357577Z",
     "iopub.execute_input": "2026-04-12T12:53:09.357796Z",
     "iopub.status.idle": "2026-04-12T12:53:09.364741Z",
     "shell.execute_reply.started": "2026-04-12T12:53:09.357774Z",
     "shell.execute_reply": "2026-04-12T12:53:09.363964Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "text": "128\n128\n128\n128\n128\n128\n128\n128\n128\n128\n",
     "output_type": "stream"
    }
   ],
   "execution_count": 12
  },
  {
   "id": "2ca6250b-7ad1-4e6a-8f00-3972f5a2a902",
   "cell_type": "markdown",
   "source": "## Adding LoRA to speed up training and use fewer trainable parameters",
   "metadata": {}
  },
  {
   "id": "e2b8683e-37bd-45ba-a4d2-3d2d4a9bc486",
   "cell_type": "code",
   "source": [
    "SEED = 42\n",
    "BATCH_SIZE = 2\n",
    "\n",
    "# Seed torch's RNG so weight init, dropout and shuffling start from the same state on every run.\n",
    "torch.manual_seed(SEED)\n",
    "\n",
    "model = GPT2LMHeadModel.from_pretrained(\"gpt2\")\n",
    "model.resize_token_embeddings(len(tokenizer))\n",
    "\n",
    "lora_config = LoraConfig(r=8, lora_alpha=32, target_modules=[\"c_attn\"], lora_dropout=0.1, bias=\"none\", task_type=\"CAUSAL_LM\")\n",
    "model = get_peft_model(model, lora_config)\n",
    "\n",
    "# Move the model to its device before the optimiser is built, as the torch.optim docs recommend.\n",
    "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
    "model.to(device)\n",
    "\n",
    "training_dataset.set_format(type=\"torch\")\n",
    "training_data = DataLoader(training_dataset[\"train\"], batch_size=BATCH_SIZE, shuffle=True)\n",
    "# Evaluation does not need a random order; a fixed order keeps the reported loss repeatable.\n",
    "testing_data = DataLoader(training_dataset[\"test\"], batch_size=BATCH_SIZE, shuffle=False)\n",
    "\n",
    "# Hand the optimiser only the parameters that require gradients.\n",
    "trainable_params = [param for param in model.parameters() if param.requires_grad]\n",
    "optimiser = optim.AdamW(trainable_params, lr=5e-5)\n",
    "\n",
    "model.print_trainable_parameters()"
   ],
   "metadata": {
    "trusted": true,
    "execution": {
     "iopub.status.busy": "2026-04-12T12:53:09.365690Z",
     "iopub.execute_input": "2026-04-12T12:53:09.365993Z",
     "iopub.status.idle": "2026-04-12T12:53:09.985111Z",
     "shell.execute_reply.started": "2026-04-12T12:53:09.365968Z",
     "shell.execute_reply": "2026-04-12T12:53:09.984441Z"
    }
   },
   "outputs": [
    {
     "output_type": "display_data",
     "data": {
      "text/plain": "Loading weights:   0%|          | 0/148 [00:00<?, ?it/s]",
      "application/vnd.jupyter.widget-view+json": {
       "version_major": 2,
       "version_minor": 0,
       "model_id": "ecefb4da6ff048609304c40f683d1726"
      }
     },
     "metadata": {}
    },
    {
     "name": "stderr",
     "text": "GPT2LMHeadModel LOAD REPORT from: gpt2\nKey                  | Status     |  | \n---------------------+------------+--+-\nh.{0...11}.attn.bias | UNEXPECTED |  | \n\nNotes:\n- UNEXPECTED\t:can be ignored when loading from different task/architecture; not ok if you expect identical arch.\n",
     "output_type": "stream"
    },
    {
     "name": "stdout",
     "text": "trainable params: 294,912 || all params: 124,734,720 || trainable%: 0.2364\n",
     "output_type": "stream"
    }
   ],
   "execution_count": 13
  },
  {
   "id": "bfd3873f9f578429",
   "cell_type": "markdown",
   "source": "## Training loop\nNow we need to create the training loop for GPT-2 using the pretrained model, accounting for backpropagation, loss and the number of epochs.",
   "metadata": {
    "id": "bfd3873f9f578429"
   }
  },
  {
   "id": "8208e77b670a0eb2",
   "cell_type": "code",
   "source": [
    "num_epochs = 15\n",
    "\n",
    "\n",
    "def _batch_loss(batch):\n",
    "    \"\"\"Run one forward pass on `batch` and return (loss tensor, number of sequences).\"\"\"\n",
    "    input_ids = batch[\"input_ids\"].to(device)\n",
    "    outputs = model(\n",
    "        input_ids=input_ids,\n",
    "        attention_mask=batch[\"attention_mask\"].to(device),\n",
    "        labels=batch[\"labels\"].to(device),\n",
    "    )\n",
    "    return outputs.loss, input_ids.size(0)\n",
    "\n",
    "\n",
    "total_start = time.time()\n",
    "\n",
    "for epoch in range(num_epochs):\n",
    "    start = time.time()\n",
    "\n",
    "    # ---- training pass ----\n",
    "    model.train()\n",
    "    train_loss, train_seen = 0.0, 0\n",
    "\n",
    "    for batch in training_data:\n",
    "        optimiser.zero_grad()\n",
    "        loss, batch_len = _batch_loss(batch)\n",
    "        loss.backward()\n",
    "        optimiser.step()\n",
    "\n",
    "        # Weight each batch by its size so a smaller final batch does not skew the epoch mean.\n",
    "        train_loss += loss.item() * batch_len\n",
    "        train_seen += batch_len\n",
    "\n",
    "    # ---- evaluation pass (no gradients) ----\n",
    "    model.eval()\n",
    "    validation_loss, val_seen = 0.0, 0\n",
    "\n",
    "    with torch.no_grad():\n",
    "        for batch in testing_data:\n",
    "            loss, batch_len = _batch_loss(batch)\n",
    "            validation_loss += loss.item() * batch_len\n",
    "            val_seen += batch_len\n",
    "\n",
    "    avg_train_loss = train_loss / train_seen\n",
    "    avg_val_loss = validation_loss / val_seen\n",
    "    perplexity = torch.exp(torch.tensor(avg_val_loss))\n",
    "\n",
    "    # Wait for queued GPU work to finish so the timings below cover the whole epoch.\n",
    "    if device.type == \"cuda\":\n",
    "        torch.cuda.synchronize()\n",
    "\n",
    "    epoch_time = time.time() - start\n",
    "    avg_time_per_epoch = (time.time() - total_start) / (epoch + 1)\n",
    "    eta = avg_time_per_epoch * (num_epochs - epoch - 1)\n",
    "\n",
    "    print(f\"Epoch {epoch+1}/{num_epochs}\")\n",
    "    print(f\"Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f} | PPL: {perplexity:.2f}\")\n",
    "    print(f\"Time: {epoch_time:.2f}s | ETA: {eta/60:.2f}m\")\n",
    "    print(\"-\" * 30)"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 340,
     "referenced_widgets": [
      "8f05f89d52b047368c4405847015de33",
      "c9173aa946df480d81648bf01808a153",
      "4b279cc9d382415fa1d2cf21a94415f2",
      "0be49291f31545f789176d9bc9e642c4",
      "f12fe68a33ff4a699d24ac0f6b3ba320",
      "36433e0953664a70981a91977217cf3f",
      "caaaadd11e8d49559e8278629f6f0b45",
      "c70ad2b68e9e49649e2bfc71764cb5a0",
      "7b6f9b2d74c643ab8fd016d7e8117a53",
      "434d9c38ce03441f9b69ca86ddfb3c0b",
      "b54f1d1380b94261b13997f9eceb423e"
     ]
    },
    "id": "8208e77b670a0eb2",
    "outputId": "005113d2-3641-47a9-9c34-fbff7d36ac05",
    "trusted": true,
    "execution": {
     "iopub.status.busy": "2026-04-12T12:53:09.985973Z",
     "iopub.execute_input": "2026-04-12T12:53:09.986247Z",
     "iopub.status.idle": "2026-04-12T14:38:16.816340Z",
     "shell.execute_reply.started": "2026-04-12T12:53:09.986216Z",
     "shell.execute_reply": "2026-04-12T14:38:16.815610Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "text": "Epoch 1/15\nTrain Loss: 3.8074 | Val Loss: 3.5396 | PPL: 34.45\nTime: 420.31s | ETA: 98.07m\n------------------------------\nEpoch 2/15\nTrain Loss: 3.6884 | Val Loss: 3.5101 | PPL: 33.45\nTime: 418.71s | ETA: 90.89m\n------------------------------\nEpoch 3/15\nTrain Loss: 3.6575 | Val Loss: 3.4952 | PPL: 32.96\nTime: 421.96s | ETA: 84.07m\n------------------------------\nEpoch 4/15\nTrain Loss: 3.6399 | Val Loss: 3.4864 | PPL: 32.67\nTime: 418.49s | ETA: 76.98m\n------------------------------\nEpoch 5/15\nTrain Loss: 3.6280 | Val Loss: 3.4791 | PPL: 32.43\nTime: 417.68s | ETA: 69.91m\n------------------------------\nEpoch 6/15\nTrain Loss: 3.6169 | Val Loss: 3.4754 | PPL: 32.31\nTime: 421.43s | ETA: 62.96m\n------------------------------\nEpoch 7/15\nTrain Loss: 3.6086 | Val Loss: 3.4714 | PPL: 32.18\nTime: 421.52s | ETA: 56.00m\n------------------------------\nEpoch 8/15\nTrain Loss: 3.6015 | Val Loss: 3.4691 | PPL: 32.11\nTime: 421.63s | ETA: 49.03m\n------------------------------\nEpoch 9/15\nTrain Loss: 3.5954 | Val Loss: 3.4668 | PPL: 32.04\nTime: 420.18s | ETA: 42.02m\n------------------------------\nEpoch 10/15\nTrain Loss: 3.5905 | Val Loss: 3.4636 | PPL: 31.93\nTime: 422.08s | ETA: 35.03m\n------------------------------\nEpoch 11/15\nTrain Loss: 3.5847 | Val Loss: 3.4615 | PPL: 31.87\nTime: 421.49s | ETA: 28.03m\n------------------------------\nEpoch 12/15\nTrain Loss: 3.5798 | Val Loss: 3.4616 | PPL: 31.87\nTime: 421.00s | ETA: 21.03m\n------------------------------\nEpoch 13/15\nTrain Loss: 3.5764 | Val Loss: 3.4579 | PPL: 31.75\nTime: 419.68s | ETA: 14.02m\n------------------------------\nEpoch 14/15\nTrain Loss: 3.5721 | Val Loss: 3.4595 | PPL: 31.80\nTime: 418.79s | ETA: 7.01m\n------------------------------\nEpoch 15/15\nTrain Loss: 3.5687 | Val Loss: 3.4555 | PPL: 31.67\nTime: 421.88s | ETA: 0.00m\n------------------------------\n",
     "output_type": "stream"
    }
   ],
   "execution_count": 14
  },
  {
   "id": "9a4b279a-3502-47f0-aff5-b25e7d008c90",
   "cell_type": "code",
   "source": [
    "# Write the model and the tokenizer files side by side in one directory.\n",
    "output_dir = \"./gpt2_finetuned\"\n",
    "for artefact in (model, tokenizer):\n",
    "    artefact.save_pretrained(output_dir)\n",
    "\n",
    "print(f\"Model saved to {output_dir}\")"
   ],
   "metadata": {
    "trusted": true,
    "execution": {
     "iopub.status.busy": "2026-04-12T14:47:06.907066Z",
     "iopub.execute_input": "2026-04-12T14:47:06.907818Z",
     "iopub.status.idle": "2026-04-12T14:47:06.991596Z",
     "shell.execute_reply.started": "2026-04-12T14:47:06.907779Z",
     "shell.execute_reply": "2026-04-12T14:47:06.990817Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "text": "Model saved to ./gpt2_finetuned\n",
     "output_type": "stream"
    }
   ],
   "execution_count": 21
  },
  {
   "id": "00f683d7-c8fd-47ac-804c-4fccd4c51bb7",
   "cell_type": "code",
   "source": [
    "# Pack the contents of ./gpt2_finetuned into model_output.zip in the working directory.\n",
    "shutil.make_archive(base_name='model_output', format='zip', root_dir='./gpt2_finetuned')\n",
    "\n",
    "print(\"Model zipped and ready for download!\")"
   ],
   "metadata": {
    "trusted": true,
    "execution": {
     "iopub.status.busy": "2026-04-12T14:47:10.189729Z",
     "iopub.execute_input": "2026-04-12T14:47:10.190141Z",
     "iopub.status.idle": "2026-04-12T14:47:35.447559Z",
     "shell.execute_reply.started": "2026-04-12T14:47:10.190111Z",
     "shell.execute_reply": "2026-04-12T14:47:35.446593Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "text": "Model zipped and ready for download!\n",
     "output_type": "stream"
    }
   ],
   "execution_count": 22
  },
  {
   "id": "75e43852-b6ce-4d7a-a266-dd96e7077e73",
   "cell_type": "code",
   "source": [
    "output_dir = \"./gpt2_merged\"\n",
    "print(f\"Merging weights and saving to {output_dir}...\")\n",
    "\n",
    "# Merge the adapter into the model, then save the result together with the tokenizer.\n",
    "merged_model = model.merge_and_unload()\n",
    "for artefact in (merged_model, tokenizer):\n",
    "    artefact.save_pretrained(output_dir)\n",
    "\n",
    "# Pack the merged-model directory into a zip archive.\n",
    "shutil.make_archive(base_name='model_output_15_epochs_w_lora', format='zip', root_dir=output_dir)\n",
    "print(\"Done! You now have a standalone GPT-2 model.\")"
   ],
   "metadata": {
    "trusted": true,
    "execution": {
     "iopub.status.busy": "2026-04-12T14:51:48.132657Z",
     "iopub.execute_input": "2026-04-12T14:51:48.133012Z",
     "iopub.status.idle": "2026-04-12T14:52:13.830366Z",
     "shell.execute_reply.started": "2026-04-12T14:51:48.132984Z",
     "shell.execute_reply": "2026-04-12T14:52:13.829507Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "text": "Merging weights and saving to ./gpt2_merged...\n",
     "output_type": "stream"
    },
    {
     "output_type": "display_data",
     "data": {
      "text/plain": "Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]",
      "application/vnd.jupyter.widget-view+json": {
       "version_major": 2,
       "version_minor": 0,
       "model_id": "bf031dd298e4401a847d92d9c0a2fa84"
      }
     },
     "metadata": {}
    },
    {
     "name": "stdout",
     "text": "Done! You now have a standalone GPT-2 model.\n",
     "output_type": "stream"
    }
   ],
   "execution_count": 24
  },
  {
   "id": "d08a9fb5-0137-42d5-921e-d59a03715c81",
   "cell_type": "code",
   "source": [
    "# List the current working directory, which is where the archives above were written\n",
    "# (on Kaggle this is /kaggle/working), so the cell also runs outside Kaggle.\n",
    "print(\"Files in working directory:\", os.listdir('.'))\n",
    "FileLink(r'model_output.zip')"
   ],
   "metadata": {
    "trusted": true,
    "execution": {
     "iopub.status.busy": "2026-04-12T14:55:44.272724Z",
     "iopub.execute_input": "2026-04-12T14:55:44.273432Z",
     "iopub.status.idle": "2026-04-12T14:55:44.279586Z",
     "shell.execute_reply.started": "2026-04-12T14:55:44.273399Z",
     "shell.execute_reply": "2026-04-12T14:55:44.278849Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "text": "Files in working directory: ['gpt2_merged', 'gpt2_finetuned', '.virtual_documents', 'model_output.zip', 'state.db', 'model_output_15_epochs_w_lora.zip']\n",
     "output_type": "stream"
    },
    {
     "execution_count": 25,
     "output_type": "execute_result",
     "data": {
      "text/plain": "/kaggle/working/model_output.zip",
      "text/html": "<a href='model_output.zip' target='_blank'>model_output.zip</a><br>"
     },
     "metadata": {}
    }
   ],
   "execution_count": 25
  }
 ]
}