anamabo
diff --git a/‎Capstone_milestone_report.pdf
549 KB b/‎Capstone_milestone_report.pdf
549 KB
diff --git a/‎Capstone_project_proposal_CAMartinez.pdf
63.5 KB b/‎Capstone_project_proposal_CAMartinez.pdf
63.5 KB
diff --git a/‎analysis_data.pdf
29.5 KB b/‎analysis_data.pdf
29.5 KB
diff --git a/‎data_reading_and_wrangling.ipynb
Lines changed: 100 additions & 0 deletions b/‎data_reading_and_wrangling.ipynb
Lines changed: 100 additions & 0 deletions
@@ -0,0 +1,100 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<b style=\"font-size:150%;\"> Data wrangling  </b>  \n",
+    "\n",
+    "<p style=\"font-size:120%;\"> \n",
+    "I have one year of measurements of the hard disks, where each snapshot \n",
+    "corresponds to one day of data.<br>\n",
+    "This script joins all the data in a single file and it filters the columns that will be used in the analysis. This script also removes unphysical data and converts the time into date format. </p>  "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import glob\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "column_names=['date', 'serial_number', 'model', 'capacity_bytes', 'failure', \n",
+    "         'smart_1_normalized', 'smart_1_raw','smart_3_normalized', 'smart_3_raw',\n",
+    "         'smart_5_normalized', 'smart_5_raw','smart_9_normalized', 'smart_9_raw', \n",
+    "         'smart_12_normalized', 'smart_12_raw','smart_194_normalized', 'smart_194_raw', ]\n",
+    "files= glob.glob('2015*.csv')\n",
+    "data= pd.concat([pd.read_csv(i, usecols= column_names) for i in files], \n",
+    "                ignore_index=True)\n",
+    "# data cleaning\n",
+    "data= data[data.capacity_bytes>0]\n",
+    "data['date']= pd.to_datetime(data['date'], errors= 'coerce')\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "data.to_csv('hard_drive_data_2015.csv')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "anaconda-cloud": {},
+  "kernelspec": {
+   "display_name": "Python [Root]",
+   "language": "python",
+   "name": "Python [Root]"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.5.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}