Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cell_volume(Master).ipynb

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions Centroid_Distance(Master).ipynb

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions Contacts(Master).ipynb

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions Density(Master).ipynb

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions Is_Centrosymmetric(Master).ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"name":"Is_Centrosymmetric(Master).ipynb","provenance":[],"collapsed_sections":[]},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"}},"cells":[{"cell_type":"markdown","metadata":{"id":"RV1EQWC6zz0N"},"source":["# DigiFab Datathon\n","\n","Aim: Predict `is_centrosymmetric` (binary classification)"]},{"cell_type":"code","metadata":{"id":"Ls0c9tqtwqKb","executionInfo":{"status":"ok","timestamp":1616598250301,"user_tz":0,"elapsed":678,"user":{"displayName":"Charlotte Breakwell","photoUrl":"","userId":"15588921800197350603"}}},"source":["import pandas as pd\n","import numpy as np\n","import sklearn\n","import matplotlib.pyplot as plt\n","\n","from sklearn.model_selection import cross_val_score, GridSearchCV\n","from sklearn.impute import SimpleImputer\n","from sklearn.decomposition import PCA\n","from sklearn.preprocessing import OneHotEncoder, StandardScaler\n","from sklearn.metrics import mean_squared_error \n","from sklearn import *\n","from sklearn.linear_model import LogisticRegression\n","from sklearn.metrics import classification_report, confusion_matrix"],"execution_count":11,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"a9DO3z7I3FHX"},"source":["## Imports"]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"7JujBtKv5w42","executionInfo":{"status":"ok","timestamp":1616598126932,"user_tz":0,"elapsed":28099,"user":{"displayName":"Charlotte Breakwell","photoUrl":"","userId":"15588921800197350603"}},"outputId":"83b67840-83a1-40ba-b868-ca183307371a"},"source":["from google.colab import drive\n","drive.mount('/content/drive')"],"execution_count":5,"outputs":[{"output_type":"stream","text":["Mounted at /content/drive\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"vvipWvQ4vkCJ","executionInfo":{"status":"ok","timestamp":1616598148719,"user_tz":0,"elapsed":11786,"user":{"displayName":"Charlotte 
Breakwell","photoUrl":"","userId":"15588921800197350603"}}},"source":["file_path = '/content/drive/MyDrive/DigiFab_Datathon/train_and_test_sets/'\n","save_path = '/content/drive/MyDrive/DigiFab_Datathon/'\n","\n","train_descriptors = pd.read_csv(file_path+\"train_descriptors.csv\")\n","train_mord3d = pd.read_csv(file_path+\"train_mord3d.csv\")\n","train_morgan = pd.read_csv(file_path+\"train_morgan.csv\")\n","train_rdk = pd.read_csv(file_path+\"train_rdk.csv\")\n","\n","train_crystals = pd.read_csv(file_path+\"train_crystals.csv\")\n","train_distances = pd.read_csv(file_path+\"train_distances.csv\")\n","train_centroid_distances = pd.read_csv(file_path+\"train_centroid_distances.csv\")"],"execution_count":6,"outputs":[]},{"cell_type":"code","metadata":{"id":"dyKbRmlv7JlP","executionInfo":{"status":"ok","timestamp":1616598152576,"user_tz":0,"elapsed":15029,"user":{"displayName":"Charlotte Breakwell","photoUrl":"","userId":"15588921800197350603"}}},"source":["test_descriptors = pd.read_csv(file_path+\"test_descriptors.csv\")\n","test_mord3d = pd.read_csv(file_path+\"test_mord3d.csv\")\n","test_morgan = pd.read_csv(file_path+\"test_morgan.csv\")\n","test_rdk = pd.read_csv(file_path+\"test_rdk.csv\")"],"execution_count":7,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"eIfsdghU2l5K"},"source":["## Data Engineering Step\n","\n","* Look for NaN values.\n","* Look at magnitude of values - do they need scaling?"]},{"cell_type":"code","metadata":{"id":"alrEPOXwwo9u","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1616588310911,"user_tz":0,"elapsed":16468,"user":{"displayName":"Sophie Finnigan","photoUrl":"","userId":"10230321660225593837"}},"outputId":"67c2161b-37e0-476c-ab92-256c5975ac36"},"source":["print(train_descriptors.describe())\n","print(train_mord3d.describe())"],"execution_count":null,"outputs":[{"output_type":"stream","text":[" Unnamed: 0 ABC ... mZagreb1 mZagreb2\n","count 13449.000000 13449.000000 ... 
13448.000000 13449.000000\n","mean 40464.558183 8.791685 ... 4.820664 2.709742\n","std 19446.921704 2.294386 ... 1.421185 0.606246\n","min 2.000000 0.000000 ... 0.750000 0.000000\n","25% 33281.000000 7.387307 ... 3.833333 2.333333\n","50% 43815.000000 9.289847 ... 4.722222 2.833333\n","75% 55711.000000 10.663621 ... 5.694444 3.194444\n","max 66123.000000 13.654635 ... 10.652778 4.000000\n","\n","[8 rows x 1614 columns]\n"," Unnamed: 0 PNSA1 ... MOMI-Z PBF\n","count 13449.000000 13432.000000 ... 13449.000000 13449.000000\n","mean 40464.558183 164.398549 ... 356.137203 0.383000\n","std 19446.921704 55.235914 ... 284.378542 0.304775\n","min 2.000000 0.000000 ... 0.000000 0.000000\n","25% 33281.000000 127.855661 ... 188.703461 0.111270\n","50% 43815.000000 160.405836 ... 303.942659 0.351683\n","75% 55711.000000 196.513934 ... 450.002580 0.605584\n","max 66123.000000 469.849929 ... 7084.018710 2.195392\n","\n","[8 rows x 214 columns]\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"XvPVmxJPABOw"},"source":["# Data Preprocessing"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"IGzlc3kCALx3","executionInfo":{"status":"ok","timestamp":1616598173435,"user_tz":0,"elapsed":518,"user":{"displayName":"Charlotte Breakwell","photoUrl":"","userId":"15588921800197350603"}},"outputId":"e59cb5f8-bbfa-4699-b78e-9f27366c0c23"},"source":["train_mord3d_full = train_mord3d.iloc[:, 1:-3].dropna(axis= 1, how=\"any\")\n","train_mord3d_full.shape"],"execution_count":8,"outputs":[{"output_type":"execute_result","data":{"text/plain":["(13449, 173)"]},"metadata":{"tags":[]},"execution_count":8}]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"ks-QZ-giAwel","executionInfo":{"status":"ok","timestamp":1616598193341,"user_tz":0,"elapsed":690,"user":{"displayName":"Charlotte 
Breakwell","photoUrl":"","userId":"15588921800197350603"}},"outputId":"2510135b-0a09-4925-d28c-440cbbad1e2d"},"source":["test_mord3d_full = test_mord3d[train_mord3d_full.columns]\n","test_mord3d_full.shape"],"execution_count":9,"outputs":[{"output_type":"execute_result","data":{"text/plain":["(3363, 173)"]},"metadata":{"tags":[]},"execution_count":9}]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"46tKyHLvAwoT","executionInfo":{"status":"ok","timestamp":1616598207599,"user_tz":0,"elapsed":872,"user":{"displayName":"Charlotte Breakwell","photoUrl":"","userId":"15588921800197350603"}},"outputId":"16764fb2-093b-4313-b82d-d0d846f8cfbc"},"source":["train_PCA = decomposition.PCA(n_components=.95)\n","scaler_for_PCA = preprocessing.StandardScaler()\n","train_mord3d_PCA = train_PCA.fit_transform(scaler_for_PCA.fit_transform(train_mord3d_full))\n","test_mord3d_PCA = train_PCA.transform(scaler_for_PCA.transform(test_mord3d_full))\n","print(train_mord3d_PCA.shape, test_mord3d_PCA.shape)"],"execution_count":10,"outputs":[{"output_type":"stream","text":["(13449, 45) (3363, 45)\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"h6_n2nGTw9OK","executionInfo":{"status":"ok","timestamp":1616598271819,"user_tz":0,"elapsed":465,"user":{"displayName":"Charlotte Breakwell","photoUrl":"","userId":"15588921800197350603"}}},"source":["train_crystals[\"is_centrosymmetric\"] = train_crystals[\"is_centrosymmetric\"].astype(int)"],"execution_count":12,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"6t6t8uKK3ewx"},"source":["## Initial Models"]},{"cell_type":"markdown","metadata":{"id":"uu2z9RWM9ygY"},"source":[""]},{"cell_type":"code","metadata":{"id":"C71wz1cc3j3O","executionInfo":{"status":"ok","timestamp":1616600095974,"user_tz":0,"elapsed":409,"user":{"displayName":"Charlotte 
Breakwell","photoUrl":"","userId":"15588921800197350603"}}},"source":["#RandomForest"],"execution_count":25,"outputs":[]},{"cell_type":"code","metadata":{"id":"uycZmmBmFQQS","executionInfo":{"status":"ok","timestamp":1616600097174,"user_tz":0,"elapsed":476,"user":{"displayName":"Charlotte Breakwell","photoUrl":"","userId":"15588921800197350603"}}},"source":["target = \"is_centrosymmetric\""],"execution_count":26,"outputs":[]},{"cell_type":"code","metadata":{"id":"_uO4XyOO_MXF","executionInfo":{"status":"ok","timestamp":1616600098479,"user_tz":0,"elapsed":470,"user":{"displayName":"Charlotte Breakwell","photoUrl":"","userId":"15588921800197350603"}}},"source":["from sklearn.ensemble import RandomForestClassifier"],"execution_count":27,"outputs":[]},{"cell_type":"code","metadata":{"id":"7pcd6oIv_Ubw","executionInfo":{"status":"ok","timestamp":1616600130849,"user_tz":0,"elapsed":460,"user":{"displayName":"Charlotte Breakwell","photoUrl":"","userId":"15588921800197350603"}}},"source":["clf = RandomForestClassifier(n_estimators = 100)"],"execution_count":28,"outputs":[]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"ToED_6dOxX0p","executionInfo":{"status":"ok","timestamp":1616600158264,"user_tz":0,"elapsed":10672,"user":{"displayName":"Charlotte Breakwell","photoUrl":"","userId":"15588921800197350603"}},"outputId":"1ad34233-af38-4217-b972-75065e7e42de"},"source":["clf = RandomForestClassifier(n_estimators = 100)\n","clf.fit(train_mord3d_PCA,train_crystals[target])"],"execution_count":29,"outputs":[{"output_type":"execute_result","data":{"text/plain":["RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,\n"," criterion='gini', max_depth=None, max_features='auto',\n"," max_leaf_nodes=None, max_samples=None,\n"," min_impurity_decrease=0.0, min_impurity_split=None,\n"," min_samples_leaf=1, min_samples_split=2,\n"," min_weight_fraction_leaf=0.0, n_estimators=100,\n"," n_jobs=None, oob_score=False, 
random_state=None,\n"," verbose=0, warm_start=False)"]},"metadata":{"tags":[]},"execution_count":29}]},{"cell_type":"code","metadata":{"id":"xBI8LVpQ4MyU","executionInfo":{"status":"ok","timestamp":1616600161860,"user_tz":0,"elapsed":664,"user":{"displayName":"Charlotte Breakwell","photoUrl":"","userId":"15588921800197350603"}}},"source":["y_pred = clf.predict(train_mord3d_PCA)"],"execution_count":30,"outputs":[]},{"cell_type":"code","metadata":{"id":"5fWwA46h4M7A"},"source":[""],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"a1uynQ7fGA-p"},"source":["#Cross-validation"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"THA2-XEg_Yz_","executionInfo":{"status":"ok","timestamp":1616600179354,"user_tz":0,"elapsed":506,"user":{"displayName":"Charlotte Breakwell","photoUrl":"","userId":"15588921800197350603"}}},"source":["from sklearn.model_selection import cross_val_score"],"execution_count":31,"outputs":[]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"9_LyyF3m4Ss_","executionInfo":{"status":"ok","timestamp":1616600286536,"user_tz":0,"elapsed":89737,"user":{"displayName":"Charlotte Breakwell","photoUrl":"","userId":"15588921800197350603"}},"outputId":"5c219c6c-a470-487b-8c1e-6bb23784cced"},"source":["np.mean(cross_val_score(clf, train_mord3d_PCA, train_crystals[target], cv=10))"],"execution_count":32,"outputs":[{"output_type":"execute_result","data":{"text/plain":["0.848762502212781"]},"metadata":{"tags":[]},"execution_count":32}]},{"cell_type":"markdown","metadata":{"id":"0BmlIJ3lKAUp"},"source":["### Outputting predictions for automated checking \n","Link: https://github.com/stevenkbennett/fons_datathon_testing"]},{"cell_type":"code","metadata":{"id":"dEhCsnCHGE8n","executionInfo":{"status":"ok","timestamp":1616600287183,"user_tz":0,"elapsed":638,"user":{"displayName":"Charlotte Breakwell","photoUrl":"","userId":"15588921800197350603"}}},"source":["predictions = 
clf.predict(test_mord3d_PCA)  # clf is the RandomForest fitted above; logreg is never defined in this notebook"],"execution_count":33,"outputs":[]},{"cell_type":"code","metadata":{"id":"RuDLlv10KLr8","executionInfo":{"status":"ok","timestamp":1616600315535,"user_tz":0,"elapsed":497,"user":{"displayName":"Charlotte Breakwell","photoUrl":"","userId":"15588921800197350603"}}},"source":["# np.savetxt(\"is_centrosymmetric.csv\", predictions)\n","# Alternatively, if using pandas, you can use the following:\n","pd.DataFrame(predictions).to_csv(save_path+\"task_2_predictions.csv\", header=False, index=False)"],"execution_count":34,"outputs":[]},{"cell_type":"code","metadata":{"id":"KZOjGVb1M8nY"},"source":[""],"execution_count":null,"outputs":[]}]}
1 change: 1 addition & 0 deletions Packing_coefficient(Master).ipynb

Large diffs are not rendered by default.

Loading