|
| 1 | +# Data Visualization |
| 2 | +[<- Back to Home](../../README.md) |
| 3 | + |
| 4 | +```python |
| 5 | +import pandas as pd |
| 6 | +import numpy as np |
| 7 | +from plotly import express as px |
| 8 | +from matplotlib import pyplot as plt |
| 9 | +import missingno as msn |
| 10 | +import seaborn as sns |
| 11 | +%matplotlib inline |
| 12 | +``` |
| 13 | + |
| 14 | +## Reading File |
| 15 | + |
| 16 | + |
| 17 | +```python |
| 18 | +df = pd.read_csv("fake_job_postings.csv") |
| 19 | +df.head(2) |
| 20 | +``` |
| 21 | + |
| 22 | + |
| 23 | + |
| 24 | + |
| 25 | +<div> |
| 26 | +<style scoped> |
| 27 | + .dataframe tbody tr th:only-of-type { |
| 28 | + vertical-align: middle; |
| 29 | + } |
| 30 | + |
| 31 | + .dataframe tbody tr th { |
| 32 | + vertical-align: top; |
| 33 | + } |
| 34 | + |
| 35 | + .dataframe thead th { |
| 36 | + text-align: right; |
| 37 | + } |
| 38 | +</style> |
| 39 | +<table border="1" class="dataframe"> |
| 40 | + <thead> |
| 41 | + <tr style="text-align: right;"> |
| 42 | + <th></th> |
| 43 | + <th>job_id</th> |
| 44 | + <th>title</th> |
| 45 | + <th>location</th> |
| 46 | + <th>department</th> |
| 47 | + <th>salary_range</th> |
| 48 | + <th>company_profile</th> |
| 49 | + <th>description</th> |
| 50 | + <th>requirements</th> |
| 51 | + <th>benefits</th> |
| 52 | + <th>telecommuting</th> |
| 53 | + <th>has_company_logo</th> |
| 54 | + <th>has_questions</th> |
| 55 | + <th>employment_type</th> |
| 56 | + <th>required_experience</th> |
| 57 | + <th>required_education</th> |
| 58 | + <th>industry</th> |
| 59 | + <th>function</th> |
| 60 | + <th>fraudulent</th> |
| 61 | + </tr> |
| 62 | + </thead> |
| 63 | + <tbody> |
| 64 | + <tr> |
| 65 | + <th>0</th> |
| 66 | + <td>1</td> |
| 67 | + <td>Marketing Intern</td> |
| 68 | + <td>US, NY, New York</td> |
| 69 | + <td>Marketing</td> |
| 70 | + <td>NaN</td> |
| 71 | + <td>We're Food52, and we've created a groundbreaki...</td> |
| 72 | + <td>Food52, a fast-growing, James Beard Award-winn...</td> |
| 73 | + <td>Experience with content management systems a m...</td> |
| 74 | + <td>NaN</td> |
| 75 | + <td>0</td> |
| 76 | + <td>1</td> |
| 77 | + <td>0</td> |
| 78 | + <td>Other</td> |
| 79 | + <td>Internship</td> |
| 80 | + <td>NaN</td> |
| 81 | + <td>NaN</td> |
| 82 | + <td>Marketing</td> |
| 83 | + <td>0</td> |
| 84 | + </tr> |
| 85 | + <tr> |
| 86 | + <th>1</th> |
| 87 | + <td>2</td> |
| 88 | + <td>Customer Service - Cloud Video Production</td> |
| 89 | + <td>NZ, , Auckland</td> |
| 90 | + <td>Success</td> |
| 91 | + <td>NaN</td> |
| 92 | + <td>90 Seconds, the worlds Cloud Video Production ...</td> |
| 93 | + <td>Organised - Focused - Vibrant - Awesome!Do you...</td> |
| 94 | + <td>What we expect from you:Your key responsibilit...</td> |
| 95 | + <td>What you will get from usThrough being part of...</td> |
| 96 | + <td>0</td> |
| 97 | + <td>1</td> |
| 98 | + <td>0</td> |
| 99 | + <td>Full-time</td> |
| 100 | + <td>Not Applicable</td> |
| 101 | + <td>NaN</td> |
| 102 | + <td>Marketing and Advertising</td> |
| 103 | + <td>Customer Service</td> |
| 104 | + <td>0</td> |
| 105 | + </tr> |
| 106 | + </tbody> |
| 107 | +</table> |
| 108 | +</div> |
| 109 | + |
| 110 | + |
| 111 | + |
| 112 | + |
| 113 | +```python |
| 114 | +print(df.columns) |
| 115 | +``` |
| 116 | + |
| 117 | + Index(['job_id', 'title', 'location', 'department', 'salary_range', |
| 118 | + 'company_profile', 'description', 'requirements', 'benefits', |
| 119 | + 'telecommuting', 'has_company_logo', 'has_questions', 'employment_type', |
| 120 | + 'required_experience', 'required_education', 'industry', 'function', |
| 121 | + 'fraudulent'], |
| 122 | + dtype='object') |
| 123 | + |
| 124 | + |
| 125 | +**Question 1 ::** How many data points are present in the data? |
| 126 | + |
| 127 | +**Question 2 ::** How many features are present in the data? |
| 128 | + |
| 129 | + |
| 130 | +```python |
| 131 | +print("sol1:- Total Number Of DataPoints:- {}.".format(df.shape[0])) |
| 132 | +print("sol2:- Total Number of features:- {}.".format(df.shape[1])) |
| 133 | +``` |
| 134 | + |
| 135 | + sol1:- Total Number Of DataPoints:- 17880. |
| 136 | + sol2:- Total Number of features:- 18. |
| 137 | + |
| 138 | + |
| 139 | +**Question 3 ::** Does the data contain null values? |
| 140 | + |
| 141 | + |
| 142 | +```python |
| 143 | +df.info() |
| 144 | +``` |
| 145 | + |
| 146 | + <class 'pandas.core.frame.DataFrame'> |
| 147 | + RangeIndex: 17880 entries, 0 to 17879 |
| 148 | + Data columns (total 18 columns): |
| 149 | + job_id 17880 non-null int64 |
| 150 | + title 17880 non-null object |
| 151 | + location 17534 non-null object |
| 152 | + department 6333 non-null object |
| 153 | + salary_range 2868 non-null object |
| 154 | + company_profile 14572 non-null object |
| 155 | + description 17879 non-null object |
| 156 | + requirements 15185 non-null object |
| 157 | + benefits 10670 non-null object |
| 158 | + telecommuting 17880 non-null int64 |
| 159 | + has_company_logo 17880 non-null int64 |
| 160 | + has_questions 17880 non-null int64 |
| 161 | + employment_type 14409 non-null object |
| 162 | + required_experience 10830 non-null object |
| 163 | + required_education 9775 non-null object |
| 164 | + industry 12977 non-null object |
| 165 | + function 11425 non-null object |
| 166 | + fraudulent 17880 non-null int64 |
| 167 | + dtypes: int64(5), object(13) |
| 168 | + memory usage: 2.5+ MB |
| 169 | + |
| 170 | + |
| 171 | + |
| 172 | +```python |
| 173 | +msn.matrix(df) |
| 174 | +``` |
| 175 | + |
| 176 | + |
| 177 | + |
| 178 | + |
| 179 | + <matplotlib.axes._subplots.AxesSubplot at 0x7f78397e4208> |
| 180 | + |
| 181 | + |
| 182 | + |
| 183 | + |
| 184 | + |
| 185 | + |
| 186 | + |
| 187 | + |
| 188 | +```python |
| 189 | +msn.heatmap(df) |
| 190 | +``` |
| 191 | + |
| 192 | + |
| 193 | + |
| 194 | + |
| 195 | + <matplotlib.axes._subplots.AxesSubplot at 0x7f7833f10780> |
| 196 | + |
| 197 | + |
| 198 | + |
| 199 | + |
| 200 | + |
| 201 | + |
| 202 | + |
| 203 | + |
| 204 | +```python |
| 205 | +msn.bar(df) |
| 206 | +``` |
| 207 | + |
| 208 | + |
| 209 | + |
| 210 | + |
| 211 | + <matplotlib.axes._subplots.AxesSubplot at 0x7f783a141940> |
| 212 | + |
| 213 | + |
| 214 | + |
| 215 | + |
| 216 | + |
| 217 | + |
| 218 | + |
| 219 | + |
| 220 | +```python |
| 221 | +for item in df.columns: |
| 222 | + print("{} uniques: {}".format(item,df[item].unique().size)) |
| 223 | +``` |
| 224 | + |
| 225 | + job_id uniques: 17880 |
| 226 | + title uniques: 11231 |
| 227 | + location uniques: 3106 |
| 228 | + department uniques: 1338 |
| 229 | + salary_range uniques: 875 |
| 230 | + company_profile uniques: 1710 |
| 231 | + description uniques: 14802 |
| 232 | + requirements uniques: 11969 |
| 233 | + benefits uniques: 6206 |
| 234 | + telecommuting uniques: 2 |
| 235 | + has_company_logo uniques: 2 |
| 236 | + has_questions uniques: 2 |
| 237 | + employment_type uniques: 6 |
| 238 | + required_experience uniques: 8 |
| 239 | + required_education uniques: 14 |
| 240 | + industry uniques: 132 |
| 241 | + function uniques: 38 |
| 242 | + fraudulent uniques: 2 |
| 243 | + |
| 244 | + |
| 245 | +## Features:- |
| 246 | +* job_id:- unique ID of each job posting |
| 247 | +* title:- title of the job |
| 248 | +* location:- location of the job |
| 249 | +* department:- job department (e.g. Marketing) |
| 250 | +* salary_range:- range of the salary |
| 251 | +* company_profile:- what the company actually does, e.g. whether it is a food company or a tech company |
| 252 | +* description:- full description of the job |
| 253 | +* requirements:- requirements for the job |
| 254 | +* benefits:- extra benefits offered with the job |
| 255 | +* telecommuting:- binary variable |
| 256 | +* has_company_logo:- binary variable |
| 257 | +* has_questions:- binary variable |
| 258 | +* employment_type:- type of employment (e.g. full-time or part-time) |
| 259 | +* required_experience:- level of experience needed (e.g. Internship) |
| 260 | +* required_education:- minimum qualification |
| 261 | +* industry:- industry of the job (e.g. Marketing and Advertising) |
| 262 | +* function:- function of the job (e.g. Marketing, Customer Service) |
| 263 | +* fraudulent:- whether the job posting is fraudulent or not |
| 264 | + |
| 265 | +**Question 4 ::** How many data points are fraudulent in the given data? |
| 266 | + |
| 267 | + |
| 268 | + |
| 269 | +```python |
| 270 | +print("sol :: Number Of Fraudent job :: {}".format(df['fraudulent'].mean() * df['fraudulent'].size)) |
| 271 | +``` |
| 272 | + |
| 273 | + sol :: Number Of Fraudent job :: 866.0 |
| 274 | + |
| 275 | + |
| 276 | + |
| 277 | +```python |
| 278 | +msn.dendrogram(df) |
| 279 | +``` |
| 280 | + |
| 281 | + |
| 282 | + |
| 283 | + |
| 284 | + <matplotlib.axes._subplots.AxesSubplot at 0x7f783a2177b8> |
| 285 | + |
| 286 | + |
| 287 | + |
| 288 | + |
| 289 | + |
| 290 | + |
0 commit comments