AliUsama98 committed on
Commit
27900ad
1 Parent(s): e474dfc

Upload summarizingdata.py

Browse files
Files changed (1) hide show
  1. summarizingdata.py +80 -0
summarizingdata.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# -*- coding: utf-8 -*-
"""SummarizingData.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1Wo7aUHTjFTRVpiK4efjRHI2gsA6fRip5
"""

# All imports are grouped at the top so that every name (notably ``np``) is
# bound before its first use. Previously ``np.nan`` was referenced before
# ``import numpy as np`` had run, which raises NameError when the script is
# executed from top to bottom.
import numpy as np
import pandas as pd
from google.colab import drive

# Use pandas to read in the recent-grads CSV
recent_grads = pd.read_csv("/content/recent_grads.csv")

# Print the shape
print(recent_grads.shape)

# Mount Google Drive inside the Colab runtime
drive.mount('/content/drive')

# Print .dtypes
print(recent_grads.dtypes)

# Output summary statistics
print(recent_grads.describe())

# Exclude data of type object
print(recent_grads.describe(exclude=["object"]))

# Names of the columns we're searching for missing values
columns = ['median', 'p25th', 'p75th']

# Take a look at the dtypes
print(recent_grads[columns].dtypes)

# Find how missing values are represented
print(recent_grads["median"].unique())

# Replace the 'UN' placeholder with NaN in each of the listed columns.
# The replacement is idempotent, so a single pass is sufficient.
for column in columns:
    recent_grads.loc[recent_grads[column] == 'UN', column] = np.nan

# Select sharewomen column
sw_col = recent_grads['sharewomen']

# Output first five rows
print(sw_col.head())

# Use max to output maximum values
max_sw = recent_grads['sharewomen'].max()

# Print column max
print(max_sw)

# Output the row containing the maximum percentage of women
print(recent_grads[(recent_grads['sharewomen'] == max_sw)])

# Convert the two columns of interest to a numpy array
recent_grads_np = np.array(recent_grads[['unemployed', 'low_wage_jobs']])

# Print the type of recent_grads_np
print(type(recent_grads_np))

# Correlation matrix between 'unemployed' (column 0) and 'low_wage_jobs' (column 1)
print(np.corrcoef(recent_grads_np[:, 0], recent_grads_np[:, 1]))