# datasummer / summarizingdata.py
# AliUsama98's picture
# Upload summarizingdata.py
# 27900ad
# -*- coding: utf-8 -*-
"""SummarizingData.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1Wo7aUHTjFTRVpiK4efjRHI2gsA6fRip5
"""
# Import pandas
import pandas as pd
# Load the recent-graduates CSV found under /content into a DataFrame
recent_grads = pd.read_csv(filepath_or_buffer="/content/recent_grads.csv")
# Report the table dimensions as a (rows, columns) tuple
print(recent_grads.shape)
# Mount Google Drive under /content/drive using the google.colab drive helper.
# NOTE(review): the CSV above is read from /content directly and nothing visible
# in this script reads from /content/drive — confirm this mount is still needed.
from google.colab import drive
drive.mount('/content/drive')
# Show the dtype pandas assigned to each column
column_dtypes = recent_grads.dtypes
print(column_dtypes)
# Summary statistics using describe()'s default column selection
default_summary = recent_grads.describe()
print(default_summary)
# Summary statistics again, this time explicitly leaving out object-dtype columns
non_object_summary = recent_grads.describe(exclude=["object"])
print(non_object_summary)
# Imported up front: np.nan is needed by the replacement loop below, so numpy
# must be in scope before that loop runs.
import numpy as np
import pandas as pd

# Names of the columns we're searching for missing values
columns = ['median', 'p25th', 'p75th']
# Take a look at the dtypes
print(recent_grads[columns].dtypes)
# Find how missing values are represented
print(recent_grads["median"].unique())
# Replace the 'UN' placeholder with NaN, one column at a time. A single pass is
# sufficient: after it, no 'UN' entries remain in these columns.
# NOTE(review): this only swaps the placeholder; it does not convert the column
# dtype. If numeric statistics are needed afterwards, a conversion step
# (e.g. pd.to_numeric) is presumably required — confirm against the data.
for column in columns:
    recent_grads.loc[recent_grads[column] == 'UN', column] = np.nan
# Keep a handle on the sharewomen column for the steps below
sw_col = recent_grads['sharewomen']
# Preview its first five entries
print(sw_col.head())
# numpy import retained from the notebook cell
import numpy as np
# Largest value found in the sharewomen column
max_sw = sw_col.max()
print(max_sw)
# Show every row whose sharewomen equals that maximum
is_max_share = sw_col == max_sw
print(recent_grads.loc[is_max_share])
import numpy as np
# Copy the two columns being compared into a plain NumPy array
recent_grads_np = np.array(recent_grads.loc[:, ['unemployed', 'low_wage_jobs']])
# Confirm the conversion produced a NumPy array
print(type(recent_grads_np))
# Transposing yields one row per original column; unpack the two rows as the
# two variables whose correlation matrix is printed
print(np.corrcoef(*recent_grads_np.T))