# -*- coding: utf-8 -*-
"""SummarizingData.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1Wo7aUHTjFTRVpiK4efjRHI2gsA6fRip5

The script reads ``recent_grads.csv`` into a pandas DataFrame, prints a
series of summaries (shape, dtypes, ``describe`` output), replaces the
``'UN'`` marker in the salary columns with NaN, reports the row with the
largest ``sharewomen`` value, and prints the correlation matrix of the
``unemployed`` and ``low_wage_jobs`` columns.
"""

# numpy is imported up front: the NaN replacement below needs ``np.nan``, and
# the notebook originally referenced it before any ``import numpy`` had run.
import numpy as np
import pandas as pd


def replace_missing_marker(frame, columns, marker="UN"):
    """Replace ``marker`` with NaN in the given columns of ``frame``.

    The frame is modified in place; cells in columns that are not listed
    are left untouched.

    Args:
        frame: DataFrame to clean.
        columns: Names of the columns to scan for ``marker``.
        marker: Cell value that stands for a missing entry.

    Returns:
        The same ``frame`` object, for convenience.
    """
    for column in columns:
        frame.loc[frame[column] == marker, column] = np.nan
    return frame


def main():
    """Run the notebook's summarizing steps in their original order."""
    # Use pandas to read in recent_grads_url
    recent_grads = pd.read_csv("/content/recent_grads.csv")

    # Print the shape
    print(recent_grads.shape)

    # Imported here so the module can still be imported outside Colab.
    from google.colab import drive

    drive.mount('/content/drive')

    # Print .dtypes
    print(recent_grads.dtypes)

    # Output summary statistics
    print(recent_grads.describe())

    # Exclude data of type object
    print(recent_grads.describe(exclude=["object"]))

    # Names of the columns we're searching for missing values
    columns = ['median', 'p25th', 'p75th']

    # Take a look at the dtypes
    print(recent_grads[columns].dtypes)

    # Find how missing values are represented
    print(recent_grads["median"].unique())

    # Replace missing values with NaN. The notebook ran this loop twice;
    # one pass is enough because the second finds no 'UN' cells left.
    replace_missing_marker(recent_grads, columns, marker='UN')

    # Select sharewomen column
    sw_col = recent_grads['sharewomen']

    # Output first five rows
    print(sw_col.head())

    # Use max to output maximum values
    max_sw = recent_grads['sharewomen'].max()

    # Print column max
    print(max_sw)

    # Output the row containing the maximum percentage of women
    print(recent_grads[(recent_grads['sharewomen'] == max_sw)])

    # Convert to numpy array
    recent_grads_np = np.array(recent_grads[['unemployed', 'low_wage_jobs']])

    # Print the type of recent_grads_np
    print(type(recent_grads_np))

    # Correlation matrix of the two columns (column 0 vs. column 1)
    print(np.corrcoef(recent_grads_np[:, 0], recent_grads_np[:, 1]))


if __name__ == "__main__":
    main()