import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
%matplotlib inline
Using the dataset "Bachelors’ Degrees by Major and Sex, Massachusetts, 2015-2016" from the National Center for Education Statistics, this journal examines the sex distribution of Massachusetts college students in STEM fields.
# Read the degrees-by-major-and-sex table from the CSV file and preview
# its first five rows.
Dataset = pd.read_csv("bachelors_major_ma_2015_16.csv")
Dataset.head(5)
In particular, Engineering, Physical Sciences, and Mathematics majors are examined in this journal.
# Columns kept for the analysis: institution name, total enrollment, and the
# men/women bachelor's-degree counts for each of the three majors.
major_columns = [
    'Institution Name',
    'Enrolled total (ADM2015_RV)',
    "Grand total men (C2016_A First major Engineering Bachelor's degree)",
    "Grand total women (C2016_A First major Engineering Bachelor's degree)",
    "Grand total men (C2016_A First major Physical Sciences Bachelor's degree)",
    "Grand total women (C2016_A First major Physical Sciences Bachelor's degree)",
    "Grand total men (C2016_A First major Mathematics and Statistics Bachelor's degree)",
    "Grand total women (C2016_A First major Mathematics and Statistics Bachelor's degree)",
]
# Select those columns and discard every row that has a missing value.
Major = Dataset[major_columns].dropna()
# Summary statistics of the numeric columns.
Major.describe()
# Convert the filtered table into a 2-D NumPy array: one row per institution,
# columns in the same order as Major (name, enrollment, then the men/women
# counts for each major).
# A single conversion replaces the former row-by-row np.vstack loop, which
# re-copied the growing array on every pass and stacked an empty slice on its
# final iteration.
unis = Major.to_numpy()
print(unis)
# first trial
# One marker per institution and major, men on the x axis and women on the
# y axis: green = engineering, blue = physical sciences, black = math.
# NOTE(review): the source lost its indentation; the loop is assumed to end
# after the three plot calls, with the labels and show() executed once.
for name, enroll, m_engin, w_engin, m_phys, w_phys, m_math, w_math in unis:
    plt.plot(m_engin, w_engin, color='green', marker='o')
    plt.plot(m_phys, w_phys, color='blue', marker='o')
    plt.plot(m_math, w_math, color='black', marker='o')

plt.xlabel('# of male students')
plt.ylabel('# of female students')
plt.title("Sex distribution in a given college")
plt.show()
The null hypothesis is that women and men are equally represented in STEM fields. To test this, I plotted the data and gathered summary statistics for each major.
To constrain the dataset to just the majors of interest and remove any NaNs, I did the following:
# Restrict the dataset to the institution name, total enrollment, and the
# men/women degree counts of the three majors, then drop incomplete rows.
columns_of_interest = [
    'Institution Name',
    'Enrolled total (ADM2015_RV)',
    "Grand total men (C2016_A First major Engineering Bachelor's degree)",
    "Grand total women (C2016_A First major Engineering Bachelor's degree)",
    "Grand total men (C2016_A First major Physical Sciences Bachelor's degree)",
    "Grand total women (C2016_A First major Physical Sciences Bachelor's degree)",
    "Grand total men (C2016_A First major Mathematics and Statistics Bachelor's degree)",
    "Grand total women (C2016_A First major Mathematics and Statistics Bachelor's degree)",
]
Major = Dataset[columns_of_interest].dropna()
# Preview the first rows of the cleaned table.
Major.head()
The data are then narrowed to the men's and the women's engineering degree counts. To see whether there is a linear relationship between the number of men and the number of women engineering majors, a linear regression was fitted.
# Women's and men's engineering degree counts, one value per institution.
d_w_engin = Major.loc[:, "Grand total women (C2016_A First major Engineering Bachelor's degree)"]
d_m_engin = Major.loc[:, "Grand total men (C2016_A First major Engineering Bachelor's degree)"]

# Ordinary least squares of the women's count on the men's count.
# The formula names the two Series above, not columns of Major; presumably
# the formula machinery looks them up in the calling namespace.
engin_model = smf.ols(formula='d_w_engin ~ d_m_engin', data=Major)
est = engin_model.fit()
est.summary()
The following plot shows men vs. women engineering majors. The black line is the expected trend if there were an equal number of male and female students; the red line is the fitted trend for engineering majors.
# Scatter of women vs. men engineering degree counts, one point per row.
plt.scatter(d_m_engin, d_w_engin, color='green', marker='o')
plt.xlabel('# of male students')
plt.ylabel('# of female students')
plt.title("Sex distribution in a given college (Engineering)")

# Red line: straight line with a hard-coded intercept and slope
# (presumably copied from the regression summary - re-check if the data change).
o_x = np.linspace(0, 500, 100)  # 100 evenly spaced x values on [0, 500]
o_y = 0.3145 * o_x + 11.3304
plt.plot(o_x, o_y, color='red')

# Black line: y = x, i.e. equal numbers of men and women.
p_x = o_x
plt.plot(p_x, p_x, color='black')
plt.show()
# Histograms of the engineering counts on shared axes:
# men in blue first, then women in red drawn on top.
for counts, shade in ((d_m_engin, 'blue'), (d_w_engin, 'red')):
    plt.hist(counts, color=shade)
plt.title('Distribution')
plt.xlabel('# of students')
plt.ylabel('Frequency')
plt.show()
The summary statistics can then be computed and used to test whether the null hypothesis is a plausible model.
# Z statistic for the difference between the mean number of women and the
# mean number of men engineering majors.
n_obs = len(d_w_engin)

# Means and population standard deviations (ddof=0) of each group.
mean_w, std_w = d_w_engin.mean(), d_w_engin.std(ddof=0)
mean_m, std_m = d_m_engin.mean(), d_m_engin.std(ddof=0)

obs_diff = abs(mean_w - mean_m)

# Under the null hypothesis the expected difference is zero.
exp_diff = 0

# Standard error of the difference of two means, both groups having n_obs values.
var_over_n_w = (std_w**2) / n_obs
var_over_n_m = (std_m**2) / n_obs
std_err = np.sqrt(var_over_n_w + var_over_n_m)

z = (obs_diff - exp_diff) / std_err

print('Sample difference:' , obs_diff)
print('Expected population difference:' , exp_diff)
print('Standard Error:' , std_err)
print('Z = ' , z)
The same analysis can then be performed for the Physical Sciences and Math majors. In hindsight, a function would have made it quicker to obtain these results.
# Women's and men's physical-sciences degree counts, one value per institution.
d_w_phys = Major.loc[:, "Grand total women (C2016_A First major Physical Sciences Bachelor's degree)"]
d_m_phys = Major.loc[:, "Grand total men (C2016_A First major Physical Sciences Bachelor's degree)"]

# Ordinary least squares of the women's count on the men's count; the formula
# refers to the two Series above rather than to columns of Major.
phys_model = smf.ols(formula='d_w_phys ~ d_m_phys', data=Major)
est = phys_model.fit()
est.summary()
# New figure for the physical-sciences scatter (women vs. men per row).
ln_inc_edu = plt.figure()
plt.scatter(d_m_phys, d_w_phys, color='Blue', marker='o')
plt.xlabel('# of male students')
plt.ylabel('# of female students')
plt.title("Sex distribution in a given college (Physical Sciences)")

# Red line: hard-coded intercept and slope (presumably taken from the
# regression summary - re-check if the data change).
o_x = np.linspace(0, 80, 100)  # 100 evenly spaced x values on [0, 80]
o_y = 0.3939 * o_x + 7.2122
plt.plot(o_x, o_y, color='red')

# Black line: y = x, i.e. equal numbers of men and women.
p_x = o_x
plt.plot(p_x, p_x, color='black')
plt.show()
# Histograms of the physical-sciences counts on shared axes:
# men in blue first, then women in red drawn on top.
for counts, shade in ((d_m_phys, 'blue'), (d_w_phys, 'red')):
    plt.hist(counts, color=shade)
plt.xlabel('# of students')
plt.ylabel('Frequency')
plt.title('Distribution')
plt.show()
# Z statistic for the difference between the mean number of women and the
# mean number of men physical-sciences majors.

# Number of observations in the series being tested. This used to be taken
# from the engineering series, a copy-paste slip that only gave the right
# value because all series share the same rows.
n_obs = len(d_w_phys)

# Group means and standard deviations (np.std uses ddof=0 by default).
mean_w = np.mean(d_w_phys)
std_w = np.std(d_w_phys)
mean_m = np.mean(d_m_phys)
std_m = np.std(d_m_phys)

obs_diff = np.abs(mean_w - mean_m)

# Under the null hypothesis the expected difference is zero.
exp_diff = 0

# Standard error of the difference of two means, both groups having n_obs values.
std_err = np.sqrt(((std_w**2) / n_obs) + ((std_m**2) / n_obs))
z = (obs_diff - exp_diff) / std_err

print('Sample difference:' , obs_diff)
print('Expected population difference:' , exp_diff)
print('Standard Error:' , std_err)
print('Z = ' , z)
# Women's and men's mathematics-and-statistics degree counts, one value per
# institution.
d_w_math = Major.loc[:, "Grand total women (C2016_A First major Mathematics and Statistics Bachelor's degree)"]
d_m_math = Major.loc[:, "Grand total men (C2016_A First major Mathematics and Statistics Bachelor's degree)"]

# Ordinary least squares of the women's count on the men's count; the formula
# refers to the two Series above rather than to columns of Major.
math_model = smf.ols(formula='d_w_math ~ d_m_math', data=Major)
est = math_model.fit()
est.summary()
# New figure for the math scatter (women vs. men per row).
ln_inc_edu = plt.figure()
plt.scatter(d_m_math, d_w_math, color='black', marker='o')
plt.xlabel('# of male students')
plt.ylabel('# of female students')
plt.title("Sex distribution in a given college (Math)")

# Red line: hard-coded intercept and slope (presumably taken from the
# regression summary - re-check if the data change).
o_x = np.linspace(0, 120, 100)  # 100 evenly spaced x values on [0, 120]
o_y = 0.4344 * o_x + 3.5532
plt.plot(o_x, o_y, color='red')

# Black line: y = x, i.e. equal numbers of men and women.
p_x = o_x
plt.plot(p_x, p_x, color='black')
plt.show()
# Histograms of the math counts on shared axes:
# men in blue first, then women in red drawn on top.
for counts, shade in ((d_m_math, 'blue'), (d_w_math, 'red')):
    plt.hist(counts, color=shade)
plt.xlabel('# of students')
plt.ylabel('Frequency')
plt.title('Distribution')
plt.show()
# Z statistic for the difference between the mean number of women and the
# mean number of men mathematics-and-statistics majors.

# Number of observations in the series being tested. This used to be taken
# from the engineering series, a copy-paste slip that only gave the right
# value because all series share the same rows.
n_obs = len(d_w_math)

# Group means and standard deviations (np.std uses ddof=0 by default).
mean_w = np.mean(d_w_math)
std_w = np.std(d_w_math)
mean_m = np.mean(d_m_math)
std_m = np.std(d_m_math)

obs_diff = np.abs(mean_w - mean_m)

# Under the null hypothesis the expected difference is zero.
exp_diff = 0

# Standard error of the difference of two means, both groups having n_obs values.
std_err = np.sqrt(((std_w**2) / n_obs) + ((std_m**2) / n_obs))
z = (obs_diff - exp_diff) / std_err

print('Sample difference:' , obs_diff)
print('Expected population difference:' , exp_diff)
print('Standard Error:' , std_err)
print('Z = ' , z)
The plots show that many colleges in Massachusetts do have more male STEM majors than female STEM majors. One challenge was having to repeat (copy and paste) the code for each major.
# IPython-only help query for plt.hist; the trailing "?" is notebook syntax,
# not plain Python.
plt.hist?
# Total number of engineering majors across all institutions, plus one
# marker (men on x, women on y) for every institution that has any.
# The running total is initialised here so the cell does not depend on
# another cell having set it, and does not keep growing when re-run.
tot_engin = 0
for row in unis:
    # Columns 2 and 3 hold the men's and women's engineering counts.
    m_engin, w_engin = row[2], row[3]
    loc_engin = m_engin + w_engin
    tot_engin += loc_engin
    # Compare both counts explicitly: "m_engin or w_engin > 0" parses as
    # "m_engin or (w_engin > 0)".
    if m_engin > 0 or w_engin > 0:
        plt.plot(m_engin, w_engin, color='green', marker='o')
plt.show()
print(tot_engin)
# Total number of physical-sciences majors across all institutions, plus one
# marker (men on x, women on y) for every institution that has any.
# The running total is initialised here so the cell does not depend on
# another cell having set it, and does not keep growing when re-run.
tot_phys = 0
for row in unis:
    # Columns 4 and 5 hold the men's and women's physical-sciences counts.
    m_phys, w_phys = row[4], row[5]
    loc_phys = m_phys + w_phys
    tot_phys += loc_phys
    # Compare both counts explicitly: "m_phys or w_phys > 0" parses as
    # "m_phys or (w_phys > 0)".
    if m_phys > 0 or w_phys > 0:
        plt.plot(m_phys, w_phys, color='blue', marker='o')
plt.show()
print(tot_phys)
# Histogram of the share of women among math majors, one value per institution.
# The earlier version called an undefined "concatenate", overwrote the share
# on every pass (so only one value reached the histogram), and divided by
# zero for institutions without math majors.
m_math = unis[:, 6].astype(float)  # men's math counts, all institutions
w_math = unis[:, 7].astype(float)  # women's math counts, all institutions
loc_math = m_math + w_math

# Only institutions with at least one math major have a defined share.
has_math_majors = loc_math > 0
pcent_w = w_math[has_math_majors] / loc_math[has_math_majors]

plt.hist(pcent_w, bins = 30)
plt.show()
# Total number of math majors across all institutions, plus one marker
# (men on x, women on y) for every institution that has any.
# The running total is initialised here so the cell does not depend on
# another cell having set it, and does not keep growing when re-run.
tot_math = 0
for row in unis:
    # Columns 6 and 7 hold the men's and women's math counts.
    m_math, w_math = row[6], row[7]
    loc_math = m_math + w_math
    tot_math += loc_math
    # Compare both counts explicitly: "m_math or w_math > 0" parses as
    # "m_math or (w_math > 0)".
    if m_math > 0 or w_math > 0:
        plt.plot(m_math, w_math, color='black', marker='o')
plt.show()
print(tot_math)
# Show the first row of the array, then the whole array.
print(unis[0])
print(unis)
# Single-row slices of Major, selected by row position.
# NOTE(review): the variable names suggest specific institutions, but the
# positions are hard-coded - confirm each row is the intended school.
bu = Major.iloc[28:29]
uml = Major.iloc[177:178]
uma = Major.iloc[173:174]
wpi = Major.iloc[189:190]
wit = Major.iloc[182:183]
# Reset every running total to zero. Only the combined totals per major are
# updated below; the per-sex totals stay at zero.
tot_m_engin = tot_w_engin = tot_engin = 0
tot_m_math = tot_w_math = tot_math = 0
tot_m_phys = tot_w_phys = tot_phys = 0

# Walk over the institutions and add each one's men + women count to the
# running total of the corresponding major.
for name, enroll, m_engin, w_engin, m_phys, w_phys, m_math, w_math in unis:
    loc_engin = m_engin + w_engin
    tot_engin += loc_engin

    loc_phys = m_phys + w_phys
    tot_phys += loc_phys

    loc_math = m_math + w_math
    tot_math += loc_math

# Disabled plot of majors against total enrollment, kept for reference:
#   plt.plot(enroll, (m_engin + w_engin) , color='green', marker='o')
#   plt.plot(enroll, (m_phys + w_phys) , color='blue', marker='o')
#   plt.plot(enroll,(m_math + w_math) , color='black', marker='o')
#   plt.show()
print(tot_engin, tot_phys, tot_math)