-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path3_occupation_ranking.py
81 lines (61 loc) · 2.78 KB
/
3_occupation_ranking.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
# -*- coding: utf-8 -*-
"""
Created on Fri May 31 20:34:51 2019
@author: Alex
"""
import pandas as pd
import numpy as np
import os
os.chdir('D:/Users/Alex/Git_Repositories/Thesis')
column_titles = ['SIPP_name', 'TJBOCC1', 'occ_code']
occ_codes = pd.read_excel('2008_SIPP_SOC_crosswalk.xls', header = None,
names = column_titles)
occ_wages = pd.read_excel('national_M2008_dl.xls', na_values = ['*', '#'])
# Check for duplciates
print(occ_codes[occ_codes.duplicated('occ_code', keep = False)])
print(occ_wages[occ_wages.duplicated('occ_code', keep = False)])
# Strip whitespace
occ_codes['occ_code'] = occ_codes['occ_code'].str.strip()
occ_wages['occ_code'] = occ_wages['occ_code'].str.strip()
# Merge
occ_merged = pd.merge(occ_codes, occ_wages, how = 'left', on = ['occ_code'], validate = '1:1')
# Rank occupations
# Warning: Ranking by hourly rather than annual wages leads to missing data
occ_merged['occ_rank'] = occ_merged['h_median'].rank(ascending = False)
# Drop superfluous columns
occ_small = occ_merged[['TJBOCC1', 'SIPP_name', 'occ_rank']]
# Read SIPP dataset
SIPP_Data = pd.read_pickle('SIPP_Dataset_2')
# Merge
df = pd.merge(SIPP_Data, occ_small, how = 'left', on = ['TJBOCC1'], validate = 'm:1')
# Sort
df.sort_values(['unique_id', 'ref_date'], inplace=True)
# Take first differences of rank
df['rank_diff'] = df.groupby('unique_id')['occ_rank'].diff()
# Codify to moving up or down rank
df['occ_change'] = np.nan
df['occ_change'].mask(df['rank_diff'] == 0, 0, inplace = True)
df['occ_change'].mask(df['rank_diff'] < 0, 1, inplace = True)
df['occ_change'].mask(df['rank_diff'] > 0, -1, inplace = True)
# Create binary to indicate post birth
df['post_birth'] = np.nan
df['post_birth'].mask(df['months_since_birth'] <= 0, 0, inplace = True)
df['post_birth'].mask(df['months_since_birth'] > 0, 1, inplace = True)
# Dummy variable to indicate occupation rank increase
df['occ_rank_increase'] = np.nan
df['occ_rank_increase'].mask(df['rank_diff'] == 0, 0, inplace = True)
df['occ_rank_increase'].mask(df['rank_diff'] > 0, 0, inplace = True)
df['occ_rank_increase'].mask(df['rank_diff'] < 0, 1, inplace = True)
# Dummy variable to indicate occupation rank decrease
df['occ_rank_decrease'] = np.nan
df['occ_rank_decrease'].mask(df['rank_diff'] == 0, 0, inplace = True)
df['occ_rank_decrease'].mask(df['rank_diff'] < 0, 0, inplace = True)
df['occ_rank_decrease'].mask(df['rank_diff'] > 0, 1, inplace = True)
# Generate variable to indicate employer change
# Note: does not record change if missing data separates employer change
df['employer_change'] = df.groupby('unique_id')['EENO1'].diff()
df['employer_change'].mask(
np.logical_and(df['employer_change'] != 0,
df['employer_change'].isnull() == False),
1, inplace = True)
df.to_pickle('SIPP_Dataset_3')