-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsod1_download.py
81 lines (65 loc) · 2.57 KB
/
sod1_download.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
# -*- coding: utf-8 -*-
"""
Created on Fri 10 May 2024 00:05:26
Downloads FDIC zip files to memory; processes them into nice csv.xz files
@author: Kevin.Wong
"""
import io
import lzma
import multiprocessing as mp
import os
import re
import zipfile as zf
from concurrent.futures import ProcessPoolExecutor, wait
from datetime import datetime
from os import path
import pandas as pd
import polars as pl
import requests
URL = 'https://www7.fdic.gov/sod/download/ALL_{}.zip'
os.makedirs('sod', exist_ok=True)
def download_sod(y: int, output_path: str):
# stream the zip archive to memory
r = requests.get(URL.format(y))
if r.status_code != 200:
# warn if current year not available
print(f'no sod file for year {y}', flush=True)
return
print(f'retrieved sod zip for {y}', flush=True)
# extract the ALL file; the ALL_1 and ALL_2 files are just broken-up copies
# of the whole data; not sure why the FDIC chose that format
z = zf.ZipFile(io.BytesIO(r.content))
sf_name = next(
i for i in z.namelist() if re.match(r'ALL_\d{4}\.csv', i))
# for unclear reasons polars doesn't like reading from the zip file handle
# must load it all to memory first
try:
# in newer years utf-8 encoding is used
sod = pd.read_csv(z.open(sf_name, 'r'), thousands=',')
except (pl.ComputeError, UnicodeDecodeError):
# but in earlier ears only latin-1 encoding is used
sod = pd.read_csv(
z.open(sf_name, 'r'), thousands=',',
encoding='latin-1')
# # zip can be improperly parsed as float; fix to int if so
# if sod['ZIPBR'].dtype == float:
# sod = sod.with_columns(pl.col('ZIPBR').cast(pl.Int64))
# save
sod.to_csv(lzma.open(output_path, 'wb', preset=9))
print(f'wrote sod {y}', flush=True)
if __name__ == '__main__':
futures = []
with ProcessPoolExecutor(6, mp_context=mp.get_context('spawn')) as e:
# sod starts in 1994 --
# there was sod prior to 1994, but it did not include thrifts; this is
# due to FSLIC (Federal Savings and Loan Insurance Corporation) and RTC
# (Resolution Trust Corporation) merger into the FDIC circa 1993
for year in range(1994, datetime.now().year):
output_path = path.join('sod', f'sod_{year}.csv.xz')
if path.exists(output_path) and \
path.getsize(output_path) != 0:
# skip files that already exist
continue
# download_sod(year)
futures.append(e.submit(download_sod, year, output_path))
wait(futures)