import pandas as pd
import camelot
# Location of the supplier-list PDF that the tables are extracted from.
filename = 'data/Apple-Supplier-List.pdf'

# Parse every page ('1-end') with Camelot's "stream" flavor.
tables = camelot.read_pdf(filename, flavor='stream', pages='1-end')

# Stack the per-page tables into one frame, trimming each page:
#   * first table: skip the first five rows and the last row,
#   * middle tables: skip the last row,
#   * last table: keep rows 4 through 11 only.
# (Presumably the trimmed rows are page headers/footers - confirm against
# the PDF.)  Column 2 is discarded and the two remaining columns get
# descriptive names; the index is renumbered from 0.
companies = (
    pd.concat([
        tables[0].df[5:-1],
        *(table.df[:-1] for table in tables[1:-1]),
        tables[-1].df[4:12],
    ])
    .drop(columns=[2])
    .rename(columns={0: 'supplier_name', 1: 'address'})
    .reset_index(drop=True)
)

# Peek at a few rows of the combined table.
companies[10:15]
def fix_multirow(companies, mr_col, merge_col, multiline_vals=(None, '')):
    """Merge table rows that are continuations of the row above them.

    A row counts as a continuation when its stripped ``mr_col`` value is one
    of ``multiline_vals``.  Its ``merge_col`` text is appended (after a single
    space) to the ``merge_col`` of the closest preceding row that was kept.
    When the stripped ``mr_col`` value is non-empty, the row's ``mr_col`` text
    is appended to the kept row's ``mr_col`` in the same way.  The
    continuation row itself is left out of the result.

    Several continuation rows in a row are all folded into the same kept
    row.  A continuation row with nothing above it (the very first row) has
    no row to merge into and is kept unchanged.

    Args:
        companies: DataFrame whose ``mr_col`` and ``merge_col`` cells are
            strings.  It is not modified.
        mr_col: Name of the column whose value marks a continuation row.
        merge_col: Name of the column whose text is appended to the row above.
        multiline_vals: Stripped ``mr_col`` values that mark a continuation
            row.  The default matches an empty cell; ``None`` is kept in the
            default for compatibility even though a stripped string never
            equals it.

    Returns:
        A new DataFrame with the same columns, the continuation rows folded
        into their predecessors, and a fresh 0..n-1 index.
    """
    kept = []  # one plain dict per surviving row, in original order
    for _, row in companies.iterrows():
        marker = row[mr_col].strip()
        if marker in multiline_vals and kept:
            # Fold into the last *kept* row rather than the physically
            # previous one, so chains of continuation rows are not lost.
            target = kept[-1]
            target[merge_col] += ' ' + row[merge_col]
            if marker:
                target[mr_col] += ' ' + row[mr_col]
        else:
            # to_dict() detaches the values from the input frame, so the
            # in-place string appends above never touch the caller's data.
            kept.append(row.to_dict())
    return pd.DataFrame(kept, columns=companies.columns)
# Trailing fragments of supplier names that were split onto a row of their
# own; a row whose supplier name is exactly one of these is folded back
# into the row above it.
multiline_corps = [
    '(Foxconn)',
    '(Foxlink)',
    '& Systemtechnik AG',
    'Co., Ltd.',
    'Co., Ltd. (TSMC)',
    'Manufacturing Co., Ltd.',
    'Products Co., Ltd.',
    'Technology Co., Ltd.',
]

# Pass 1: rows with a blank supplier name continue the address above.
c1 = fix_multirow(companies, mr_col='supplier_name', merge_col='address')
# Pass 2: rows with a blank address continue the supplier name above.
c2 = fix_multirow(c1, mr_col='address', merge_col='supplier_name')
# Pass 3: rows whose supplier name is a known trailing fragment are merged
# (both name and address) into the row above.
final = fix_multirow(
    c2,
    mr_col='supplier_name',
    merge_col='address',
    multiline_vals=multiline_corps,
)
Geocoding through the Google Maps API is not free (and it is slow), so we add a caching layer.
import configparser
import sys

# Read the Google Maps API key from the local 'secrets.ini' file
# (section 'GoogleMaps', option 'ApiKey').
config = configparser.ConfigParser()
config.read('secrets.ini')
MAPS_API_KEY = config['GoogleMaps']['ApiKey']

# Put the current directory on the import path so the project's
# apple_suppliers package can be imported.
sys.path.append(".")
from apple_suppliers.geolocation import Geolocation

# Geocoding helper built with the API key.
gl = Geolocation(MAPS_API_KEY)

# Try the helper on a single address.
gl.extract_region('1600 Amphitheatre Parkway, Mountain View, CA')

from apple_suppliers.log_progress import log_progress

# Geocode every supplier address, wrapped in log_progress.
# NOTE(review): presumably geocode() stores each result in gl.cache and
# log_progress shows a progress indicator - confirm in apple_suppliers.
for addr in log_progress(final['address']):
    gl.geocode(addr)
# Manually fix addresses that didn't geocode correctly: each key is an
# address as it appears in the supplier list, each value is the alternative
# query whose geocoding result is stored in the cache under that key.
manual_geocode_queries = {
    '560 Moo 2 Bangpa-in Industrial Estate, Udomsorayuth Road, T.Klong-Jik, Bang Pa-In, Ayutthaya, Thailand':
        'Udomsorayuth Road, T.Klong-Jik, Bang Pa-In, Ayutthaya, Thailand',
    'No. 18 Pan Long Shan Road, Jiangyin, Jiangsu, China':
        'Pan Long Shan, Jiangyin, Jiangsu, China',
    'No. 2275 Chengliu Zhonglu, Jiading District, Shanghai, China':
        'Prent Corp. No. 2275 Chengliu Zhonglu, Jiading District, Shanghai, China',
    '727 Kihara, Kiyotake-cho, Miyazaki, Japan':
        'Kihara, Kiyotake-cho, Miyazaki, Japan',
}
for listed_address, replacement_query in manual_geocode_queries.items():
    gl.cache[listed_address] = gl.geocode(replacement_query)

# Save the cache, including the manual overrides above.
gl.write_cache()
# Add one derived column per geocoding attribute, each computed from the
# supplier's address with the matching extractor on the geocoding helper.
for column, extractor in (
    ('lat', gl.extract_lat),
    ('lng', gl.extract_lng),
    ('country', gl.extract_country),
    ('region', gl.extract_region),
):
    final[column] = final['address'].apply(extractor)

# Save the enriched table, then load it back from the CSV (the first CSV
# column is the saved index).
final.to_csv('data/suppliers.csv')
final = pd.read_csv('data/suppliers.csv', index_col=0)

# Horizontal bar chart of the number of suppliers per country, sorted
# ascending by count.
final['country'].value_counts(ascending=True).plot(kind='barh', figsize=(16, 8))
import html

from ipywidgets import HTML
from ipyleaflet import Map, MarkerCluster, CircleMarker, WMSLayer, LayersControl, basemaps, basemap_to_tiles

# First base layer: OpenStreetMap (Mapnik) tiles.
mapnik = basemap_to_tiles(basemaps.OpenStreetMap.Mapnik)
mapnik.base = True

# Second base layer: satellite imagery.
# NOTE(review): the URL is an {x}/{y}/{z} tile template; confirm that
# WMSLayer is the intended layer class for it.
google_satellite = WMSLayer(
    name='Google Satelite',
    url="https://mt1.google.com/vt/lyrs=s&x={x}&y={y}&z={z}",
    attribution='Google',
)
google_satellite.base = True

# World-level view holding both base layers.
m = Map(center=(30, 0), zoom=2, layers=[google_satellite, mapnik])

# One circle marker per supplier, with a popup showing its name and address.
markers = []
for i, row in final.iterrows():
    cm = CircleMarker(location=(row.lat, row.lng), radius=5)
    # The name and address are plain text taken from the PDF (names contain
    # '&', for instance), so escape them before embedding them in markup.
    # str() keeps non-string cells working as they did with str.format().
    cm.popup = HTML("<b>{name}</b><br>{address}".format(
        name=html.escape(str(row.supplier_name)),
        address=html.escape(str(row.address)),
    ))
    markers.append(cm)

# Add the markers as a single cluster layer, plus a control for switching
# between layers.
m.add_layer(MarkerCluster(markers=markers, name='Apple Suppliers'))
m.add_control(LayersControl())

# Display the map.
m