#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
A script that takes us cities (city\tstate_code)file and state-codes and it joins them
Usage:
./us_cities_to_csv.py [-o: output_file] cities.txt state-codes.txt
"""
from __future__ import unicode_literals
import codecs
import os
import sys
from getopt import getopt
def get_column_index(header, caption, default):
    """Return the position of *caption* among the '-'-separated fields of
    *header*, or *default* when the caption is not present."""
    columns = header.split('-')
    try:
        return columns.index(caption)
    except ValueError:
        return default
def load_list(filename, skip_comments=True):
    """Read *filename* as UTF-8 and return ``(lines, header)``.

    The first line of the file is returned verbatim as the header (newline
    included); every following line is stripped of surrounding whitespace.
    Lines starting with '#' are dropped when *skip_comments* is true,
    otherwise the leading '#' characters are removed and the rest is kept.
    """
    entries = []
    with codecs.open(filename, 'r', 'UTF-8') as fh_in:
        header = fh_in.readline()
        for raw in fh_in:
            entry = raw.strip()
            if entry.startswith('#'):
                if skip_comments:
                    continue
                entry = entry.lstrip('#')
            entries.append(entry)
    return entries, header
def load_state_code_dict(file_state_codes, skip_comments=True):
    """Load a tab-separated ``state<TAB>code`` file into a ``{code: state}`` dict.

    The file is read as UTF-8. Lines starting with '#' are dropped when
    *skip_comments* is true, otherwise the leading '#' characters are removed
    and the remainder is parsed like a normal line. Blank lines are ignored
    (the original crashed on them with a ValueError while unpacking).
    Note: the local was renamed from ``dict`` to avoid shadowing the builtin.
    """
    codes = {}
    with codecs.open(file_state_codes, 'r', 'UTF-8') as fh_in:
        for raw in fh_in:
            line = raw.strip()
            if not line:
                continue  # tolerate blank lines instead of raising
            if line.startswith('#'):
                if skip_comments:
                    continue
                line = line.lstrip('#')
            state, code = line.split('\t')
            codes[code] = state
    return codes
def remove_duplicities(lines):
    """Deduplicate *lines*, then merge records that share the same
    city/state key into a single averaged record, returned in key order."""
    deduped = remove_following_duplicities(sorted(lines))
    grouped = group_by_city_and_state(deduped)
    return [average_same_city(grouped[key]) for key in sorted(grouped)]
def group_by_city_and_state(data):
    """Group tab-separated lines by a ``city_state`` key.

    Comment lines (starting with '#') are all collected under the key '#'.
    Returns a dict mapping each key to the list of its lines, in input order.
    (Rewritten to stop shadowing the ``dict`` builtin and to use
    ``setdefault`` instead of the ``if not key in`` membership test.)
    """
    groups = {}
    for line in data:
        if line.startswith('#'):
            key = '#'
        else:
            city, state = line.split('\t')[0:2]
            key = city + '_' + state
        groups.setdefault(key, []).append(line)
    return groups
def remove_following_duplicities(lines):
    """Return *lines* sorted with consecutive duplicates collapsed to one.

    Bug fix: the original seeded ``previous`` with the magic string
    ``"could_not_be_possibly_a_previous_line"``, so an input line that
    happened to equal that string was silently dropped. A fresh ``object()``
    can never compare equal to a string, making the sentinel safe.
    """
    previous = object()  # unique sentinel: never equal to any real line
    output = []
    for line in sorted(lines):
        if line == previous:
            continue
        output.append(line)
        previous = line
    return output
def average_same_city(same_stops):
    """Collapse duplicate stops of one city/state into a single record.

    Each entry is ``city<TAB>state<TAB>longitude|latitude``; the result keeps
    the city and state and averages the coordinates over all entries. If any
    entry is a comment (starts with '#'), the whole group is treated as
    comments and joined into one ';'-separated line instead.
    """
    city = state = ""
    lon_total = 0.0
    lat_total = 0.0
    for stop in same_stops:
        if stop.startswith('#'):
            return ";".join(same_stops)  # merge comment group into one line
        city, state, geo = stop.split('\t')
        lon, lat = geo.split('|')
        lon_total += float(lon)
        lat_total += float(lat)
    count = len(same_stops)
    coords = str(lon_total / count) + '|' + str(lat_total / count)
    return "\t".join([city, state, coords])
def write_data(file_name, data):
    """Write the deduplicated *data* to *file_name* as UTF-8, one record
    per line (records come from :func:`remove_duplicities`)."""
    with codecs.open(file_name, "w", 'UTF-8') as fh_out:
        for line in remove_duplicities(data):
            # plain write() instead of the Python-2-only ``print >> fh`` statement,
            # so the function works under both Python 2 and Python 3
            fh_out.write(line + '\n')
def main():
    """Command-line entry point.

    Parses ``[-o output_file] cities.txt state-codes.txt``, joins the two
    input files via :func:`extract_fields`, and writes the result with
    :func:`write_data`. Exits with the usage text when the two positional
    files are not supplied.
    """
    # 'o:' (not the original '-o:'): a leading '-' in the shortopts string
    # would register '-' itself as an option letter, which was never intended
    opts, files = getopt(sys.argv[1:], 'o:')
    file_out = "./us_cities_to_csv.csv"
    for opt, arg in opts:
        if opt == '-o':
            file_out = arg
    # sanity check: exactly two positional input files are required
    if len(files) != 2:
        sys.exit(__doc__)
    # initialization
    file_stops = files[0]
    file_state_codes = files[1]
    # load state codes and the list of cities, then join them
    state_dictionary = load_state_code_dict(file_state_codes)
    lines, header = load_list(file_stops)
    data = extract_fields(lines, header, state_dictionary)
    # print() call is valid in both Python 2 and Python 3 for a single argument
    print("writing to " + os.path.abspath(file_out))
    write_data(file_out, data)


if __name__ == '__main__':
    main()