#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
A script that takes mta stops file and it selects important fields and saves them (works with GTFS mainly)
Usage:
./mta_to_csv.py [-m: main_city] [-o: output_file] stops.txt
"""
from __future__ import unicode_literals
import codecs
import os
import sys
from getopt import getopt
def get_column_index(header, caption, default):
    """Return the position of the column named ``caption`` in a CSV header.

    The header is split on commas and every column name is compared after
    stripping surrounding whitespace, so the trailing newline of a raw
    header line does not prevent a match on the last column.

    :param header: header line with column names separated by commas
    :param caption: exact column name to look for
    :param default: value returned when no column matches
    :return: zero-based index of the first matching column, or ``default``
    """
    for index, column in enumerate(header.split(',')):
        if column.strip() == caption:
            return index
    return default
def load_list(filename, skip_comments=True):
    """Read a text file and return its data lines together with its header.

    The first line of the file is taken as the header and returned
    unmodified (including its line ending).  Every following line is
    stripped of surrounding whitespace; blank lines are kept as empty
    strings.

    :param filename: path of the file to read
    :param skip_comments: when true, lines starting with ``#`` are dropped;
        otherwise they are kept with the leading ``#`` characters removed
    :return: tuple ``(lines, header)``
    """
    lines = []
    # 'utf-8-sig' decodes plain UTF-8 as well, but additionally drops a
    # leading byte order mark so it cannot end up glued to the first
    # column name of the header.
    with codecs.open(filename, 'r', 'utf-8-sig') as fh_in:
        header = fh_in.readline()
        for line in fh_in:
            line = line.strip()
            if line.startswith('#'):
                if skip_comments:
                    continue
                # keep the comment text, drop the leading marker(s)
                line = line.lstrip('#')
            lines.append(line)
    return lines, header
def remove_duplicities(lines):
    """Collapse the given lines to a single line per stop name.

    Exact duplicate lines are removed first, the remaining lines are
    grouped with ``group_by_name`` and every group is merged into one line
    with ``average_same_stops``.

    :param lines: iterable of output lines
    :return: list with one merged line per group, ordered by group name
    """
    # remove_following_duplicities sorts its input itself, so no extra
    # sort is needed before calling it
    unique_lines = remove_following_duplicities(lines)
    groups = group_by_name(unique_lines)
    return [average_same_stops(groups[name]) for name in sorted(groups)]
def group_by_name(data):
    """Group lines by their first tab-separated field.

    Lines starting with ``#`` are all collected under the key ``'#'``.
    Within a group the lines keep the order in which they appear in
    ``data``.

    :param data: iterable of lines
    :return: dict mapping group name to the list of lines in that group
    """
    groups = {}
    for line in data:
        if line.startswith('#'):
            name = '#'
        else:
            name = line.split('\t')[0]
        groups.setdefault(name, []).append(line)
    return groups
def remove_following_duplicities(lines):
    """Return the lines sorted, with repeated lines collapsed into one.

    :param lines: iterable of lines; it is sorted here, so the caller does
        not need to pass it in any particular order
    :return: new sorted list in which every distinct line appears once
    """
    output = []
    for line in sorted(lines):
        # after sorting, equal lines are adjacent, so comparing with the
        # last kept line is enough (and needs no magic sentinel value)
        if output and output[-1] == line:
            continue
        output.append(line)
    return output
def average_same_stops(same_stops):
    """Merge a group of lines describing the same stop into one line.

    A regular line has three tab-separated fields: stop, city and a pair
    of numbers joined by ``|`` (parsed here as ``longitude|latitude``).
    The result keeps the stop and city of the last line and replaces the
    pair with the arithmetic mean of each number over the whole group.

    If the group contains a comment line (starting with ``#``), the lines
    are not parsed at all and are returned joined with ``;`` instead.

    :param same_stops: non-empty list of lines belonging to one group
    :return: single merged line
    :raises ValueError: when a regular line does not have the expected
        three fields or two ``|``-separated numbers
    :raises ZeroDivisionError: when ``same_stops`` is empty
    """
    # decide up front so that a comment group is never partially parsed
    if any(line.startswith('#') for line in same_stops):
        return ";".join(same_stops)  # join comments to one line

    stop = ""
    city = ""
    longitude_sum = 0.0
    latitude_sum = 0.0
    for line in same_stops:
        stop, city, geo = line.split('\t')
        longitude, latitude = geo.split('|')
        longitude_sum += float(longitude)
        latitude_sum += float(latitude)

    count = len(same_stops)
    averaged = str(longitude_sum / count) + '|' + str(latitude_sum / count)
    return "\t".join([stop, city, averaged])
def write_data(file_name, data):
    """Write the de-duplicated lines to a UTF-8 encoded file.

    The lines are passed through ``remove_duplicities`` and written one
    per row, each terminated by a newline.

    :param file_name: path of the output file (overwritten if it exists)
    :param data: iterable of lines to process and write
    """
    # process first: if this step fails, an existing output file is not
    # truncated
    rows = remove_duplicities(data)
    with codecs.open(file_name, "w", 'UTF-8') as fh_out:
        for line in rows:
            fh_out.write(line + "\n")
def main():
    """Command-line entry point.

    Options:
        -o FILE   output file (default ``./mta_to_csv.csv``)
        -m CITY   main city passed on to ``extract_fields``
                  (default ``New York``)

    Exactly one positional argument, the stops file, is required;
    otherwise the module usage text is printed and the script exits.
    """
    # local import: only `getopt` itself is imported at module level
    from getopt import GetoptError

    file_out = "./mta_to_csv.csv"
    main_city = "New York"

    try:
        opts, files = getopt(sys.argv[1:], 'o:m:')
    except GetoptError as err:
        # unknown option or missing option argument: show why, then usage
        sys.exit("%s\n%s" % (err, __doc__))

    for opt, arg in opts:
        if opt == '-o':
            file_out = arg
        elif opt == '-m':
            main_city = arg

    # sanity check: exactly one input file is expected
    if len(files) != 1:
        sys.exit(__doc__)

    # initialization
    file_stops = files[0]

    # load list of stops
    lines, header = load_list(file_stops)
    data = extract_fields(lines, header, main_city)

    # parenthesised form prints the same text on Python 2 and Python 3
    print("writing to " + os.path.abspath(file_out))
    write_data(file_out, data)
if __name__ == '__main__':
    # run the conversion only when executed as a script, not on import
    main()