# Source code for alex.applications.PublicTransportInfoEN.data.preprocessing.mta_to_csv

#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
A script that takes an MTA stops file, selects the important fields and saves them (works mainly with GTFS data).
Usage:

./mta_to_csv.py [-m: main_city] [-o: output_file] stops.txt
"""

from __future__ import unicode_literals
import codecs
import os
import sys
from getopt import getopt


def get_column_index(header, caption, default):
    """Return the position of the column called ``caption`` in a header line.

    The header is split on commas and every caption is compared after
    stripping surrounding whitespace. Captions wrapped in double quotes or
    preceded by a byte order mark are matched as well.

    :param header: the comma-separated header line
    :param caption: the column name to look for
    :param default: value returned when no column matches
    :return: zero-based index of the first matching column, or ``default``
    """
    for index, name in enumerate(header.split(',')):
        name = name.strip()
        if name == caption:
            return index
        # a quoted caption or a BOM glued to the first caption would otherwise
        # never match and silently send the caller to the default index
        bare = name.lstrip(u'\ufeff').strip().strip('"').strip()
        if bare == caption:
            return index
    return default


def load_list(filename, skip_comments=True):
    """Read a UTF-8 text file and return its data lines and its header line.

    The first line of the file is taken as the header and is returned with
    its line ending. Every following line is stripped of surrounding
    whitespace; empty lines are kept in the result.

    :param filename: path of the file to read
    :param skip_comments: if true, lines starting with ``#`` are dropped;
        otherwise they are kept with the leading ``#`` characters removed
    :return: a ``(lines, header)`` tuple
    """
    lines = []
    # 'utf-8-sig' reads plain UTF-8 too, but drops a leading byte order mark
    # so that it does not end up glued to the first header caption
    with codecs.open(filename, 'r', 'utf-8-sig') as fh_in:
        header = fh_in.readline()
        for line in fh_in:
            line = line.strip()
            # handle comments (strip or skip)
            if line.startswith('#'):
                if skip_comments:
                    continue
                line = line.lstrip('#')
            lines.append(line)
    return lines, header


def remove_duplicities(lines):
    """Collapse the records so that every stop name appears exactly once.

    Identical lines are dropped first, the remaining lines are grouped by
    stop name and each group is merged into one line by
    ``average_same_stops``.

    :param lines: tab-separated stop records and ``#`` comment lines
    :return: list with one merged line per group, ordered by group name
    """
    # remove_following_duplicities sorts its input itself, so there is no
    # need to sort here as well
    unique_lines = remove_following_duplicities(lines)
    groups = group_by_name(unique_lines)
    return [average_same_stops(groups[name]) for name in sorted(groups)]


def group_by_name(data):
    """Group lines by the text in front of their first tab character.

    All lines starting with ``#`` are collected under the single key ``'#'``.

    :param data: iterable of tab-separated lines
    :return: dictionary mapping each name to the list of its lines, in the
        order in which the lines were encountered
    """
    groups = {}
    for line in data:
        name = '#' if line.startswith('#') else line.split('\t')[0]
        groups.setdefault(name, []).append(line)
    return groups


def remove_following_duplicities(lines):
    """Return the sorted lines with repeated entries removed.

    :param lines: iterable of lines in any order
    :return: new sorted list in which every distinct line occurs once
    """
    unique = []
    for line in sorted(lines):
        # after sorting, equal lines are neighbours; comparing with the last
        # kept line needs no placeholder value that could clash with real data
        if unique and unique[-1] == line:
            continue
        unique.append(line)
    return unique


def average_same_stops(same_stops):
    """Merge the records of one group into a single output line.

    Every record has the form ``stop<TAB>city<TAB>longitude|latitude``. The
    coordinates of all records are averaged; the stop and city texts are
    taken from the last record. If a line starting with ``#`` is met, the
    group is treated as comments and all its lines are joined with ``;``.

    :param same_stops: non-empty list of lines belonging to one group
    :return: the merged line
    :raises ZeroDivisionError: if ``same_stops`` is empty
    :raises ValueError: if a record does not have the expected layout
    """
    stop = ""
    city = ""
    longitude_sum = 0.0
    latitude_sum = 0.0
    for line in same_stops:
        if line.startswith('#'):
            return ";".join(same_stops)  # join comments to one line
        stop, city, geo = line.split('\t')
        longitude, latitude = geo.split('|')
        longitude_sum += float(longitude)
        latitude_sum += float(latitude)

    count = len(same_stops)
    averaged_geo = str(longitude_sum / count) + '|' + str(latitude_sum / count)
    return "\t".join([stop, city, averaged_geo])


def extract_fields(lines, header, main_city, skip_comments=True):
    """Convert comma-separated stop rows into tab-separated stop records.

    The ``stop_name``, ``stop_lat`` and ``stop_lon`` columns are located via
    the header (falling back to positions 2, 4 and 5). Each row becomes
    ``stop<TAB>main_city<TAB>longitude|latitude``.

    :param lines: data rows without the header line
    :param header: the comma-separated header line
    :param main_city: city name written into every record
    :param skip_comments: if true, rows starting with ``#`` are ignored;
        otherwise their leading ``#`` characters are removed and the rest is
        processed like any other row
    :return: list of records, preceded by one ``#``-prefixed caption line
    """
    stop_index = get_column_index(header, "stop_name", 2)
    lat_index = get_column_index(header, "stop_lat", 4)
    lon_index = get_column_index(header, "stop_lon", 5)
    # the header is the same for every row, so count its columns only once
    column_count = len(header.split(','))
    highest_index = max(stop_index, lat_index, lon_index)

    data = ["#stop\tcity\tlongitude|latitude"]

    for line in lines:
        if not line:
            continue
        if line.startswith('#'):
            if skip_comments:
                continue
            # str.lstrip returns a new string, the result has to be kept
            line = line.lstrip('#')

        fields = line.strip().split(',')
        if len(fields) > column_count and '"' in line:
            # commas inside double quotes belong to the value, not to the
            # row structure; turn them into semicolons in every quoted part
            # (the odd-numbered pieces after splitting on the quote character)
            pieces = line.split('"')
            for position in range(1, len(pieces), 2):
                pieces[position] = pieces[position].replace(',', ';')
            fields = '"'.join(pieces).split(',')
        if len(fields) != column_count:
            print("different lengths!")
        if len(fields) <= highest_index:
            # the row is too short to hold all three required values
            print("skipping a line with missing fields!")
            continue

        stop = fields[stop_index].strip().strip('"')
        latitude = fields[lat_index].strip().strip('"')
        longitude = fields[lon_index].strip().strip('"')

        data.append('\t'.join([stop, main_city, longitude + '|' + latitude]))
    return data

def write_data(file_name, data):
    """Deduplicate the records and write them to a UTF-8 file, one per line.

    :param file_name: path of the output file (overwritten if it exists)
    :param data: records as accepted by ``remove_duplicities``
    """
    with codecs.open(file_name, "w", 'UTF-8') as fh_out:
        for line in remove_duplicities(data):
            # explicit write instead of the Python 2 only "print >>" statement
            fh_out.write(line + '\n')


def main():
    """Parse the command line, convert the stops file and write the result.

    Options: ``-o`` sets the output file (default ``./mta_to_csv.csv``) and
    ``-m`` sets the city stored with every stop (default ``New York``).
    Exactly one positional argument, the stops file, is required; otherwise
    the module usage text is printed and the script exits.
    """
    from getopt import GetoptError

    file_out = "./mta_to_csv.csv"
    main_city = "New York"

    try:
        opts, files = getopt(sys.argv[1:], 'o:m:')
    except GetoptError as error:
        # report an unknown option together with the usage text instead of
        # ending with a traceback
        sys.exit("%s\n%s" % (error, __doc__))

    for opt, arg in opts:
        if opt == '-o':
            file_out = arg
        elif opt == '-m':
            main_city = arg

    # sanity check
    if len(files) != 1:
        sys.exit(__doc__)

    # initialization
    file_stops = files[0]

    # load list of stops
    lines, header = load_list(file_stops)
    data = extract_fields(lines, header, main_city)
    print("writing to " + os.path.abspath(file_out))
    write_data(file_out, data)

# run the conversion only when this file is executed as a script
if __name__ == '__main__':
    main()