Source code for alex.applications.PublicTransportInfoEN.data.preprocessing.us_cities_to_csv

#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
A script that takes us cities (city\tstate_code)file and state-codes and it joins them

Usage:

./us_cities_to_csv.py [-o: output_file] cities.txt state-codes.txt
"""

from __future__ import unicode_literals
import codecs
import os
import sys
from getopt import getopt


def get_column_index(header, caption, default):
    """Return the index of the column with the given caption in the header, or the default."""
    for i, h in enumerate(header.split('-')):
        if h == caption:
            return i
    return default

def load_list(filename, skip_comments=True):
    """Load a file into a list of lines, returning the header line separately."""
    lines = []
    with codecs.open(filename, 'r', 'UTF-8') as fh_in:
        # the first line is the header; strip it so column names match exactly
        header = fh_in.readline().strip()
        for line in fh_in:
            line = line.strip()
            # handle comments (strip or skip)
            if line.startswith('#'):
                if skip_comments:
                    continue
                else:
                    line = line.lstrip('#')
            lines.append(line)
    return lines, header

def load_state_code_dict(file_state_codes, skip_comments=True):
    """Load a tab-separated file of states and their codes into a code -> state dictionary."""
    state_dict = {}
    with codecs.open(file_state_codes, 'r', 'UTF-8') as fh_in:
        for line in fh_in:
            line = line.strip()
            # handle comments (strip or skip)
            if line.startswith('#'):
                if skip_comments:
                    continue
                else:
                    line = line.lstrip('#')
            state, code = line.split('\t')
            state_dict[code] = state
    return state_dict

def remove_duplicities(lines):
    """Sort the lines, drop exact duplicates and average the geo locations
    of lines describing the same city (see the illustration below)."""
    data = remove_following_duplicities(sorted(lines))
    chunks = group_by_city_and_state(data)
    return [average_same_city(chunks.get(key)) for key in sorted(chunks.keys())]

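# For illustration (hypothetical values): after sorting and exact-duplicate
# removal, two remaining rows for the same city, e.g.
#   Springfield<TAB>Illinois<TAB>-89.6|39.8
#   Springfield<TAB>Illinois<TAB>-89.8|39.6
# fall into one group, and average_same_city() merges them into
#   Springfield<TAB>Illinois<TAB>-89.7|39.7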
def group_by_city_and_state(data):
    """Group lines by their city + state key; comment lines are grouped under '#'."""
    groups = {}
    for line in data:
        if line.startswith('#'):
            key = '#'
        else:
            city, state = line.split('\t')[0:2]
            key = city + '_' + state
        if key not in groups:
            groups[key] = []
        groups[key].append(line)
    return groups

def remove_following_duplicities(lines):
    """Remove consecutive duplicate lines (the input is sorted first)."""
    previous = "could_not_be_possibly_a_previous_line"
    output = []
    for line in sorted(lines):
        if previous == line:
            continue
        output.append(line)
        previous = line
    return output

def average_same_city(same_stops):
    """Average the geo coordinates of several lines describing the same city."""
    city = ""
    state = ""
    longitude_sum = float(0)
    latitude_sum = float(0)
    for line in same_stops:
        if line.startswith('#'):
            return ";".join(same_stops)  # join comments to one line
        city, state, geo = line.split('\t')
        longitude, latitude = geo.split('|')
        longitude_sum += float(longitude)
        latitude_sum += float(latitude)
    return "\t".join([city, state,
                      str(longitude_sum / len(same_stops)) + '|' +
                      str(latitude_sum / len(same_stops))])

def extract_fields(lines, header, state_dictionary, skip_comments=True):
    """Extract the city, state and geo location columns from the raw data lines."""
    state_code_index = get_column_index(header, "state", 1)
    city_index = get_column_index(header, "city", 2)
    lat_index = get_column_index(header, "lat", 3)
    lon_index = get_column_index(header, "lng", 4)
    data = ["#city\tstate\tlongitude|latitude"]
    for line in lines:
        if not line:
            continue
        # handle comments (strip or skip)
        if line.startswith('#'):
            if skip_comments:
                continue
            else:
                line = line.lstrip('#')
        fields = line.strip().split(',')
        if len(fields) > len(header.split('-')):
            print "different lengths!"
        state_code = fields[state_code_index].strip().strip('"')
        city = fields[city_index].strip().strip('"')
        latitude = fields[lat_index].strip().strip('"')
        longitude = fields[lon_index].strip().strip('"')
        state = state_dictionary[state_code]
        data.append('\t'.join([city, state, longitude + '|' + latitude]))
    return data

def write_data(file_name, data):
    """Write the de-duplicated data lines into the output file."""
    with codecs.open(file_name, "w", 'UTF-8') as fh_out:
        for line in remove_duplicities(data):
            print >> fh_out, line

def main():
    opts, files = getopt(sys.argv[1:], 'o:')
    file_out = "./us_cities_to_csv.csv"
    for opt, arg in opts:
        if opt == '-o':
            file_out = arg

    # sanity check
    if len(files) != 2:
        sys.exit(__doc__)

    # initialization
    file_stops = files[0]
    file_state_codes = files[1]

    # load the state codes and the list of cities, then join them
    state_dictionary = load_state_code_dict(file_state_codes)
    lines, header = load_list(file_stops)
    data = extract_fields(lines, header, state_dictionary)

    print "writing to " + os.path.abspath(file_out)
    write_data(file_out, data)

if __name__ == '__main__':
    main()