Source code for alex.applications.PublicTransportInfoEN.data.preprocessing.stops_to_streets_experiment

#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
A script that takes mta stops, it splits them by special characters and each item takes for a street

"""

from __future__ import unicode_literals
import codecs
import os


[docs]def get_column_index(header, caption, default):
    for i, h in enumerate(header.split(',')):
        if h.strip() == caption:
            return i
    return default


[docs]def load_list(filename):
    lines = []
    with codecs.open(filename, 'r', 'UTF-8') as fh_in:
        for line in fh_in:
            line = line.strip()
            # handle comments (strip or skip)
            if line.startswith('#'):
                continue
            lines.append(line)
    return lines


[docs]def remove_duplicities(lines):
    data = remove_following_duplicities(sorted(lines))
    chunks = group_by_name(data)
    return [average_same_stops(chunks.get(key)) for key in sorted(chunks.keys())]


[docs]def group_by_name(data):
    dict = {}
    for line in data:
        if line.startswith('#'):
            name = '#'
        else:
            name = line.split('\t')[0]

        if not name in dict:
            dict[name] = []
        dict[name].append(line)

    return dict


[docs]def remove_following_duplicities(lines):
    previous = "could_not_be_possibly_a_previous_line"
    output = []
    for line in sorted(lines):
        if previous == line:
            continue
        output.append(line)
        previous = line
    return output


[docs]def average_same_stops(same_stops):
    stop = ""
    city = ""
    longitude_sum = float(0)
    latitude_sum = float(0)
    for line in same_stops:
        if line.startswith('#'):
            return ";".join(same_stops)  # join comments to one line
        stop, city, geo = line.split('\t')
        longitude, latitude = geo.split('|')
        longitude_sum += float(longitude)
        latitude_sum += float(latitude)
    return "\t".join([stop, city, str(longitude_sum/len(same_stops)) + '|' + str(latitude_sum/len(same_stops))])


[docs]def extract_stops(lines):
    data = ["#stop\tcity\tlongitude|latitude"]

    for line in lines:
        if not line:
            continue
        if line.startswith('#'):
            continue
        stop, main_city, _ = line.split("\t", 2)

        if '-' in stop:
            streets = stop.split('-')
        elif '(' in stop:
            streets = stop.replace(')', '').split('(')
        elif '/' in stop:
            streets = stop.split('/')
        elif '&' in stop:
            streets = stop.split('&')
        else:
            continue

        data.extend([street + '\t' + main_city + "\tNan|Nan" for street in streets])
    return data

[docs]def write_data(file_name, data):
    with codecs.open(file_name, "w", 'UTF-8') as fh_out:
        for line in remove_duplicities(data):
            print >> fh_out, line


[docs]def main():
    file_out = "./streets_experiment.csv"
    main_city = "New York"

    # initialization
    file_stops = "/home/m2rtin/alex/alex/applications/PublicTransportInfoEN/data/stops.locations.csv"

    # load list of stops
    lines = load_list(file_stops)
    data = extract_stops(lines)
    print "writing to " + os.path.abspath(file_out)
    write_data(file_out, data)

if __name__ == '__main__':
    main()