#Use dask to read only necessary columns at a time. Speed up EDA
import dask.dataframe as dd#similar to pandas
import pandas as pd#pandas to create small dataframes
import folium#open street map
import datetime#Convert to unix time
import time#Convert to unix time
import numpy as np#Do aritmetic operations on arrays
import seaborn as sns#Plots
import matplotlib.pylab as plt#Plots
import matplotlib #Plots
from matplotlib import rcParams#Size of plots
import gpxpy.geo#Get the haversine distance
from sklearn.cluster import MiniBatchKMeans, KMeans#Clustering
import math
import pickle
import os
mingw_path = 'C:\\Program Files\\mingw-w64\\x86_64-5.3.0-posix-seh-rt_v4-rev0\\mingw64\\bin'
os.environ['PATH'] = mingw_path + ';' + os.environ['PATH']
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
#There are 19 columns
# Lazily open the January-2015 trip file (~1.84 GB) with dask; data is only
# materialised when .compute()/.head() is called later.
month = dd.read_csv('yellow_tripdata_2015-01.csv')
print(month.columns)
The data used in the attached datasets were collected and provided to the NYC Taxi and Limousine Commission (TLC).
These are the famous NYC yellow taxis that provide transportation exclusively through street-hails. The number of taxicabs is limited by a finite number of medallions issued by the TLC. You access this mode of transportation by standing in the street and hailing an available taxi with your hand. The pickups are not pre-arranged.
FHV transportation is accessed by a pre-arrangement with a dispatcher or limo company. These FHVs are not permitted to pick up passengers via street hails, as those rides are not considered pre-arranged.
The SHL program will allow livery vehicle owners to license and outfit their vehicles with green borough taxi branding, meters, credit card machines, and ultimately the right to accept street hails in addition to pre-arranged rides.
Credits: Quora
We have collected all yellow taxi trip data from Jan-2015 to Dec-2016 (we will be using only the 2015 data).
file name | file name size | number of records | number of features |
---|---|---|---|
yellow_tripdata_2016-01 | 1.59G | 10906858 | 19 |
yellow_tripdata_2016-02 | 1.66G | 11382049 | 19 |
yellow_tripdata_2016-03 | 1.78G | 12210952 | 19 |
yellow_tripdata_2016-04 | 1.74G | 11934338 | 19 |
yellow_tripdata_2016-05 | 1.73G | 11836853 | 19 |
yellow_tripdata_2016-06 | 1.62G | 11135470 | 19 |
yellow_tripdata_2016-07 | 884Mb | 10294080 | 17 |
yellow_tripdata_2016-08 | 854Mb | 9942263 | 17 |
yellow_tripdata_2016-09 | 870Mb | 10116018 | 17 |
yellow_tripdata_2016-10 | 933Mb | 10854626 | 17 |
yellow_tripdata_2016-11 | 868Mb | 10102128 | 17 |
yellow_tripdata_2016-12 | 897Mb | 10449408 | 17 |
yellow_tripdata_2015-01 | 1.84Gb | 12748986 | 19 |
yellow_tripdata_2015-02 | 1.81Gb | 12450521 | 19 |
yellow_tripdata_2015-03 | 1.94Gb | 13351609 | 19 |
yellow_tripdata_2015-04 | 1.90Gb | 13071789 | 19 |
yellow_tripdata_2015-05 | 1.91Gb | 13158262 | 19 |
yellow_tripdata_2015-06 | 1.79Gb | 12324935 | 19 |
yellow_tripdata_2015-07 | 1.68Gb | 11562783 | 19 |
yellow_tripdata_2015-08 | 1.62Gb | 11130304 | 19 |
yellow_tripdata_2015-09 | 1.63Gb | 11225063 | 19 |
yellow_tripdata_2015-10 | 1.79Gb | 12315488 | 19 |
yellow_tripdata_2015-11 | 1.65Gb | 11312676 | 19 |
yellow_tripdata_2015-12 | 1.67Gb | 11460573 | 19 |
Field Name | Description |
---|---|
VendorID |
A code indicating the TPEP provider that provided the record.
|
tpep_pickup_datetime | The date and time when the meter was engaged. |
tpep_dropoff_datetime | The date and time when the meter was disengaged. |
Passenger_count | The number of passengers in the vehicle. This is a driver-entered value. |
Trip_distance | The elapsed trip distance in miles reported by the taximeter. |
Pickup_longitude | Longitude where the meter was engaged. |
Pickup_latitude | Latitude where the meter was engaged. |
RateCodeID | The final rate code in effect at the end of the trip.
|
Store_and_fwd_flag | This flag indicates whether the trip record was held in vehicle memory before sending to the vendor, aka “store and forward,” because the vehicle did not have a connection to the server. Y= store and forward trip N= not a store and forward trip |
Dropoff_longitude | Longitude where the meter was disengaged. |
Dropoff_latitude | Latitude where the meter was disengaged. |
Payment_type | A numeric code signifying how the passenger paid for the trip.
|
Fare_amount | The time-and-distance fare calculated by the meter. |
Extra | Miscellaneous extras and surcharges. Currently, this only includes. the $0.50 and $1 rush hour and overnight charges. |
MTA_tax | 0.50 MTA tax that is automatically triggered based on the metered rate in use. |
Improvement_surcharge | 0.30 improvement surcharge assessed trips at the flag drop. the improvement surcharge began being levied in 2015. |
Tip_amount | Tip amount – This field is automatically populated for credit card tips.Cash tips are not included. |
Tolls_amount | Total amount of all tolls paid in trip. |
Total_amount | The total amount charged to passengers. Does not include cash tips. |
# Total number of records across the twelve 2015 monthly files (Jan..Dec).
monthly_record_counts = [
    12748986, 12450521, 13351609, 13071789, 13158262, 12324935,
    11562783, 11130304, 11225063, 12315488, 11312676, 11460573,
]
samples = sum(monthly_record_counts)
print(samples)
We will be taking the data of the year 2015 (12 months of data) as train data, and test our model on the year 2016 data.
month.head(5)
# Pickups that fall outside the NYC bounding box (lon in [-74.15, -73.7004],
# lat in [40.5774, 40.9176]), excluding the (0, 0) "null island" placeholders.
# BUG FIX: the original used Python `and`/`or` between Series, which raises
# "The truth value of a Series is ambiguous" — element-wise `&`/`|` with
# parenthesised comparisons are required for pandas/dask boolean masks.
outlier_locations = month[((month.pickup_longitude != 0) & (month.pickup_latitude != 0)) &
                          ((month.pickup_longitude <= -74.15) | (month.pickup_latitude <= 40.5774) |
                           (month.pickup_longitude >= -73.7004) | (month.pickup_latitude >= 40.9176))]
#Optional
#Comment out if necessary
# Drop the first 100 outlier pickups on an OpenStreetMap view for a sanity check.
map_osm = folium.Map(location=[40.734695, -73.990372], tiles='Stamen Toner')
sample_locations = outlier_locations.head(100)
for i, j in sample_locations.iterrows():
    folium.Marker(list((j['pickup_latitude'], j['pickup_longitude']))).add_to(map_osm)
map_osm
def convert_to_unix(s):
    """Convert a 'YYYY-MM-DD HH:MM:SS' timestamp string to unix seconds (local tz)."""
    parsed = datetime.datetime.strptime(s, "%Y-%m-%d %H:%M:%S")
    return time.mktime(parsed.timetuple())
#return a dataframe with many parameters along with trip duration and trip pickup times(with outliers)
def return_with_trip_times():
    """Materialise the dask frame and append trip_times (minutes), pickup_times
    (unix seconds) and Speed (mph). Outliers are NOT removed here."""
    stamps = month[['tpep_pickup_datetime', 'tpep_dropoff_datetime']].compute()
    # Pickup / dropoff timestamps -> unix epoch seconds.
    pickup_unix = [convert_to_unix(ts) for ts in stamps['tpep_pickup_datetime'].values]
    dropoff_unix = [convert_to_unix(ts) for ts in stamps['tpep_dropoff_datetime'].values]
    # Trip durations in minutes.
    trip_minutes = (np.array(dropoff_unix) - np.array(pickup_unix)) / float(60)
    wanted = ['VendorID', 'payment_type', 'passenger_count', 'trip_distance',
              'pickup_longitude', 'pickup_latitude', 'dropoff_longitude',
              'dropoff_latitude', 'RateCodeID', 'store_and_fwd_flag']
    new_frame = month[wanted].compute()
    new_frame['trip_times'] = trip_minutes
    new_frame['pickup_times'] = pickup_unix
    # Average trip speed in miles per hour (distance is miles, time is minutes).
    new_frame['Speed'] = 60 * (new_frame['trip_distance'] / new_frame['trip_times'])
    return new_frame
#duration_dropoff
#outliers aren't still removed
#dataframe with durations and pickup unix time appended
# Materialises the whole month into a pandas frame; outliers still present.
frame_with_durations = return_with_trip_times()
def plot_boxplot(new_frame, string1, string2):
    """Draw a boxplot of column `string1` and return its values sorted ascending.

    When `string2` is non-empty, the y-axis is clipped at the column's ~95th
    percentile so the box is readable despite extreme outliers.
    """
    var = new_frame[string1].values
    var = np.sort(var, axis=None)
    if string2 != "":
        # Clip the axis at the value sitting at the 95% position.
        plt.ylim(0, var[int(float(95) * len(var) / 100)])
    sns.boxplot(y=string1, data=new_frame)
    # BUG FIX: `sns.plt` was removed in seaborn 0.8 — use matplotlib's pyplot.
    plt.title(string1 + ' in minutes without outliers')
    plt.show()
    return var
def remove_outliers(new_frame):
    """Filter the trip frame to NYC bounds and sane distance/speed ranges.

    Prints row counts before/after each filter and percentile tables, and
    draws boxplots via plot_boxplot. Returns the filtered frame.
    NOTE(review): the trip_times filter below is commented out, so the
    "Removing outliers from trip times" count will report 0 removed.
    """
    print ("Original:",new_frame.shape[0])
    a = new_frame.shape[0]
    # Keep only trips whose pickup AND dropoff lie inside the NYC bounding box.
    new_frame = new_frame[((new_frame.dropoff_longitude >= -74.15) & (new_frame.dropoff_longitude <= -73.7004) &\
    (new_frame.dropoff_latitude >= 40.5774) & (new_frame.dropoff_latitude <= 40.9176)) & \
    ((new_frame.pickup_longitude >= -74.15) & (new_frame.pickup_latitude >= 40.5774)& \
    (new_frame.pickup_longitude <= -73.7004) & (new_frame.pickup_latitude <= 40.9176))]
    print ("Removing outliers from NY boundaries:",new_frame.shape[0],(a - new_frame.shape[0]))
    b = new_frame.shape[0]
    print ("\ntrip times percentiles...")
    var = plot_boxplot(new_frame, "trip_times", "")
    # Print every 10th percentile of trip times.
    for i in range(0,len(var),int(float(10)*len(var)/100)):
        print ("{}th percentile {}".format(round(float(i)/len(var),3)*100,round(var[i],3)))
    print(new_frame.shape, "before frist")
    # new_frame = new_frame[(new_frame.trip_times > 0) & (new_frame.trip_times < 720)]
    print(new_frame.shape, "frist", new_frame.head())
    print ("Removing outliers from trip times:",new_frame.shape[0],(b - new_frame.shape[0]))
    var = plot_boxplot(new_frame, "trip_times", "out")
    print(new_frame.shape, "second")
    #--------------------------------------------------------------
    print ("\ntrip distance percentiles...")
    var = plot_boxplot(new_frame, "trip_distance", "")
    for i in range(0,len(var),int(float(10)*len(var)/100)):
        print ("{}th percentile {}".format(round(float(i)/len(var),3)*100,round(var[i],3)))
    c = new_frame.shape[0]
    # Keep trips with a positive distance under 23 miles.
    new_frame = new_frame[(new_frame.trip_distance > 0) & (new_frame.trip_distance < 23)]
    print ("Removing outliers from trip distance:",new_frame.shape[0],(c - new_frame.shape[0]))
    var = plot_boxplot(new_frame,"trip_distance", "out")
    #--------------------------------------------------------------
    print ("\nSpeed percentiles...")
    var = plot_boxplot(new_frame, "Speed", "")
    for i in range(0,len(var),int(float(10)*len(var)/100)):
        print ("{}th percentile {}".format(round(float(i)/len(var),3)*100,round(var[i],3)))
    d = new_frame.shape[0]
    # Keep trips with an average speed between 0 and 65 mph inclusive.
    new_frame = new_frame[(new_frame.Speed <= 65) & (new_frame.Speed >= 0)]
    print ("Removing outliers from trip speed:",new_frame.shape[0],(d - new_frame.shape[0]))
    var = plot_boxplot(new_frame, "Speed", "out")
    #--------------------------------------------------------------
    print( "Total outliers removed",a - new_frame.shape[0])
    return new_frame
# Cleaned frame used by all the EDA below.
frame_with_durations_outliers_removed = remove_outliers(frame_with_durations)
frame_with_durations_outliers_removed.head()
import seaborn as sns
#trip times
#trip times
def dist_of_params(frame, variable, title):
    """Plot the kernel-density distribution of `variable` from `frame`."""
    sns.FacetGrid(frame, size=6) \
        .map(sns.kdeplot, variable) \
        .add_legend();
    # BUG FIX: `sns.plt` was removed in seaborn 0.8 — use matplotlib's pyplot.
    plt.title(title)
    plt.show();
#trip times
dist_of_params(frame_with_durations_outliers_removed,'trip_times','time for cab trips distribution(in minutes)')
#log trip times
# NOTE(review): np.log produces -inf for any zero values; the Speed filter
# above allows Speed == 0, so verify the columns are strictly positive.
log_trip_times = frame_with_durations_outliers_removed.trip_times.values
frame_with_durations_outliers_removed['log_times'] = np.log(log_trip_times)
dist_of_params(frame_with_durations_outliers_removed,'log_times','log of time for cab trips distribution')
#trip distances
dist_of_params(frame_with_durations_outliers_removed,'trip_distance','distance for cab trips distribution')
#log trip distances
log_trip_distance = frame_with_durations_outliers_removed.trip_distance.values
frame_with_durations_outliers_removed['log_distance'] = np.log(log_trip_distance)
dist_of_params(frame_with_durations_outliers_removed,'log_distance','log of distance for cab trips distribution')
#trip speed
dist_of_params(frame_with_durations_outliers_removed,'Speed','average Speed of cab trips distribution')
#log speed
log_trip_speed = frame_with_durations_outliers_removed.Speed.values
frame_with_durations_outliers_removed['log_speed'] = np.log(log_trip_speed)
dist_of_params(frame_with_durations_outliers_removed,'log_speed','log of speed for cab trips distribution')
#Get unique vendors
vendor_unique = frame_with_durations_outliers_removed.VendorID.unique()
Vendors = []
# Trips per vendor; per the TLC data dictionary VendorID 1 = Creative Mobile
# Technologies, VendorID 2 = VeriFone Inc.
Vendors.append(frame_with_durations_outliers_removed[frame_with_durations_outliers_removed['VendorID'] == 1].shape[0])
Vendors.append(frame_with_durations_outliers_removed[frame_with_durations_outliers_removed['VendorID'] == 2].shape[0])
# BUG FIX: the original indexed vendor_unique[0]/[1], but unique() makes no
# order guarantee — label the IDs explicitly.
print("Unique vendors\n 1: Creative Mobile Technologies\n 2: VeriFone Inc\n")
print ("Vendor 1 and Vendor 2 count:\n",Vendors)
sns.set_style("whitegrid")
fig, ax = plt.subplots()
fig.set_size_inches(12, 5)
ax = sns.barplot(x = ['Creative Mobile Technologies', 'VeriFone Inc'] , y = np.array(Vendors))
ax.set(ylabel='Pickups')
# BUG FIX: `sns.plt` was removed in seaborn 0.8 — use matplotlib's pyplot.
plt.title('Pickups per vendor for the month of JAN')
plt.show()
# BUG FIX: the two shares were swapped — Vendors[0] is CMT (VendorID 1),
# Vendors[1] is VeriFone (VendorID 2).
print ("Creative Mobile Technologies share :{} % VeriFone Inc share :{} % ".format(100*float(Vendors[0])/sum(Vendors),100*float(Vendors[1])/sum(Vendors)))
#Get the number of pickups per each passenger count
passenger_count = []
passenger_uniq = set(frame_with_durations_outliers_removed['passenger_count'].values)
for i in passenger_uniq:
    passenger_count.append(frame_with_durations_outliers_removed[frame_with_durations_outliers_removed['passenger_count'] == i].shape[0])
#sort the passenger pickups based on the passenger count
# NOTE: iterating the same set object twice yields the same order within a
# run, so zipping passenger_uniq with passenger_count is consistent here.
pass_count_pickups = [x for _,x in sorted(zip(passenger_uniq,passenger_count))]
pass_count = [int(y) for y,_ in sorted(zip(passenger_uniq,passenger_count))]
print (pass_count)
print (pass_count_pickups)
sns.set_style("whitegrid")
fig, ax = plt.subplots()
fig.set_size_inches(12, 5)
ax = sns.barplot(x = np.array(pass_count) , y = np.array(pass_count_pickups))
ax.set(ylabel='Pickups',xlabel = 'number of passengers')
# BUG FIX: `sns.plt` was removed in seaborn 0.8 — use matplotlib's pyplot.
plt.title('Number of pickups for every type of passenger count')
plt.show()
for a,b in zip(pass_count,pass_count_pickups):
    print ("Pickups for passenger count {} : {}% of total pickups".format(int(a),round(float(b)*100/sum(pass_count_pickups),4)))
#Unique rate codes
rate_codes_uniq = set(frame_with_durations_outliers_removed['RateCodeID'].values)
rate_count = []
for i in rate_codes_uniq:
    rate_count.append(frame_with_durations_outliers_removed[frame_with_durations_outliers_removed['RateCodeID'] == i].shape[0])
#sort the pickups based on the rate codes
rate_code_pickups = [x for _,x in sorted(zip(rate_codes_uniq,rate_count))]
rate_code = [int(y) for y,_ in sorted(zip(rate_codes_uniq,rate_count))]
# Human-readable labels for RateCodeIDs 1..6 plus an unknown bucket
# (TLC data dictionary). Typo 'unknownrate code' fixed.
Trips = ['Standard rates','JFK trips','Newark trips','Nassau/Westchester trips','Negotiated fare','Group rides','Unknown rate code']
sns.set_style("whitegrid")
fig, ax = plt.subplots()
fig.set_size_inches(12, 5)
ax = sns.barplot(x = np.array(Trips) , y = np.array(rate_code_pickups))
ax.set(ylabel='Pickups',xlabel = 'Rate Code IDs')
# BUG FIX: `sns.plt` was removed in seaborn 0.8 — use matplotlib's pyplot.
plt.title('Number of pickups for different types of rate codes')
plt.show()
for a,b in zip(Trips,rate_code_pickups):
    print ("Pickups for/with {} accounts {}% of total pickups".format(a,round(float(b)*100/sum(rate_code_pickups),4)))
# Distribution of the store-and-forward flag ('Y' = held in vehicle memory).
SWflag_uniq = set(frame_with_durations_outliers_removed['store_and_fwd_flag'].values)
SWflag_count = []
for i in SWflag_uniq :
    SWflag_count.append(frame_with_durations_outliers_removed[frame_with_durations_outliers_removed['store_and_fwd_flag'] == i].shape[0])
print (SWflag_count)
SWflag_uniq = list(SWflag_uniq)
print (SWflag_uniq)
for a,b in zip(SWflag_uniq,SWflag_count):
    print ("Pickups for/with store and forward = {} accounts {}% of total pickups".format(a,round(float(b)*100/sum(SWflag_count),4)))
sns.set_style("whitegrid")
fig, ax = plt.subplots()
fig.set_size_inches(12, 5)
ax = sns.barplot(x = np.array(SWflag_uniq) , y = np.array(SWflag_count))
ax.set(ylabel='Pickups',xlabel = 'store_and_fwd_flag')
# BUG FIX: `sns.plt` was removed in seaborn 0.8 — use matplotlib's pyplot.
plt.title('Number of pickups for store forward flags')
plt.show()
# Distribution of payment types (TLC codes 1..5).
payment_uniq = set(frame_with_durations_outliers_removed['payment_type'].values)
payment_count = []
for i in payment_uniq :
    payment_count.append(frame_with_durations_outliers_removed[frame_with_durations_outliers_removed['payment_type'] == i].shape[0])
print (payment_count)
payment_uniq = list(payment_uniq)
print (payment_uniq)
pay_pickups = [x for _,x in sorted(zip(payment_uniq,payment_count))]
pay = [int(y) for y,_ in sorted(zip(payment_uniq,payment_count))]
# Labels for payment_type codes 1..5; typo 'Unknow' fixed.
# NOTE(review): assumes exactly codes 1..5 appear — verify len(pay) == 5.
pay_type = ['Credit card','Cash','No charge','Dispute','Unknown']
sns.set_style("whitegrid")
fig, ax = plt.subplots()
fig.set_size_inches(12, 5)
ax = sns.barplot(x = np.array(pay_type) , y = np.array(pay_pickups))
ax.set(ylabel='Pickups',xlabel = 'payment types')
# BUG FIX: `sns.plt` was removed in seaborn 0.8 — use matplotlib's pyplot.
plt.title('Number of pickups for each payment types')
plt.show()
for a,b in zip(pay_type,pay_pickups):
    print ("Pickups with payment types {} accounts to :{}% of total pickups".format(a,round(float(b)*100/sum(pay_pickups),4)))
# Round pickup coordinates to 4 decimal places (~11 m resolution) for binning.
frame_with_durations_outliers_removed['lat_bin'] = np.round(frame_with_durations_outliers_removed['pickup_latitude'],4)
frame_with_durations_outliers_removed['lon_bin'] = np.round(frame_with_durations_outliers_removed['pickup_longitude'],4)
coords = frame_with_durations_outliers_removed[['pickup_latitude', 'pickup_longitude']].values
#Getting 40 clusters using the kmeans
kmeans = MiniBatchKMeans(n_clusters=40, batch_size=10000).fit(coords)
frame_with_durations_outliers_removed['pickup_cluster'] = kmeans.predict(frame_with_durations_outliers_removed[['pickup_latitude', 'pickup_longitude']])
# NOTE: the same model (fitted on pickup coords) also assigns dropoff clusters.
frame_with_durations_outliers_removed['dropoff_cluster'] = kmeans.predict(frame_with_durations_outliers_removed[['dropoff_latitude', 'dropoff_longitude']])
#Get regions
def Convert_Clusters(frame, cluster):
    """Map cluster ids in column `cluster` to borough names and marker colours.

    Adds a 'Regions <cluster>' column to `frame` in place and returns
    (frame, colors), where colors is one hex colour per row.
    """
    queens_ids = {4, 16, 22, 23, 26, 35, 36}
    brooklyn_ids = {11, 19, 29, 37}
    region = []
    colors = []
    for cid in frame[cluster].values:
        if cid == 2:
            name, colour = "JFK", '#7CFC00'        # green
        elif cid == 13:
            name, colour = "Bronx", '#DC143C'      # red
        elif cid in queens_ids:
            name, colour = "Queens", '#00FFFF'     # blue/cyan
        elif cid in brooklyn_ids:
            name, colour = "Brooklyn", '#FFD700'   # yellow orange
        else:
            name, colour = "Manhattan", '#FFFFFF'  # white
        region.append(name)
        colors.append(colour)
    frame['Regions ' + cluster] = region
    return frame, colors
# Tag every trip with the borough of its pickup and dropoff cluster.
frame_with_durations_outliers_removed,colors_pickup=Convert_Clusters(frame_with_durations_outliers_removed,'pickup_cluster')
frame_with_durations_outliers_removed,colors_dropoff=Convert_Clusters(frame_with_durations_outliers_removed,'dropoff_cluster')
frame_with_durations_outliers_removed.head()
def plot_scatter_locations(frame, colors, choose):
    """Scatter pickup or dropoff coordinates as white dots on a black map.

    `choose` is 'pickup' or 'dropoff'; the figure is saved to
    plot_<choose>.png. `colors` is kept for interface compatibility but is
    not used by the plot itself.
    """
    # BUG FIX: dropped the IPython magic `%matplotlib inline` (a SyntaxError
    # outside a notebook) and `pd.options.display.mpl_style`, which was
    # removed from pandas.
    new_style = {'grid': False}  # remove grid
    matplotlib.rc('axes', **new_style)
    rcParams['figure.figsize'] = (17.5, 17)  # size of figure
    rcParams['figure.dpi'] = 250
    var1 = choose + '_longitude'
    var2 = choose + '_latitude'
    # BUG FIX: DataFrame.plot expects column *names*, not single-element lists.
    P = frame.plot(kind='scatter', x=var1, y=var2, color='white',
                   xlim=(-74.06, -73.77), ylim=(40.61, 40.91), s=.02, alpha=.6)
    # BUG FIX: set_axis_bgcolor was removed in matplotlib 2.2; set_facecolor
    # is the replacement.
    P.set_facecolor('black')  # background color
    plt.savefig('plot_' + choose + '.png')
    plt.show()
plot_scatter_locations(frame_with_durations_outliers_removed,colors_pickup,'pickup')
plot_scatter_locations(frame_with_durations_outliers_removed,colors_dropoff,'dropoff')
INFERENCE:
def compute_pickups_per_day():
    """Bucket pickups per weekday into midnight/morning/afternoon/night bins,
    plot a stacked bar chart, and return the materialised frame with
    pickup_day (ISO weekday, 1=Monday..7=Sunday) and pickup_hour columns."""
    # BUG FIX: the local frame was named `time`, shadowing the stdlib `time`
    # module imported at the top of the file.
    times = month[['tpep_pickup_datetime','tpep_dropoff_datetime']].compute()
    times['tpep_pickup_datetime'] = pd.to_datetime(times['tpep_pickup_datetime'])
    times["pickup_day"] = times['tpep_pickup_datetime'].dt.strftime('%u').astype(int)
    times["pickup_hour"] = times['tpep_pickup_datetime'].dt.strftime('%H').astype(int)
    day_pickups = []        # 06:00-11:59 (morning)
    night_pickups = []      # 18:00-23:59
    mid_night_pickups = []  # 00:00-05:59
    afternoon_pickups = []  # 12:00-17:59
    for i in range(1,8):
        day_pickups.append(times[(times.pickup_day == i) & (times.pickup_hour >= 6) & (times.pickup_hour < 12)].shape[0])
        night_pickups.append(times[(times.pickup_day == i) & (times.pickup_hour >= 18) & (times.pickup_hour <= 23)].shape[0])
        mid_night_pickups.append(times[(times.pickup_day == i) & (times.pickup_hour >= 0) & (times.pickup_hour < 6)].shape[0])
        afternoon_pickups.append(times[(times.pickup_day == i) & (times.pickup_hour >= 12) & (times.pickup_hour < 18)].shape[0])
    buckets = pd.DataFrame({'morning':day_pickups, 'afternoon':afternoon_pickups, 'night':night_pickups, 'midnight':mid_night_pickups})
    buckets.plot(kind='bar', stacked=True, figsize=(4.5, 4.5))
    plt.xlabel('Days of the week')
    plt.ylabel('Number of pickups')
    plt.tick_params(axis='both', which='minor')
    plt.plot()
    days = ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']
    for i in range(0,len(days)):
        # BUG FIX: the original paired labels with the wrong lists (e.g.
        # "morning" printed the midnight counts) — labels now match values.
        print ("{} - midnight:{} , morning:{} , afternoon:{} night:{}".format(days[i],mid_night_pickups[i],day_pickups[i],afternoon_pickups[i],night_pickups[i]))
    return times
times_of_day_dframe = compute_pickups_per_day()
#pickups and distances travelled at what time of the day??
def pickup_in_day(frame):
    """Plot hourly pickup counts per weekday, then hourly totals for the month.

    Expects `frame` to carry integer pickup_day (1=Mon..7=Sun) and
    pickup_hour (0..23) columns.
    """
    day_names = ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']
    palette = ['xkcd:blue','xkcd:orange','xkcd:brown','xkcd:coral','xkcd:magenta','xkcd:green','xkcd:fuchsia']
    hours = [s for s in range(0, 24)]
    # 7 x 24 grid of pickup counts: hour_pickups[day][hour].
    hour_pickups = [
        [frame[(frame.pickup_day == d) & (frame.pickup_hour == h)].shape[0] for h in hours]
        for d in range(1, 8)
    ]
    plt.figure(figsize=(8, 4))
    for k in range(0, 7):
        plt.plot(hours, hour_pickups[k], palette[k], label=day_names[k])
        plt.plot(hours, hour_pickups[k], 'ro', markersize=2)
    plt.xticks(hours)
    plt.xlabel('Hours of a day')
    plt.ylabel('Number of pickups')
    plt.title('Pickups for every hour')
    plt.legend()
    plt.grid(True)
    plt.show()
    # Hourly totals across the whole month, regardless of weekday.
    hour_pickup_month = [frame[frame.pickup_hour == h].shape[0] for h in hours]
    plt.figure(figsize=(8, 4))
    plt.plot(hours, hour_pickup_month, 'xkcd:magenta', label='average pickups per hour')
    plt.plot(hours, hour_pickup_month, 'ro', markersize=2)
    plt.xticks(hours)
    plt.xlabel('Hours of a day')
    plt.ylabel('Number of pickups')
    plt.title('Pickups for every hour for whole of month')
    plt.legend()
    plt.grid(True)
    plt.show()
pickup_in_day(times_of_day_dframe)
# NOTE(review): these lat/lon bins and coords were already computed above —
# this repeats the earlier cell verbatim.
frame_with_durations_outliers_removed['lat_bin'] = np.round(frame_with_durations_outliers_removed['pickup_latitude'],4)
frame_with_durations_outliers_removed['lon_bin'] = np.round(frame_with_durations_outliers_removed['pickup_longitude'],4)
#Find the best cluster where least number of clusters have a min. radius of atleast 1km or more
coords = frame_with_durations_outliers_removed[['pickup_latitude', 'pickup_longitude']].values
def find_min_distance(cluster_centers, cluster_len):
    """Report how many cluster centres have their nearest *later* centre
    at least 1 km away (haversine).

    Prints the tally and returns 1 when every checked centre clears 1 km,
    otherwise 0.
    """
    nice_points = 0
    wrong_points = 0
    for i in range(0, cluster_len - 1):
        min_value = 1000000
        # Only pairs (i, j) with j > i are examined, so each pair is
        # measured once.
        for j in range(i + 1, cluster_len):
            separation = gpxpy.geo.haversine_distance(
                cluster_centers[i][0], cluster_centers[i][1],
                cluster_centers[j][0], cluster_centers[j][1])
            min_value = min(min_value, separation)
        if min_value != 1000000 and (min_value / 1000) >= 1:
            nice_points += 1
        else:
            wrong_points += 1
    print (cluster_len, "Cents with min distance < 1:", wrong_points, "Cents with min distance > 1:", nice_points)
    return 1 if nice_points == cluster_len - 1 else 0
def find_clusters(increment):
    """Fit MiniBatchKMeans with `increment` clusters on the module-level
    pickup `coords`, tag each row with its cluster id, and return
    (cluster_centers, number_of_clusters)."""
    model = MiniBatchKMeans(n_clusters=increment, batch_size=10000).fit(coords)
    frame_with_durations_outliers_removed['pickup_cluster'] = model.predict(
        frame_with_durations_outliers_removed[['pickup_latitude', 'pickup_longitude']])
    centers = model.cluster_centers_
    return centers, len(centers)
# Try 10..90 clusters in steps of 10 and report inter-centre spacing.
# NOTE(review): the variable name says 5km but find_min_distance checks a
# 1 km threshold — confirm which was intended.
for increment in range(10, 100, 10):
    cluster_centers, cluster_len = find_clusters(increment)
    more_than_5km = find_min_distance(cluster_centers, cluster_len)
#Getting 40 clusters using the kmeans
kmeans = MiniBatchKMeans(n_clusters=40, batch_size=10000).fit(coords)
frame_with_durations_outliers_removed['pickup_cluster'] = kmeans.predict(frame_with_durations_outliers_removed[['pickup_latitude', 'pickup_longitude']])
#Adding a new column describing what cluster the pickup point belongs to
frame_with_durations_outliers_removed.head(5)
#Plotting the cluster centers on OSM
cluster_centers = kmeans.cluster_centers_
cluster_len = len(cluster_centers)
map_osm = folium.Map(location=[40.734695, -73.990372], tiles='Stamen Toner')
for i in range(cluster_len):
    folium.Marker(list((cluster_centers[i][0],cluster_centers[i][1])), popup=(str(cluster_centers[i][0])+str(cluster_centers[i][1]))).add_to(map_osm)
map_osm
def plot_clusters(frame):
    """Scatter the first 100k pickups coloured by cluster id, clipped to the
    city bounding box."""
    lng_limits = (-74.03, -73.75)
    lat_limits = (40.63, 40.85)
    fig, ax = plt.subplots(ncols=1, nrows=1)
    sample = slice(None, 100000)  # only the first 100k rows, for speed
    ax.scatter(frame.pickup_longitude.values[sample],
               frame.pickup_latitude.values[sample],
               s=10, lw=0,
               c=frame.pickup_cluster.values[sample],
               cmap='tab20', alpha=0.2)
    ax.set_xlim(lng_limits)
    ax.set_ylim(lat_limits)
    ax.set_xlabel('Longitude')
    ax.set_ylabel('Latitude')
    plt.show()
plot_clusters(frame_with_durations_outliers_removed)