Analyzing Large Amounts of Data with PySpark on AWS

You will be implementing the following functions in this notebook.

user()

bucket()

long_trips()

manhattan_trips()

weighted_profit()

final_output()

Please do not remove or modify the following utility functions:

load_data()

main()

Do not change the below cell. Run it to initialize your PySpark instance.
WARNING: Do NOT modify the below cell. It contains all imports, the function for loading data, and the function for running your code.

Implement the below functions for this lab:

WARNING: Do NOT change any function inputs or outputs, and ensure that the dataframes your code returns align with the schema definitions commented in each function

3a. Update the user() function

This function should return your username, e.g. janedoe3.

3b. Update the long_trips() function

This function filters trips to keep only trips longer than 2 miles.

3c. Update the manhattan_trips() function

This function determines the top 20 drop-off locations (DOLocationID) in Manhattan, ranked by total passenger_count (pcount).

Example output formatting:

+--------------+--------+
| DOLocationID | pcount |
+--------------+--------+
|             5|      15|
|            16|      12| 
+--------------+--------+

3d. Update the weighted_profit() function

This function should compute, for each pickup location, the average total_amount, the total count of trips, and the count of trips ending in the top 20 destinations, and return the weighted_profit as defined in the homework document.

Example output formatting:

+--------------+-------------------+
| PULocationID |  weighted_profit  |
+--------------+-------------------+
|            18| 33.784444421924436| 
|            12| 21.124577637149223| 
+--------------+-------------------+

3e. Update the final_output() function

This function takes the results of weighted_profit, joins them to the borough and zone lookup, and returns the top 20 locations with the highest weighted_profit.

Example output formatting:

+------------+---------+-------------------+
|    Zone    | Borough |  weighted_profit  |
+------------+---------+-------------------+
| JFK Airport|   Queens|  16.95897820117925|
|     Jamaica|   Queens| 14.879835188762488|
+------------+---------+-------------------+
Test your code on the small dataset first, as the large dataset will take significantly longer to run.
WARNING: Do NOT use the same bucket url for multiple runs of the `main()` function, as this will cause errors. Make sure to change the name of your output location every time. (ie: s3://lab11-janedoe3/output-small2)

Update the below cell with the address to your bucket, then run the below cell to run your code to store the results in S3.

When you have confirmed the results of the small dataset, run it again using the large dataset. Your output will appear in a folder in your S3 bucket called YOUROUTPUT.csv, as a CSV file with a name like part-0000-4d992f7a-0ad3-48f8-8c72-0022984e4b50-c000.csv. Download this file and rename it to output.csv for submission. Do not make any other changes to the file.