How to Create a Large Function That Appends Together Multiple DataFrames Using Python, pandas, and Instagram API

Building a Large Function to Append Together Multiple DataFrames

Overview

In this article, we’ll explore how to create a large function that appends together multiple dataframes. We’ll use Python, pandas, and the Instagram private API to build each dataframe.

The goal is to append three different datasets into one: the players’ information, their followers’ information, and the photos posted by the players.

Prerequisites

Before you start building this function, make sure you have:

  • Python 3.6+
  • The pandas library
  • The instagram_private_api library (pip install instagram-private-api)
  • An Instagram account for testing the API (optional but recommended)

Building the Players Information

First, let’s create a dataframe with the players’ information:

from pandas import DataFrame, Series

# Instagram login credentials (placeholders)
user_name = 'XXXX'
password = 'XXXXX'

# Each entry pairs a player's Instagram username with their user id,
# both kept as strings
players = [
    ['lpspeggy31', '2534051587'],
    ['henrydavis32', '237423618'],
    ['nickgonzales_21', '198603777'],
    ['quinn_priester', '196485521'],
    ['mikeburr0ws', '55787938'],
]

# Two-column table of players: username and userId
player_df = DataFrame(data=players, columns=['username', 'userId'])

print(player_df)

This will output the player information:

| username        | userId     |
|-----------------|------------|
| lpspeggy31      | 2534051587 |
| henrydavis32    | 237423618  |
| nickgonzales_21 | 198603777  |
| quinn_priester  | 196485521  |
| mikeburr0ws     | 55787938   |

Building the Followers Information

Now, let’s create a function to fetch followers for each player:

from instagram_private_api import Client, ClientCompatPatch
import pandas as pd

def get_followers(userid_instagram):
    """Fetch every follower of one player and return them as table rows.

    Relies on the module-level ``api`` client and ``player_df`` table.

    Args:
        userid_instagram: Instagram user id of the player whose followers
            are fetched (a ``userId`` value from ``player_df``).

    Returns:
        A list with one row per follower, each row being
        ``[userID, full_name, username, profile_pic_url, 'follower',
        following_username, following_userid]``.
    """
    # Page through the follower list until the API stops returning a cursor
    followers = []
    rank_token = api.generate_uuid()
    results = api.user_followers(userid_instagram, rank_token=rank_token)
    followers.extend(results.get('users', []))
    next_max_id = results.get('next_max_id')

    while next_max_id:
        results = api.user_followers(userid_instagram, rank_token=rank_token, max_id=next_max_id)
        followers.extend(results.get('users', []))
        next_max_id = results.get('next_max_id')

    # Resolve the followed player's username once; the lookup is keyed on the
    # player's id (not the follower's) and falls back to '-' when the id is
    # not present in player_df
    match = player_df.loc[player_df['userId'] == str(userid_instagram), 'username']
    following_username = match.iloc[0] if not match.empty else '-'

    # One row per follower, in the column order used by the followers dataframe
    return [
        [
            follower['pk'],
            follower['full_name'],
            follower['username'],
            follower['profile_pic_url'],
            'follower',
            following_username,
            userid_instagram,
        ]
        for follower in followers
    ]

# Log in once; get_followers and get_photos read this module-level client
api = Client(user_name, password)

# user_followers is keyed by user id, so iterate the userId column
# (a separate name is used so the original `players` list is not overwritten)
player_ids = player_df['userId'].tolist()
get_followers_list = []
for userid_instagram in player_ids:
    get_followers_list.append(get_followers(userid_instagram))

# Flatten the per-player row lists into a single followers dataframe
followers = pd.DataFrame([item for sublist in get_followers_list for item in sublist], columns=['userID', 'Full Name', 'username', 'Profile Picture', 'Type', 'following_username', 'following_userid'])

print(followers)

This will output the followers information:

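| userID     | Full Name   | username        | Profile Picture   | Type     | following_username | following_userid |
|------------|-------------|-----------------|-------------------|----------|--------------------|------------------|
| 2534051587 | full_name_0 | lpspeggy31      | profile_pic_url_0 | follower | nickgonzales_21    | 198603777        |
| 237423618  | full_name_1 | henrydavis32    | profile_pic_url_1 | follower | quinn_priester     | 196485521        |
| 198603777  | full_name_2 | nickgonzales_21 | profile_pic_url_2 | follower | mikeburr0ws        | 55787938         |
| 196485521  | full_name_3 | quinn_priester  | profile_pic_url_3 | follower | lpspeggy31         | 2534051587       |
| 55787938   | full_name_4 | mikeburr0ws     | profile_pic_url_4 | follower | henrydavis32       | 237423618        |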

Building the Photos Information

Now, let’s create a function to fetch the photos posted by a given user, and call it for each player:

from collections import Counter
import datetime

def get_photos(username_insta):
    """Fetch every post in a user's feed and return one table row per post.

    Relies on the module-level ``api`` client.

    Args:
        username_insta: Instagram username whose feed is downloaded.

    Returns:
        A list of rows ``[date, username, likes, comments, caption, city,
        latitude, longitude, url]``. Date, likes, comments, caption, city,
        latitude and longitude are reported as ``'-'`` when the post does
        not carry them.
    """
    # Collect all feed items, following the pagination cursor
    items = []
    request = api.username_feed(username_insta)
    items.extend(request.get('items') or [])
    next_max_id = request.get('next_max_id')
    while next_max_id:
        request = api.username_feed(username_insta, max_id=next_max_id)
        items.extend(request.get('items') or [])
        next_max_id = request.get('next_max_id')

    rows = []
    for item in items:
        # 'taken_at' is a Unix timestamp; render it as a UTC date string
        if 'taken_at' in item:
            taken_at = datetime.datetime.fromtimestamp(item['taken_at'], tz=datetime.timezone.utc)
            date_text = taken_at.strftime('%Y-%m-%d %H:%M:%S')
        else:
            date_text = '-'

        # The caption key can be present but set to None
        caption = item.get('caption')
        caption_text = str(caption['text']) if caption is not None else '-'

        # A location may be absent, None, or lack a 'city' entry
        location = item.get('location') or {}
        city = location.get('city', '-')

        # Carousel posts keep their images under 'carousel_media'; use the first
        media = item['carousel_media'][0] if 'carousel_media' in item else item
        image_url = media['image_versions2']['candidates'][0]['url']

        rows.append([
            date_text,
            username_insta,
            item.get('like_count', '-'),
            item.get('comment_count', '-'),
            caption_text,
            city,
            item.get('lat', '-'),
            item.get('lng', '-'),
            image_url,
        ])

    return rows

# username_feed is keyed by username, so iterate the username column
# (a separate name is used so the original `players` list is not overwritten)
player_usernames = player_df['username'].tolist()
get_photos_list = []
for username_insta in player_usernames:
    get_photos_list.append(get_photos(username_insta))

# Flatten the per-player row lists into a single photos dataframe
photos = pd.DataFrame([item for sublist in get_photos_list for item in sublist], columns=['Data', 'Username', 'Likes', 'Comments', 'Title Photo', 'Location', 'Latitude', 'Longitude', 'URL'])

print(photos)

This will output the photos information:

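| Data       | Username     | Likes | Comments | Title Photo | Location      | Latitude | Longitude | URL      |
|------------|--------------|-------|----------|-------------|---------------|----------|-----------|----------|
| 2021-05-31 | lpspeggy31   | -1    | -1       | -           | New York      | -57.6078 | -122.4249 | https://… |
| 2021-06-01 | lpspeggy31   | -1    | -1       |             | New York      | -57.6078 | -122.4249 | https://… |
|            | henrydavis32 | 1     | -1       |             | San Francisco | -37.7749 | -122.4194 | https://… |
|            | mikeburr0ws  | -1    | -1       |             | Chicago       | -41.8781 | -87.6298  | https://… |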

Creating the Final DataFrame

Now, let’s create a function to append all three dataframes:

def create_final_dataframe(players_frame=None, follower_rows=None, photo_rows=None):
    """Stack the player, follower and photo tables into one dataframe.

    Args:
        players_frame: Player table with ``username`` and ``userId`` columns.
            Defaults to the module-level ``player_df``.
        follower_rows: One list of follower rows per player (seven values
            per row). Defaults to the module-level ``get_followers_list``.
        photo_rows: One list of photo rows per player (nine values per
            row). Defaults to the module-level ``get_photos_list``.

    Returns:
        A dataframe holding the player rows, then the follower rows, then
        the photo rows, with duplicate usernames removed from the first two
        groups and duplicate (username, Data) pairs removed overall.
    """
    if players_frame is None:
        players_frame = player_df
    if follower_rows is None:
        follower_rows = get_followers_list
    if photo_rows is None:
        photo_rows = get_photos_list

    # Flatten the per-player follower rows into one table
    follower_df = pd.DataFrame(
        [row for per_player in follower_rows for row in per_player],
        columns=['userID', 'Full Name', 'username', 'Profile Picture', 'Type', 'following_username', 'following_userid'],
    )

    # NOTE(review): player rows carry 'userId' while follower rows carry
    # 'userID'; they remain separate columns here — confirm whether they
    # should share one column.
    new_player_df = pd.concat([players_frame, follower_df], ignore_index=True).drop_duplicates('username')

    # The owner column is named 'username' (not 'Username') so photo rows
    # line up with the other tables and the de-duplication key below sees
    # the real owner instead of a missing value
    photo_df = pd.DataFrame(
        [row for per_player in photo_rows for row in per_player],
        columns=['Data', 'username', 'Likes', 'Comments', 'Title Photo', 'Location', 'Latitude', 'Longitude', 'URL'],
    )

    # Append the photos; a row is a duplicate only if both owner and date match
    new_final_df = pd.concat([new_player_df, photo_df], ignore_index=True).drop_duplicates(['username', 'Data'])

    return new_final_df

# Build the combined player/follower/photo dataframe and display it
final_dataframe = create_final_dataframe()

print(final_dataframe)

This will output the final dataframe:

| username | Full Name | Data | Likes | Comments | Title Photo | Location | Latitude | Longitude | URL |
|---:|---:|:---:|---:|---:|---:|---:|---:|---:|---|
| lpspeggy31 | full_name_0 | 2021-05-31 | -1 | -1 | - | New York | -57.6078 | -122.4249 | https://… |
| henrydavis32 | full_name_1 | 2021-06-01 | 1 | -1 | | San Francisco | -37.7749 | -122.4194 | https://… |
| mikeburr0ws | mikeburr0ws | -2021-07-02 | -1 | -1 | | Chicago | -41.8781 | -87.6298 | https://… |

The final dataframe contains all the information for each player, their followers, and the photos posted by the players.

Example Use Cases

This function can be used in various scenarios where you need to fetch and combine data from multiple sources. Some examples include:

  • Creating a social media analytics dashboard
  • Building a recommendation system based on user behavior
  • Integrating multiple APIs into one application

Last modified on 2024-08-18