spark

StatementMeta(8aef2e4c-7819-4b3c-ad3b-441a2f26e87b, 6, 6, Finished, Available)


# Read the data
workspace_default_storage_account = "group08astoragec0a5c9b39"
workspace_default_container = "azureml-blobstore-8f67895d-e507-48c5-8b8e-f003f0227b44"

workspace_wasbs_base_url = (
    f"wasbs://{workspace_default_container}@{workspace_default_storage_account}.blob.core.windows.net/"
)
df = spark.read.parquet((f"{workspace_wasbs_base_url}<PATH-TO-READ/WRITE>"))

StatementMeta(8aef2e4c-7819-4b3c-ad3b-441a2f26e87b, 6, 7, Finished, Available)


df.count()

StatementMeta(8aef2e4c-7819-4b3c-ad3b-441a2f26e87b, 6, 8, Finished, Available)

5617852


df.printSchema()

StatementMeta(8aef2e4c-7819-4b3c-ad3b-441a2f26e87b, 6, 9, Finished, Available)

root
 |-- author: string (nullable = true)
 |-- author_flair_text: string (nullable = true)
 |-- body: string (nullable = true)
 |-- controversiality: long (nullable = true)
 |-- created_utc: timestamp (nullable = true)
 |-- gilded: long (nullable = true)
 |-- score: long (nullable = true)
 |-- stickied: boolean (nullable = true)
 |-- subreddit: string (nullable = true)


df.cache

StatementMeta(8aef2e4c-7819-4b3c-ad3b-441a2f26e87b, 6, 10, Finished, Available)

<bound method DataFrame.cache of DataFrame[author: string, author_flair_text: string, body: string, controversiality: bigint, created_utc: timestamp, gilded: bigint, score: bigint, stickied: boolean, subreddit: string]>


from pyspark.sql.functions import when, col, regexp_extract
import pyspark.sql.functions as f

StatementMeta(8aef2e4c-7819-4b3c-ad3b-441a2f26e87b, 6, 11, Finished, Available)


# Define the keywords for each topic
tactics_keywords = '|'.join(['tactic', 'formation', 'playstyle', 'strategy', 'lineup', 'set-piece', 'defense', 'offense', 'pressing', 'counter-attack', 'tiki-taka', 'tikitaka', '433', '442', '4-3-3', '4-4-2'])
transfers_keywords = '|'.join(['transfer', 'signing', 'contract', 'bid', 'deal', 'release clause', 'agent', 'loan', 'free transfer', 'termination fee'])
performances_keywords = '|'.join(['goal', 'assist', 'save', 'tackle', 'dribble', 'performance', 'man of the match', 'rating', 'brace', 'hat trick'])
rankings_keywords = '|'.join(['rankings', 'top scorer', 'best midfielder', 'best defender' 'leaderboard', 'award', 'Ballon d\'Or', 'FIFA Best', 'talent', 'prospect', 'goat', 'g.o.a.t', 'greatest', 'top 10', 'top 30'])
analysis_keywords = '|'.join(['preview', 'analysis', 'derby', 'fixture', 'head-to-head', 'odds', 'prediction', 'review', 'recap', 'post-match', 'highlight'])
decisions_keywords = '|'.join(['manager', 'coach', 'substitution', 'tactics', 'selection', 'bench', 'formation change', 'press conference', 'interview'])

StatementMeta(8aef2e4c-7819-4b3c-ad3b-441a2f26e87b, 6, 26, Finished, Available)


# Add columns with dummy variables for each topic based on the presence of keywords
df_with_topics = df.withColumn('soccer_tactics', when(col('body').rlike(tactics_keywords), 1).otherwise(0)) \
                   .withColumn('transfers', when(col('body').rlike(transfers_keywords), 1).otherwise(0)) \
                   .withColumn('player_performances', when(col('body').rlike(performances_keywords), 1).otherwise(0)) \
                   .withColumn('player_rankings', when(col('body').rlike(rankings_keywords), 1).otherwise(0)) \
                   .withColumn('match_analysis_previews', when(col('body').rlike(analysis_keywords), 1).otherwise(0)) \
                   .withColumn('managerial_decisions', when(col('body').rlike(decisions_keywords), 1).otherwise(0))

StatementMeta(8aef2e4c-7819-4b3c-ad3b-441a2f26e87b, 6, 27, Finished, Available)


# Filter the DataFrame to only keep records that are related to at least one topic
df_filtered = df_with_topics.filter(
    (col('soccer_tactics') == 1) |
    (col('transfers') == 1) |
    (col('player_performances') == 1) |
    (col('player_rankings') == 1) |
    (col('match_analysis_previews') == 1) |
    (col('managerial_decisions') == 1)
)

StatementMeta(8aef2e4c-7819-4b3c-ad3b-441a2f26e87b, 6, 28, Finished, Available)


df_filtered.count()

StatementMeta(8aef2e4c-7819-4b3c-ad3b-441a2f26e87b, 6, 29, Finished, Available)

833106


df_filtered.cache

StatementMeta(8aef2e4c-7819-4b3c-ad3b-441a2f26e87b, 6, 30, Finished, Available)

<bound method DataFrame.cache of DataFrame[author: string, author_flair_text: string, body: string, controversiality: bigint, created_utc: timestamp, gilded: bigint, score: bigint, stickied: boolean, subreddit: string, soccer_tactics: int, transfers: int, player_performances: int, player_rankings: int, match_analysis_previews: int, managerial_decisions: int]>


# Only keep the columns we want for this problem
df_filtered = df_filtered.select('body', 'gilded', 'score', 'soccer_tactics', 'transfers', 'player_performances',\
'player_rankings', 'match_analysis_previews', 'managerial_decisions')

StatementMeta(8aef2e4c-7819-4b3c-ad3b-441a2f26e87b, 6, 31, Finished, Available)


df_filtered.printSchema()

StatementMeta(8aef2e4c-7819-4b3c-ad3b-441a2f26e87b, 6, 32, Finished, Available)

root
 |-- body: string (nullable = true)
 |-- gilded: long (nullable = true)
 |-- score: long (nullable = true)
 |-- soccer_tactics: integer (nullable = false)
 |-- transfers: integer (nullable = false)
 |-- player_performances: integer (nullable = false)
 |-- player_rankings: integer (nullable = false)
 |-- match_analysis_previews: integer (nullable = false)
 |-- managerial_decisions: integer (nullable = false)


df_filtered.show(10)

StatementMeta(8aef2e4c-7819-4b3c-ad3b-441a2f26e87b, 6, 33, Finished, Available)

+--------------------+------+-----+--------------+---------+-------------------+---------------+-----------------------+--------------------+
|                body|gilded|score|soccer_tactics|transfers|player_performances|player_rankings|match_analysis_previews|managerial_decisions|
+--------------------+------+-----+--------------+---------+-------------------+---------------+-----------------------+--------------------+
|His debut feels l...|     0|    2|             0|        1|                  0|              0|                      0|                   0|
|Just remember Myk...|     0|    2|             0|        1|                  0|              0|                      0|                   0|
|Why Mudryk though...|     0|   16|             0|        0|                  0|              1|                      0|                   1|
|Well said, so sic...|     0|    2|             0|        1|                  0|              0|                      0|                   1|
|I don't think he'...|     0|    1|             0|        0|                  1|              0|                      0|                   0|
|It's the Rodri sh...|     0|    6|             0|        0|                  0|              0|                      1|                   0|
|Can't wait for Dy...|     0|    2|             0|        0|                  0|              0|                      0|                   1|
|Can we loan potte...|     0|    4|             0|        1|                  0|              0|                      0|                   0|
|That’s bs and you...|     0|    1|             0|        0|                  0|              0|                      0|                   1|
|Nice having a top...|     0|    2|             0|        0|                  0|              0|                      0|                   1|
+--------------------+------+-----+--------------+---------+-------------------+---------------+-----------------------+--------------------+
only showing top 10 rows


datapath = 'Users/wc777/fall-2023-reddit-project-team-08/data'
df_filtered.write.parquet(f"{datapath}/csv/soccer_topic.parquet")

StatementMeta(8aef2e4c-7819-4b3c-ad3b-441a2f26e87b, 6, 66, Finished, Available)


# Calculate the number of rows, average score, and number of gilded posts within each topic
topic_counts = df_filtered.groupBy().agg(
    f.sum('soccer_tactics').alias('soccer_tactics_count'),
    f.sum('transfers').alias('transfers_count'),
    f.sum('player_performances').alias('player_performances_count'),
    f.sum('player_rankings').alias('player_rankings_count'),
    f.sum('match_analysis_previews').alias('match_analysis_previews_count'),
    f.sum('managerial_decisions').alias('managerial_decisions_count'),
    f.round(f.avg(f.when(col('soccer_tactics') == 1, col('score'))), 2).alias('avg_score_tactics'),
    f.round(f.avg(f.when(col('transfers') == 1, col('score'))), 2).alias('avg_score_transfers'),
    f.round(f.avg(f.when(col('player_performances') == 1, col('score'))), 2).alias('avg_score_performances'),
    f.round(f.avg(f.when(col('player_rankings') == 1, col('score'))), 2).alias('avg_score_rankings'),
    f.round(f.avg(f.when(col('match_analysis_previews') == 1, col('score'))), 2).alias('avg_score_analysis'),
    f.round(f.avg(f.when(col('managerial_decisions') == 1, col('score'))), 2).alias('avg_score_decisions'),
    f.sum(when((col('soccer_tactics') == 1) & (col('gilded') != 0), 1)).alias('gilded_tactics'),
    f.sum(when((col('transfers') == 1) & (col('gilded') != 0), 1)).alias('gilded_transfers'),
    f.sum(when((col('player_performances') == 1) & (col('gilded') != 0), 1)).alias('gilded_performances'),
    f.sum(when((col('player_rankings') == 1) & (col('gilded') != 0), 1)).alias('gilded_rankings'),
    f.sum(when((col('match_analysis_previews') == 1) & (col('gilded') != 0), 1)).alias('gilded_analysis'),
    f.sum(when((col('managerial_decisions') == 1) & (col('gilded') != 0), 1)).alias('gilded_decisions')
)

StatementMeta(8aef2e4c-7819-4b3c-ad3b-441a2f26e87b, 6, 36, Finished, Available)


topic_counts_pd = topic_counts.toPandas()

StatementMeta(8aef2e4c-7819-4b3c-ad3b-441a2f26e87b, 6, 37, Finished, Available)


# Create a new dataframe for plotting
import pandas as pd
data = {
    'topic': ['soccer tactics', 'transfers', 'player performances', 'player rankings', 'match analysis and previews', 'managerial decisions'],
    'number of rows': [104584, 224669, 327807, 75402, 44770, 184505],
    'average score': [12.28, 16.02, 15.93, 15.37, 15.12, 15.89],
    'gild percentage': [0.030, 0.024, 0.053, 0.036, 0.054, 0.027]
}
pd_for_plot = pd.DataFrame(data)

pd_for_plot

StatementMeta(8aef2e4c-7819-4b3c-ad3b-441a2f26e87b, 6, 51, Finished, Available)


pd_for_plot.to_csv(f'{datapath}/csv/soccer_topic_summary.csv', index=False)

StatementMeta(8aef2e4c-7819-4b3c-ad3b-441a2f26e87b, 6, 67, Finished, Available)


import matplotlib.pyplot as plt
import seaborn as sns

sns.reset_defaults()
plt.rcdefaults()

plt.figure(figsize=(20, 12))

# Create the barplot about the number of rows per topic
sns.barplot(x='topic', y='number of rows', data=pd_for_plot, palette='Set2')
plt.title('Number of Rows per Topic', fontsize=40)
plt.xlabel('Topic', fontsize=28)
plt.ylabel('Number of Rows', fontsize=28)
plt.tick_params(axis='both', labelsize=24)
plt.xticks(rotation=45)  
plt.savefig('./Users/wc777/fall-2023-reddit-project-team-08/data/plots/number_of_rows_bar.png')
plt.show()

StatementMeta(8aef2e4c-7819-4b3c-ad3b-441a2f26e87b, 6, 62, Finished, Available)


# Create the barplot about the average score per topic
plt.figure(figsize=(20, 12))
sns.barplot(x='topic', y='average score', data=pd_for_plot, palette='Set2')
plt.title('Average Score per Topic', fontsize=40)
plt.xlabel('Topic', fontsize=28)
plt.ylabel('Average Score', fontsize=28)
plt.tick_params(axis='both', labelsize=24)
plt.xticks(rotation=45)
plt.savefig('./Users/wc777/fall-2023-reddit-project-team-08/data/plots/average_score_bar.png')
plt.show()

StatementMeta(8aef2e4c-7819-4b3c-ad3b-441a2f26e87b, 6, 63, Finished, Available)


# Create the barplot about the gild percentage per topic
plt.figure(figsize=(20, 12))
sns.barplot(x='topic', y='gild percentage', data=pd_for_plot, palette='Set2')
plt.title('Gild Percentage per Topic', fontsize=40)
plt.xlabel('Topic', fontsize=28)
plt.ylabel('Gild Percentage (%)', fontsize=28)
plt.tick_params(axis='both', labelsize=24)
plt.xticks(rotation=45)
plt.savefig('./Users/wc777/fall-2023-reddit-project-team-08/data/plots/gild_percentage_bar.png')
plt.show()

StatementMeta(8aef2e4c-7819-4b3c-ad3b-441a2f26e87b, 6, 64, Finished, Available)


sizes = pd_for_plot['number of rows']  
labels = pd_for_plot['topic'] 

# Create the pie chart about proportion of posts per topic
plt.figure(figsize=(20, 12))
plt.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=140, colors=sns.color_palette('Set2'),
    textprops={'fontsize': 18})
plt.title('Proportion of Posts per Soccer Topic', fontsize=40)
plt.axis('equal')

plt.savefig('./Users/wc777/fall-2023-reddit-project-team-08/data/plots/proportion_of_topics_pie.png')
plt.show()

StatementMeta(8aef2e4c-7819-4b3c-ad3b-441a2f26e87b, 6, 65, Finished, Available)

	topic	number of rows	average score	gild percentage
0	soccer tactics	104584	12.28	0.030
1	transfers	224669	16.02	0.024
2	player performances	327807	15.93	0.053
3	player rankings	75402	15.37	0.036
4	match analysis and previews	44770	15.12	0.054
5	managerial decisions	184505	15.89	0.027

Launch spark environment¶