spark
StatementMeta(8aef2e4c-7819-4b3c-ad3b-441a2f26e87b, 6, 6, Finished, Available)
SparkSession - hive
# Read the data
workspace_default_storage_account = "group08astoragec0a5c9b39"
workspace_default_container = "azureml-blobstore-8f67895d-e507-48c5-8b8e-f003f0227b44"
workspace_wasbs_base_url = (
f"wasbs://{workspace_default_container}@{workspace_default_storage_account}.blob.core.windows.net/"
)
df = spark.read.parquet((f"{workspace_wasbs_base_url}<PATH-TO-READ/WRITE>"))
StatementMeta(8aef2e4c-7819-4b3c-ad3b-441a2f26e87b, 6, 7, Finished, Available)
df.count()
StatementMeta(8aef2e4c-7819-4b3c-ad3b-441a2f26e87b, 6, 8, Finished, Available)
5617852
df.printSchema()
StatementMeta(8aef2e4c-7819-4b3c-ad3b-441a2f26e87b, 6, 9, Finished, Available)
root |-- author: string (nullable = true) |-- author_flair_text: string (nullable = true) |-- body: string (nullable = true) |-- controversiality: long (nullable = true) |-- created_utc: timestamp (nullable = true) |-- gilded: long (nullable = true) |-- score: long (nullable = true) |-- stickied: boolean (nullable = true) |-- subreddit: string (nullable = true)
df.cache
StatementMeta(8aef2e4c-7819-4b3c-ad3b-441a2f26e87b, 6, 10, Finished, Available)
<bound method DataFrame.cache of DataFrame[author: string, author_flair_text: string, body: string, controversiality: bigint, created_utc: timestamp, gilded: bigint, score: bigint, stickied: boolean, subreddit: string]>
from pyspark.sql.functions import when, col, regexp_extract
import pyspark.sql.functions as f
StatementMeta(8aef2e4c-7819-4b3c-ad3b-441a2f26e87b, 6, 11, Finished, Available)
# Define the keywords for each topic
tactics_keywords = '|'.join(['tactic', 'formation', 'playstyle', 'strategy', 'lineup', 'set-piece', 'defense', 'offense', 'pressing', 'counter-attack', 'tiki-taka', 'tikitaka', '433', '442', '4-3-3', '4-4-2'])
transfers_keywords = '|'.join(['transfer', 'signing', 'contract', 'bid', 'deal', 'release clause', 'agent', 'loan', 'free transfer', 'termination fee'])
performances_keywords = '|'.join(['goal', 'assist', 'save', 'tackle', 'dribble', 'performance', 'man of the match', 'rating', 'brace', 'hat trick'])
rankings_keywords = '|'.join(['rankings', 'top scorer', 'best midfielder', 'best defender' 'leaderboard', 'award', 'Ballon d\'Or', 'FIFA Best', 'talent', 'prospect', 'goat', 'g.o.a.t', 'greatest', 'top 10', 'top 30'])
analysis_keywords = '|'.join(['preview', 'analysis', 'derby', 'fixture', 'head-to-head', 'odds', 'prediction', 'review', 'recap', 'post-match', 'highlight'])
decisions_keywords = '|'.join(['manager', 'coach', 'substitution', 'tactics', 'selection', 'bench', 'formation change', 'press conference', 'interview'])
StatementMeta(8aef2e4c-7819-4b3c-ad3b-441a2f26e87b, 6, 26, Finished, Available)
# Add columns with dummy variables for each topic based on the presence of keywords
df_with_topics = df.withColumn('soccer_tactics', when(col('body').rlike(tactics_keywords), 1).otherwise(0)) \
.withColumn('transfers', when(col('body').rlike(transfers_keywords), 1).otherwise(0)) \
.withColumn('player_performances', when(col('body').rlike(performances_keywords), 1).otherwise(0)) \
.withColumn('player_rankings', when(col('body').rlike(rankings_keywords), 1).otherwise(0)) \
.withColumn('match_analysis_previews', when(col('body').rlike(analysis_keywords), 1).otherwise(0)) \
.withColumn('managerial_decisions', when(col('body').rlike(decisions_keywords), 1).otherwise(0))
StatementMeta(8aef2e4c-7819-4b3c-ad3b-441a2f26e87b, 6, 27, Finished, Available)
# Filter the DataFrame to only keep records that are related to at least one topic
df_filtered = df_with_topics.filter(
(col('soccer_tactics') == 1) |
(col('transfers') == 1) |
(col('player_performances') == 1) |
(col('player_rankings') == 1) |
(col('match_analysis_previews') == 1) |
(col('managerial_decisions') == 1)
)
StatementMeta(8aef2e4c-7819-4b3c-ad3b-441a2f26e87b, 6, 28, Finished, Available)
df_filtered.count()
StatementMeta(8aef2e4c-7819-4b3c-ad3b-441a2f26e87b, 6, 29, Finished, Available)
833106
df_filtered.cache
StatementMeta(8aef2e4c-7819-4b3c-ad3b-441a2f26e87b, 6, 30, Finished, Available)
<bound method DataFrame.cache of DataFrame[author: string, author_flair_text: string, body: string, controversiality: bigint, created_utc: timestamp, gilded: bigint, score: bigint, stickied: boolean, subreddit: string, soccer_tactics: int, transfers: int, player_performances: int, player_rankings: int, match_analysis_previews: int, managerial_decisions: int]>
# Only keep the columns we want for this problem
df_filtered = df_filtered.select('body', 'gilded', 'score', 'soccer_tactics', 'transfers', 'player_performances',\
'player_rankings', 'match_analysis_previews', 'managerial_decisions')
StatementMeta(8aef2e4c-7819-4b3c-ad3b-441a2f26e87b, 6, 31, Finished, Available)
df_filtered.printSchema()
StatementMeta(8aef2e4c-7819-4b3c-ad3b-441a2f26e87b, 6, 32, Finished, Available)
root |-- body: string (nullable = true) |-- gilded: long (nullable = true) |-- score: long (nullable = true) |-- soccer_tactics: integer (nullable = false) |-- transfers: integer (nullable = false) |-- player_performances: integer (nullable = false) |-- player_rankings: integer (nullable = false) |-- match_analysis_previews: integer (nullable = false) |-- managerial_decisions: integer (nullable = false)
df_filtered.show(10)
StatementMeta(8aef2e4c-7819-4b3c-ad3b-441a2f26e87b, 6, 33, Finished, Available)
+--------------------+------+-----+--------------+---------+-------------------+---------------+-----------------------+--------------------+ | body|gilded|score|soccer_tactics|transfers|player_performances|player_rankings|match_analysis_previews|managerial_decisions| +--------------------+------+-----+--------------+---------+-------------------+---------------+-----------------------+--------------------+ |His debut feels l...| 0| 2| 0| 1| 0| 0| 0| 0| |Just remember Myk...| 0| 2| 0| 1| 0| 0| 0| 0| |Why Mudryk though...| 0| 16| 0| 0| 0| 1| 0| 1| |Well said, so sic...| 0| 2| 0| 1| 0| 0| 0| 1| |I don't think he'...| 0| 1| 0| 0| 1| 0| 0| 0| |It's the Rodri sh...| 0| 6| 0| 0| 0| 0| 1| 0| |Can't wait for Dy...| 0| 2| 0| 0| 0| 0| 0| 1| |Can we loan potte...| 0| 4| 0| 1| 0| 0| 0| 0| |That’s bs and you...| 0| 1| 0| 0| 0| 0| 0| 1| |Nice having a top...| 0| 2| 0| 0| 0| 0| 0| 1| +--------------------+------+-----+--------------+---------+-------------------+---------------+-----------------------+--------------------+ only showing top 10 rows
datapath = 'Users/wc777/fall-2023-reddit-project-team-08/data'
df_filtered.write.parquet(f"{datapath}/csv/soccer_topic.parquet")
StatementMeta(8aef2e4c-7819-4b3c-ad3b-441a2f26e87b, 6, 66, Finished, Available)
# Calculate the number of rows, average score, and number of gilded posts within each topic
topic_counts = df_filtered.groupBy().agg(
f.sum('soccer_tactics').alias('soccer_tactics_count'),
f.sum('transfers').alias('transfers_count'),
f.sum('player_performances').alias('player_performances_count'),
f.sum('player_rankings').alias('player_rankings_count'),
f.sum('match_analysis_previews').alias('match_analysis_previews_count'),
f.sum('managerial_decisions').alias('managerial_decisions_count'),
f.round(f.avg(f.when(col('soccer_tactics') == 1, col('score'))), 2).alias('avg_score_tactics'),
f.round(f.avg(f.when(col('transfers') == 1, col('score'))), 2).alias('avg_score_transfers'),
f.round(f.avg(f.when(col('player_performances') == 1, col('score'))), 2).alias('avg_score_performances'),
f.round(f.avg(f.when(col('player_rankings') == 1, col('score'))), 2).alias('avg_score_rankings'),
f.round(f.avg(f.when(col('match_analysis_previews') == 1, col('score'))), 2).alias('avg_score_analysis'),
f.round(f.avg(f.when(col('managerial_decisions') == 1, col('score'))), 2).alias('avg_score_decisions'),
f.sum(when((col('soccer_tactics') == 1) & (col('gilded') != 0), 1)).alias('gilded_tactics'),
f.sum(when((col('transfers') == 1) & (col('gilded') != 0), 1)).alias('gilded_transfers'),
f.sum(when((col('player_performances') == 1) & (col('gilded') != 0), 1)).alias('gilded_performances'),
f.sum(when((col('player_rankings') == 1) & (col('gilded') != 0), 1)).alias('gilded_rankings'),
f.sum(when((col('match_analysis_previews') == 1) & (col('gilded') != 0), 1)).alias('gilded_analysis'),
f.sum(when((col('managerial_decisions') == 1) & (col('gilded') != 0), 1)).alias('gilded_decisions')
)
StatementMeta(8aef2e4c-7819-4b3c-ad3b-441a2f26e87b, 6, 36, Finished, Available)
topic_counts_pd = topic_counts.toPandas()
StatementMeta(8aef2e4c-7819-4b3c-ad3b-441a2f26e87b, 6, 37, Finished, Available)
# Create a new dataframe for plotting
import pandas as pd
data = {
'topic': ['soccer tactics', 'transfers', 'player performances', 'player rankings', 'match analysis and previews', 'managerial decisions'],
'number of rows': [104584, 224669, 327807, 75402, 44770, 184505],
'average score': [12.28, 16.02, 15.93, 15.37, 15.12, 15.89],
'gild percentage': [0.030, 0.024, 0.053, 0.036, 0.054, 0.027]
}
pd_for_plot = pd.DataFrame(data)
pd_for_plot
StatementMeta(8aef2e4c-7819-4b3c-ad3b-441a2f26e87b, 6, 51, Finished, Available)
topic | number of rows | average score | gild percentage | |
---|---|---|---|---|
0 | soccer tactics | 104584 | 12.28 | 0.030 |
1 | transfers | 224669 | 16.02 | 0.024 |
2 | player performances | 327807 | 15.93 | 0.053 |
3 | player rankings | 75402 | 15.37 | 0.036 |
4 | match analysis and previews | 44770 | 15.12 | 0.054 |
5 | managerial decisions | 184505 | 15.89 | 0.027 |
pd_for_plot.to_csv(f'{datapath}/csv/soccer_topic_summary.csv', index=False)
StatementMeta(8aef2e4c-7819-4b3c-ad3b-441a2f26e87b, 6, 67, Finished, Available)
import matplotlib.pyplot as plt
import seaborn as sns
sns.reset_defaults()
plt.rcdefaults()
plt.figure(figsize=(20, 12))
# Create the barplot about the number of rows per topic
sns.barplot(x='topic', y='number of rows', data=pd_for_plot, palette='Set2')
plt.title('Number of Rows per Topic', fontsize=40)
plt.xlabel('Topic', fontsize=28)
plt.ylabel('Number of Rows', fontsize=28)
plt.tick_params(axis='both', labelsize=24)
plt.xticks(rotation=45)
plt.savefig('./Users/wc777/fall-2023-reddit-project-team-08/data/plots/number_of_rows_bar.png')
plt.show()
StatementMeta(8aef2e4c-7819-4b3c-ad3b-441a2f26e87b, 6, 62, Finished, Available)
# Create the barplot about the average score per topic
plt.figure(figsize=(20, 12))
sns.barplot(x='topic', y='average score', data=pd_for_plot, palette='Set2')
plt.title('Average Score per Topic', fontsize=40)
plt.xlabel('Topic', fontsize=28)
plt.ylabel('Average Score', fontsize=28)
plt.tick_params(axis='both', labelsize=24)
plt.xticks(rotation=45)
plt.savefig('./Users/wc777/fall-2023-reddit-project-team-08/data/plots/average_score_bar.png')
plt.show()
StatementMeta(8aef2e4c-7819-4b3c-ad3b-441a2f26e87b, 6, 63, Finished, Available)
# Create the barplot about the gild percentage per topic
plt.figure(figsize=(20, 12))
sns.barplot(x='topic', y='gild percentage', data=pd_for_plot, palette='Set2')
plt.title('Gild Percentage per Topic', fontsize=40)
plt.xlabel('Topic', fontsize=28)
plt.ylabel('Gild Percentage (%)', fontsize=28)
plt.tick_params(axis='both', labelsize=24)
plt.xticks(rotation=45)
plt.savefig('./Users/wc777/fall-2023-reddit-project-team-08/data/plots/gild_percentage_bar.png')
plt.show()
StatementMeta(8aef2e4c-7819-4b3c-ad3b-441a2f26e87b, 6, 64, Finished, Available)
sizes = pd_for_plot['number of rows']
labels = pd_for_plot['topic']
# Create the pie chart about proportion of posts per topic
plt.figure(figsize=(20, 12))
plt.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=140, colors=sns.color_palette('Set2'),
textprops={'fontsize': 18})
plt.title('Proportion of Posts per Soccer Topic', fontsize=40)
plt.axis('equal')
plt.savefig('./Users/wc777/fall-2023-reddit-project-team-08/data/plots/proportion_of_topics_pie.png')
plt.show()
StatementMeta(8aef2e4c-7819-4b3c-ad3b-441a2f26e87b, 6, 65, Finished, Available)