# Echo the notebook-provided `spark` handle (presumably the SparkSession) to
# confirm it is available before any data is read.
spark

StatementMeta(8aef2e4c-7819-4b3c-ad3b-441a2f26e87b, 35, 21, Finished, Available)


%%configure -f \
{"conf": {"spark.jars.packages": "com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.2"}}

StatementMeta(8aef2e4c-7819-4b3c-ad3b-441a2f26e87b, 35, -1, Finished, Available)

Unrecognized options:


!pip install spark-nlp

StatementMeta(8aef2e4c-7819-4b3c-ad3b-441a2f26e87b, 35, 6, Finished, Available)

Collecting spark-nlp
  Downloading spark_nlp-5.1.4-py2.py3-none-any.whl (540 kB)
     |████████████████████████████████| 540 kB 9.0 MB/s eta 0:00:01
Installing collected packages: spark-nlp
Successfully installed spark-nlp-5.1.4


# Read the parquet dataset from the workspace's default blob container.
workspace_default_storage_account = "group08astoragec0a5c9b39"
workspace_default_container = "azureml-blobstore-8f67895d-e507-48c5-8b8e-f003f0227b44"

# Base URL layout: wasbs://<container>@<account>.blob.core.windows.net/
workspace_wasbs_base_url = f"wasbs://{workspace_default_container}@{workspace_default_storage_account}.blob.core.windows.net/"

# NOTE: "<PATH-TO-READ/WRITE>" is a placeholder and must be replaced with the
# real path inside the container before this cell can run.
input_path = f"{workspace_wasbs_base_url}<PATH-TO-READ/WRITE>"
df = spark.read.parquet(input_path)

StatementMeta(8aef2e4c-7819-4b3c-ad3b-441a2f26e87b, 35, 7, Finished, Available)


from pyspark.sql.functions import when, col, regexp_extract
import pandas as pd
import numpy as np
import json
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from sparknlp.annotator import *
from sparknlp.base import *
import sparknlp
from sparknlp.pretrained import PretrainedPipeline

StatementMeta(8aef2e4c-7819-4b3c-ad3b-441a2f26e87b, 35, 8, Finished, Available)


# Keep only rows whose `body` contains any of these case-insensitive keywords.
# NOTE(review): 'national' and 'final' are matched as plain substrings, so they
# also hit unrelated text (the duplicate-body listing later in this notebook
# shows bare "Finally" comments passing the filter) — confirm this is intended.
df_worldcup = df.filter(col('body').rlike('(?i)world cup|Qatar|fifa 2022|national|final'))

StatementMeta(8aef2e4c-7819-4b3c-ad3b-441a2f26e87b, 35, 25, Finished, Available)


# Mark the filtered frame for caching so the later counts, samples and model
# runs do not re-read and re-filter the source data each time.
# cache() must be *called*: a bare `df_worldcup.cache` only references the
# bound method and caches nothing. Caching is lazy; the first action fills it.
df_worldcup.cache()

StatementMeta(8aef2e4c-7819-4b3c-ad3b-441a2f26e87b, 35, 16, Finished, Available)

<bound method DataFrame.cache of DataFrame[author: string, author_flair_text: string, body: string, controversiality: bigint, created_utc: timestamp, gilded: bigint, score: bigint, stickied: boolean, subreddit: string]>


# Count the rows that passed the keyword filter (an action, so it triggers execution).
df_worldcup.count()

StatementMeta(8aef2e4c-7819-4b3c-ad3b-441a2f26e87b, 35, 26, Finished, Available)

157607


# Add one 0/1 indicator column per World Cup sub-topic. Each pattern is a
# case-insensitive regex applied to `body`; the boolean match is cast to int.
# The patterns have no word boundaries, so they match inside longer words.
topic_patterns = {
    "champion": "(?i)champion|Argentina|3 star|three star|del mundo",
    "players": "(?i)player|Messi|Leo|Enzo|Alvarez|Di Maria|Lautaro|De Paul|Mac Allister|Romero|Mbappe|Ronaldo|Martinez|Neymar|Kylian|Lewandowski|vvd|Modric|Kane|van Dijk|Hazard|Suarez|Kevin|De Bruyne|Muller",
    "referee": "(?i)referee|penalty|red card|yellow card|Misjudgment|VAR|offside",
}
for topic_name, topic_regex in topic_patterns.items():
    df_worldcup = df_worldcup.withColumn(
        topic_name, col("body").rlike(topic_regex).cast("int")
    )

StatementMeta(8aef2e4c-7819-4b3c-ad3b-441a2f26e87b, 35, 27, Finished, Available)


# How many comments mention (1) / do not mention (0) the champion topic.
df_worldcup.groupBy('champion').count().show()

StatementMeta(8aef2e4c-7819-4b3c-ad3b-441a2f26e87b, 35, 29, Finished, Available)

+--------+------+
|champion| count|
+--------+------+
|       1| 11027|
|       0|146580|
+--------+------+


# How many comments mention (1) / do not mention (0) the players topic.
df_worldcup.groupBy('players').count().show()

StatementMeta(8aef2e4c-7819-4b3c-ad3b-441a2f26e87b, 35, 30, Finished, Available)

+-------+------+
|players| count|
+-------+------+
|      1| 36087|
|      0|121520|
+-------+------+


# How many comments mention (1) / do not mention (0) the referee topic.
df_worldcup.groupBy('referee').count().show()

StatementMeta(8aef2e4c-7819-4b3c-ad3b-441a2f26e87b, 35, 31, Finished, Available)

+-------+------+
|referee| count|
+-------+------+
|      1|  6147|
|      0|151460|
+-------+------+


# Draw a ~1% sample without replacement to debug the pipeline cheaply,
# then report how many rows the sample contains.
df_small = df_worldcup.sample(withReplacement=False, fraction=0.01)
df_small.count()

StatementMeta(8aef2e4c-7819-4b3c-ad3b-441a2f26e87b, 35, 33, Finished, Available)

1609


# Preview five rows of the sample, including the three topic indicator columns.
df_small.show(5)

StatementMeta(8aef2e4c-7819-4b3c-ad3b-441a2f26e87b, 35, 34, Finished, Available)

+--------------------+--------------------+--------------------+----------------+-------------------+------+-----+--------+---------+--------+-------+-------+
|              author|   author_flair_text|                body|controversiality|        created_utc|gilded|score|stickied|subreddit|champion|players|referee|
+--------------------+--------------------+--------------------+----------------+-------------------+------+-----+--------+---------+--------+-------+-------+
|   Melloa_Trunk_Tree|                null|Finally,  Liverpo...|               0|2023-01-02 18:13:39|     0|    2|   false|   soccer|       0|      0|      0|
|              captdf|:Angel_City_FC: A...|WWC 2019 will hav...|               0|2023-01-26 23:42:55|     0|    2|   false|     NWSL|       0|      0|      1|
|       Cardealer1000|        :transpride:|No I'm genuinely ...|               0|2023-01-14 17:01:57|     0|    1|   false|   soccer|       0|      0|      1|
|            DickyD43|I don't give a fu...|Their UCL Finals ...|               0|2023-03-19 19:25:44|     0|   28|   false|chelseafc|       0|      0|      0|
|Professional_Ad_9101|                null|Picky has always ...|               0|2023-01-14 16:46:07|     0|    3|   false|   soccer|       0|      0|      0|
+--------------------+--------------------+--------------------+----------------+-------------------+------+-----+--------+---------+--------+-------+-------+
only showing top 5 rows


# Stage 1: wrap the raw `body` text into Spark NLP's document annotation.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("body")
    .setOutputCol("document")
)

# Stage 2: pretrained Universal Sentence Encoder -> one embedding per document.
use = (
    UniversalSentenceEncoder.pretrained(name="tfhub_use", lang="en")
    .setInputCols(["document"])
    .setOutputCol("sentence_embeddings")
)

# Stage 3: pretrained sentiment classifier that consumes the USE embeddings.
sentimentdl = (
    SentimentDLModel.pretrained(name="sentimentdl_use_twitter", lang="en")
    .setInputCols(["sentence_embeddings"])
    .setOutputCol("sentiment")
)

# Chain the three stages in the order their inputs/outputs require.
pipeline_stages = [documentAssembler, use, sentimentdl]
nlpPipeline = Pipeline(stages=pipeline_stages)

StatementMeta(8aef2e4c-7819-4b3c-ad3b-441a2f26e87b, 35, 12, Finished, Available)

tfhub_use download started this may take some time.
Approximate size to download 923.7 MB
[OK!]
sentimentdl_use_twitter download started this may take some time.
Approximate size to download 11.4 MB
[OK!]


# Use the small dataset for sentiment analysis

# Only the text column is sent through the pipeline.
content = df_small.select('body')
# fit() yields the PipelineModel; transform() appends the `document`,
# `sentence_embeddings` and `sentiment` annotation columns to `content`.
pipelineModel = nlpPipeline.fit(content)
result = pipelineModel.transform(content)

StatementMeta(8aef2e4c-7819-4b3c-ad3b-441a2f26e87b, 35, 35, Finished, Available)


# Preview five rows of the pipeline output (body plus one annotation column per stage).
result.show(5)

StatementMeta(8aef2e4c-7819-4b3c-ad3b-441a2f26e87b, 35, 36, Finished, Available)

+--------------------+--------------------+--------------------+--------------------+
|                body|            document| sentence_embeddings|           sentiment|
+--------------------+--------------------+--------------------+--------------------+
|Finally,  Liverpo...|[{document, 0, 44...|[{sentence_embedd...|[{category, 0, 44...|
|WWC 2019 will hav...|[{document, 0, 45...|[{sentence_embedd...|[{category, 0, 45...|
|No I'm genuinely ...|[{document, 0, 43...|[{sentence_embedd...|[{category, 0, 43...|
|Their UCL Finals ...|[{document, 0, 80...|[{sentence_embedd...|[{category, 0, 80...|
|Picky has always ...|[{document, 0, 53...|[{sentence_embedd...|[{category, 0, 53...|
+--------------------+--------------------+--------------------+--------------------+
only showing top 5 rows


# Flatten the sentiment annotation's `result` array to one label per row.
# explode() is not aliased here, so its output column gets the default name `col`.
senti = result.select('body', F.explode('sentiment.result'))

StatementMeta(8aef2e4c-7819-4b3c-ad3b-441a2f26e87b, 35, 41, Finished, Available)


# Preview five (body, predicted label) rows.
senti.show(5)

StatementMeta(8aef2e4c-7819-4b3c-ad3b-441a2f26e87b, 35, 42, Finished, Available)

+--------------------+--------+
|                body|     col|
+--------------------+--------+
|Finally,  Liverpo...|negative|
|WWC 2019 will hav...|positive|
|No I'm genuinely ...|positive|
|Their UCL Finals ...|positive|
|Picky has always ...|negative|
+--------------------+--------+
only showing top 5 rows


# Attach the predicted sentiment to each sampled row's metadata.
# The full sample is scored directly instead of joining the labels back on
# `body`: comment text is not unique, so a text-keyed join multiplies repeated
# comments and can pair a label with another row's timestamp and flags.
# transform() keeps every input column, so no join is needed at all.
small_join = pipelineModel.transform(df_small).select(
    F.explode('sentiment.result').alias('sentiment'),
    'created_utc',
    'champion',
    'players',
    'referee',
)

StatementMeta(8aef2e4c-7819-4b3c-ad3b-441a2f26e87b, 35, 43, Finished, Available)


# Preview five rows of sentiment label plus timestamp and topic flags.
small_join.show(5)

StatementMeta(8aef2e4c-7819-4b3c-ad3b-441a2f26e87b, 35, 44, Finished, Available)

+---------+-------------------+--------+-------+-------+
|sentiment|        created_utc|champion|players|referee|
+---------+-------------------+--------+-------+-------+
| negative|2023-01-02 18:13:39|       0|      0|      0|
| positive|2023-01-26 23:42:55|       0|      0|      1|
| positive|2023-01-14 17:01:57|       0|      0|      1|
| positive|2023-03-19 19:25:44|       0|      0|      0|
| negative|2023-01-14 16:46:07|       0|      0|      0|
+---------+-------------------+--------+-------+-------+
only showing top 5 rows


# Now repeat the process on the larger data

# Same steps as for the sample, applied to every filtered comment's `body`.
content = df_worldcup.select('body')
pipelineModel = nlpPipeline.fit(content)
# transform() is lazy; the models only run when a later action is executed.
result = pipelineModel.transform(content)

StatementMeta(8aef2e4c-7819-4b3c-ad3b-441a2f26e87b, 35, 45, Finished, Available)


# One predicted label per row; the exploded column keeps the default name `col`.
senti = result.select('body', F.explode('sentiment.result'))

StatementMeta(8aef2e4c-7819-4b3c-ad3b-441a2f26e87b, 35, 46, Finished, Available)


# Row count of the exploded sentiment frame.
senti.count()

StatementMeta(8aef2e4c-7819-4b3c-ad3b-441a2f26e87b, 35, 54, Finished, Available)

157607


# Preview five (body, predicted label) rows from the full run.
senti.show(5)

StatementMeta(8aef2e4c-7819-4b3c-ad3b-441a2f26e87b, 35, 48, Finished, Available)

+--------------------+--------+
|                body|     col|
+--------------------+--------+
|I mean, not reall...|negative|
|scored the cl fin...| neutral|
|The youth of toda...|positive|
|Ah fair, I'm supp...|negative|
|Beckenbauer playe...|positive|
+--------------------+--------+
only showing top 5 rows


# List the ten most frequently repeated comment bodies. Identical text occurs
# many times, so `body` cannot serve as a unique join key.
senti.groupBy('body').count().orderBy(col('count').desc()).show(10)

StatementMeta(8aef2e4c-7819-4b3c-ad3b-441a2f26e87b, 35, 56, Finished, Available)

+--------------------+-----+
|                body|count|
+--------------------+-----+
|Today I feel Qata...|  936|
|Hello! Thanks for...|  444|
|             Finally|  171|
|Hello! Thanks for...|  165|
|Jinx? We are feck...|  128|
|             FINALLY|   66|
|     Fucking finally|   57|
|Please have a loo...|   53|
|Your post has bee...|   49|
|            Finally!|   41|
+--------------------+-----+
only showing top 10 rows


# Attach the predicted sentiment to every comment's metadata.
# Generating monotonically_increasing_id() on two separately derived
# DataFrames and joining on it is unsafe: the ids depend on partitioning and
# evaluation, so there is no guarantee that row k of one frame receives the
# same id as row k of the other. Scoring the full frame keeps each label on
# the row it was computed from, because transform() preserves input columns.
# The exploded label is named `col` so the following select/rename still works.
df_join = pipelineModel.transform(df_worldcup).select(
    *df_worldcup.columns,
    F.explode('sentiment.result').alias('col'),
)

StatementMeta(8aef2e4c-7819-4b3c-ad3b-441a2f26e87b, 35, 58, Finished, Available)


# Keep only the columns used downstream, exposing the exploded label
# column `col` under the name `sentiment`.
df_join = df_join.select(
    F.col('col').alias('sentiment'),
    'created_utc',
    'champion',
    'players',
    'referee',
)

StatementMeta(8aef2e4c-7819-4b3c-ad3b-441a2f26e87b, 35, 62, Finished, Available)


# Row count after the join and column selection.
df_join.count()

StatementMeta(8aef2e4c-7819-4b3c-ad3b-441a2f26e87b, 35, 59, Finished, Available)

157607


# Collect the Spark result into a pandas DataFrame for the pandas/matplotlib steps below.
df_pd = df_join.toPandas()

StatementMeta(8aef2e4c-7819-4b3c-ad3b-441a2f26e87b, 35, 78, Finished, Available)


# Base output directory; the cells below write to its `plots/` and `csv/`
# subfolders. The path is relative, so it resolves against the current
# working directory.
datapath = 'Users/wc777/fall-2023-reddit-project-team-08/data'

StatementMeta(8aef2e4c-7819-4b3c-ad3b-441a2f26e87b, 35, 70, Finished, Available)


# Preview the first ten rows of the pandas frame.
df_pd.head(10)

StatementMeta(8aef2e4c-7819-4b3c-ad3b-441a2f26e87b, 35, 64, Finished, Available)


# Show each column's dtype.
df_pd.dtypes

StatementMeta(8aef2e4c-7819-4b3c-ad3b-441a2f26e87b, 35, 68, Finished, Available)

sentiment                  object
created_utc        datetime64[ns]
champion                    int32
players                     int32
referee                     int32
sentiment_score             int64
dtype: object


import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from pandas.tseries.offsets import Week
# Restore seaborn's and matplotlib's default rc settings before plotting.
sns.reset_defaults()
plt.rcdefaults()

# Reshape to long form: one row per (post, sub-topic) pair, then keep only the
# pairs whose sub-topic flag is set.
topic_columns = ['champion', 'players', 'referee']
melted_df = df_pd.melt(
    id_vars=['sentiment'],
    value_vars=topic_columns,
    var_name='category',
    value_name='mentioned',
)
melted_df = melted_df.loc[melted_df['mentioned'] == 1]

# Grouped bar chart: number of posts per sub-topic, split by predicted sentiment.
fig, ax = plt.subplots(figsize=(20, 12))
sns.countplot(data=melted_df, x='category', hue='sentiment', palette='Set2', ax=ax)
ax.set_title('Category-wise Sentiment Analysis', fontsize=40)
ax.set_ylabel('Number of Posts', fontsize=28)
ax.set_xlabel('Category', fontsize=28)
ax.tick_params(axis='both', labelsize=24)
ax.legend(fontsize=24, loc='upper right')

# Save before show() so the written file contains the rendered figure.
fig.savefig(f'{datapath}/plots/worldcup_sentiment_count_by_category.png')
plt.show()

StatementMeta(8aef2e4c-7819-4b3c-ad3b-441a2f26e87b, 35, 128, Finished, Available)


# Build a category x sentiment count table with an extra 'total' row that
# covers every post, whether or not it mentions a sub-topic.
df_melted_total = df_pd.assign(category='total')[['sentiment', 'category']]
melted_df = melted_df.drop(columns=['mentioned'])
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported way to stack the two frames.
melted_df = pd.concat([melted_df, df_melted_total], ignore_index=True)

value_count_table = pd.crosstab(index=melted_df['category'], columns=melted_df['sentiment'])
value_count_table

StatementMeta(8aef2e4c-7819-4b3c-ad3b-441a2f26e87b, 35, 122, Finished, Available)


# Persist the category-by-sentiment count table as CSV (the category index is written too).
value_count_table.to_csv(f'{datapath}/csv/worldcup_count_table.csv')

StatementMeta(8aef2e4c-7819-4b3c-ad3b-441a2f26e87b, 35, 120, Finished, Available)


# Build a per-week table of total posts, posts per sentiment class, and the
# share of positive posts, then save it as CSV.
df_week = df_pd.set_index('created_utc')

# Total number of posts in each weekly bin.
weekly_counts = df_week.resample('W').size().rename('total_posts')

# Posts per sentiment class in each weekly bin. The columns are reindexed
# explicitly so all three classes exist, in a fixed order, even if one class
# never occurs in the data.
sentiment_labels = ['negative', 'neutral', 'positive']
weekly_sentiment_counts = (
    df_week.resample('W')['sentiment']
    .value_counts()
    .unstack()
    .reindex(columns=sentiment_labels)
    .fillna(0)
)

# Combine the two tables on the shared weekly index.
weekly_data = pd.concat([weekly_counts, weekly_sentiment_counts], axis=1)

# Percentage of positive posts. The column is created under its final name
# instead of relabelling every column by position afterwards, which would
# silently mislabel data if the column order ever changed.
weekly_data['positive_percentage(%)'] = (weekly_data['positive'] / weekly_data['total_posts']) * 100

# Expose the weekly index as a regular 'week' column.
weekly_data = weekly_data.rename_axis('week').reset_index()
weekly_data.to_csv(f'{datapath}/csv/worldcup_weekly_count.csv', index=False)
weekly_data

StatementMeta(8aef2e4c-7819-4b3c-ad3b-441a2f26e87b, 35, 129, Finished, Available)


# Time-series figure: weekly post counts on the left axis, positive share on
# the right axis.
fig, ax1 = plt.subplots(figsize=(20, 12))

# (column in weekly_data, legend label, line colour), drawn in this order.
count_series = [
    ('total_posts', 'Total Posts', 'blue'),
    ('positive', 'Positive Posts', 'green'),
    ('negative', 'Negative Posts', 'red'),
    ('neutral', 'Neutral Posts', 'orange'),
]
weeks = weekly_data['week']
for series_name, series_label, series_color in count_series:
    ax1.plot(weeks, weekly_data[series_name], label=series_label, color=series_color, linewidth=2.4)

ax1.set_xlabel('Week of the Year', fontsize=28)
ax1.set_ylabel('Number of Posts', fontsize=28)
ax1.set_title('Weekly Sentiment Post Counts and Positive Percentage', fontsize=40)
# Fixed count range chosen for this dataset.
ax1.set_ylim(0, 20000)

# Secondary y-axis sharing the same x-axis, used for the positive percentage.
ax2 = ax1.twinx()
ax2.plot(weeks, weekly_data['positive_percentage(%)'], label='Positive Percentage', color='purple', linestyle='--')
ax2.set_ylabel('Positive Percentage (%)', fontsize=28)
# Fixed percentage range chosen for this dataset.
ax2.set_ylim(25, 55)

# Each axis carries its own legend, placed in opposite corners.
ax1.legend(fontsize=20, loc='upper left')
ax2.legend(fontsize=20, loc='upper right')
for axis in (ax1, ax2):
    axis.tick_params(axis='both', labelsize=16)

plt.savefig(f'{datapath}/plots/worldcup_weekly_sentiment_post_counts.png')
plt.show()

StatementMeta(8aef2e4c-7819-4b3c-ad3b-441a2f26e87b, 35, 130, Finished, Available)

sentiment	negative	neutral	positive
category
champion	4976	637	5414
players	19230	2104	14753
referee	3963	307	1877
total	85860	8408	63339

	week	total_posts	negative	neutral	positive	positive_percentage(%)
0	2023-01-01	1611	868	83	660	40.968343
1	2023-01-08	12624	7095	648	4881	38.664449
2	2023-01-15	12644	7010	658	4976	39.354635
3	2023-01-22	12486	6883	676	4927	39.460195
4	2023-01-29	10791	5666	611	4514	41.831156
5	2023-02-05	8043	4374	423	3246	40.358075
6	2023-02-12	12529	6796	642	5091	40.633730
7	2023-02-19	15376	8656	821	5899	38.364984
8	2023-02-26	12475	6697	739	5039	40.392786
9	2023-03-05	11827	6126	620	5081	42.961021
10	2023-03-12	12381	6655	656	5070	40.949843
11	2023-03-19	14720	7740	801	6179	41.976902
12	2023-03-26	12656	7183	693	4780	37.768647
13	2023-04-02	7444	4111	337	2996	40.247179

Launch spark environment¶

Create three dummy variables about different sub-topics within World Cup¶

Create a small data sample for code debugging¶

Utilize a pre-trained model and predict the sentiment category using pipelines¶

Repeat the process on the real data¶

Convert to a pandas dataframe¶

Create a grouped barplot to display the sentiment result for each sub-topic¶

Create a time series plot to show how the numbers of total posts, positive posts, negative posts, neutral posts, and the percentage of positive posts vary over time¶

	sentiment	created_utc	champion	players	referee
0	positive	2023-01-27 21:08:10	1	0	0
1	negative	2023-01-27 21:12:32	0	0	0
2	positive	2023-01-15 13:16:40	0	0	0
3	positive	2023-03-12 15:42:34	1	0	0
4	negative	2023-01-15 17:32:26	0	0	0
5	positive	2023-01-25 11:13:24	0	0	1
6	negative	2023-02-04 17:45:51	0	0	0
7	negative	2023-03-05 18:52:51	0	0	0
8	negative	2023-02-14 22:25:51	0	1	0
9	positive	2023-02-26 16:54:25	0	1	0

Launch spark environment¶

Filter the posts related to World Cup 2022¶

Create three dummy variables about different sub-topics within World Cup¶

Create a small data sample for code debugging¶

Utilize a pre-trained model and predict the sentiment category using pipelines¶

Repeat the process on the real data¶

Convert to a pandas dataframe¶

Create a grouped barplot to display the sentiment result for each sub-topic¶

Create a time series plot to show how the numbers of total posts, positive posts, negative posts, neutral posts, and the percentage of positive posts vary over time¶