Let’s look at the various ways to handle missing data (null values) in a Spark DataFrame.
import pyspark
from pyspark.sql import SparkSession

# Create (or reuse) a SparkSession — the entry point for DataFrame operations.
# NOTE: the original snippet used curly "smart" quotes (‘…’), which are a
# SyntaxError in Python; they are replaced with plain ASCII quotes here.
spark = SparkSession.builder.appName('NULL_Handling').getOrCreate()
print('NULL_Handling')

# Load the sample CSV. header=True uses the first row as column names;
# inferSchema=True makes Spark scan the data to guess column types
# (convenient for a tutorial, but an extra pass over the file).
# Raw string (r'...') keeps the Windows backslashes literal.
null_df = spark.read.csv(r'D:\python_coding\pyspark_tutorial\Nulls.csv',
                         header=True, inferSchema=True)
null_df.show()
Dataset
# DataFrame.na exposes DataFrameNaFunctions for handling missing values.
# drop() with no arguments uses how='any': a row is dropped if it contains
# at least one null value in any column.
null_df.na.drop().show()
drop nulls
# thresh=2 keeps only rows that have at least 2 NON-null values;
# rows with fewer than 2 non-null values are dropped.
# (Note: thresh counts non-null values, not nulls — the original comment
# "rows having nulls greater than 2 are dropped" had it backwards.)
null_df.na.drop(thresh=2).show()
#apache-spark #python #big-data-analytics #pyspark #programming