ICode9

精准搜索请尝试: 精确搜索
首页 > 其他分享> 文章详细

Spark Dataset DataFrame空值null,NaN判断和处理

2021-06-21 16:03:38  阅读:226  来源: 互联网

标签:null no gender NaN DataFrame Dataset female yes male


import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.Row
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.Column
import org.apache.spark.sql.DataFrameReader
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.Encoder
import org.apache.spark.sql.functions._
import org.apache.spark.sql.DataFrameStatFunctions
import org.apache.spark.ml.linalg.Vectors
 
 
math.sqrt(-1.0)
res43: Double = NaN
    
math.sqrt(-1.0).isNaN()
res44: Boolean = true
   
   
val data1 = data.toDF("affairs", "gender", "age", "yearsmarried", "children", "religiousness", "education", "occupation", "rating")
data1: org.apache.spark.sql.DataFrame = [affairs: string, gender: string ... 7 more fields]
    
data1.limit(10).show
+-------+------+---+------------+--------+-------------+---------+----------+------+
|affairs|gender|age|yearsmarried|children|religiousness|education|occupation|rating|
+-------+------+---+------------+--------+-------------+---------+----------+------+
|      0|  male| 37|          10|      no|            3|       18|         7|     4|
|      0|  null| 27|        null|      no|            4|       14|         6|  null|
|      0|  null| 32|        null|     yes|            1|       12|         1|  null|
|      0|  null| 57|        null|     yes|            5|       18|         6|  null|
|      0|  null| 22|        null|      no|            2|       17|         6|  null|
|      0|  null| 32|        null|      no|            2|       17|         5|  null|
|      0|female| 22|        null|      no|            2|       12|         1|  null|
|      0|  male| 57|          15|     yes|            2|       14|         4|     4|
|      0|female| 32|          15|     yes|            4|       16|         1|     2|
|      0|  male| 22|         1.5|      no|            4|       14|         4|     5|
+-------+------+---+------------+--------+-------------+---------+----------+------+
    
 // 删除任意一列含有空值(null)或NaN的行
val resNull=data1.na.drop()
resNull: org.apache.spark.sql.DataFrame = [affairs: string, gender: string ... 7 more fields]
    
 resNull.limit(10).show()
+-------+------+---+------------+--------+-------------+---------+----------+------+
|affairs|gender|age|yearsmarried|children|religiousness|education|occupation|rating|
+-------+------+---+------------+--------+-------------+---------+----------+------+
|      0|  male| 37|          10|      no|            3|       18|         7|     4|
|      0|  male| 57|          15|     yes|            2|       14|         4|     4|
|      0|female| 32|          15|     yes|            4|       16|         1|     2|
|      0|  male| 22|         1.5|      no|            4|       14|         4|     5|
|      0|  male| 37|          15|     yes|            2|       20|         7|     2|
|      0|  male| 27|           4|     yes|            4|       18|         6|     4|
|      0|  male| 47|          15|     yes|            5|       17|         6|     4|
|      0|female| 22|         1.5|      no|            2|       17|         5|     4|
|      0|female| 27|           4|      no|            4|       14|         5|     4|
|      0|female| 37|          15|     yes|            1|       17|         5|     5|
+-------+------+---+------------+--------+-------------+---------+----------+------+
    
 // 删除指定列(gender、yearsmarried)中含有空值(null)或NaN的行
val res=data1.na.drop(Array("gender","yearsmarried"))
 
// 删除在指定列中非空且非NaN的值少于10个的行(第一个参数为minNonNulls;此处只指定了2列,因此所有行都会被删除)
data1.na.drop(10,Array("gender","yearsmarried"))
    
    
 // 用指定字符串填充所有字符串类型列中的空值(na.fill(String)只作用于字符串类型的列)
val res123=data1.na.fill("wangxiao123")
res123: org.apache.spark.sql.DataFrame = [affairs: string, gender: string ... 7 more fields]
    
 res123.limit(10).show()
+-------+-----------+---+------------+--------+-------------+---------+----------+-----------+
|affairs|     gender|age|yearsmarried|children|religiousness|education|occupation|     rating|
+-------+-----------+---+------------+--------+-------------+---------+----------+-----------+
|      0|       male| 37|          10|      no|            3|       18|         7|          4|
|      0|wangxiao123| 27| wangxiao123|      no|            4|       14|         6|wangxiao123|
|      0|wangxiao123| 32| wangxiao123|     yes|            1|       12|         1|wangxiao123|
|      0|wangxiao123| 57| wangxiao123|     yes|            5|       18|         6|wangxiao123|
|      0|wangxiao123| 22| wangxiao123|      no|            2|       17|         6|wangxiao123|
|      0|wangxiao123| 32| wangxiao123|      no|            2|       17|         5|wangxiao123|
|      0|     female| 22| wangxiao123|      no|            2|       12|         1|wangxiao123|
|      0|       male| 57|          15|     yes|            2|       14|         4|          4|
|      0|     female| 32|          15|     yes|            4|       16|         1|          2|
|      0|       male| 22|         1.5|      no|            4|       14|         4|          5|
+-------+-----------+---+------------+--------+-------------+---------+----------+-----------+
    
 //对指定的列空值填充
 val res2=data1.na.fill(value="wangxiao111",cols=Array("gender","yearsmarried") )
res2: org.apache.spark.sql.DataFrame = [affairs: string, gender: string ... 7 more fields]
    
 res2.limit(10).show()
+-------+-----------+---+------------+--------+-------------+---------+----------+------+
|affairs|     gender|age|yearsmarried|children|religiousness|education|occupation|rating|
+-------+-----------+---+------------+--------+-------------+---------+----------+------+
|      0|       male| 37|          10|      no|            3|       18|         7|     4|
|      0|wangxiao111| 27| wangxiao111|      no|            4|       14|         6|  null|
|      0|wangxiao111| 32| wangxiao111|     yes|            1|       12|         1|  null|
|      0|wangxiao111| 57| wangxiao111|     yes|            5|       18|         6|  null|
|      0|wangxiao111| 22| wangxiao111|      no|            2|       17|         6|  null|
|      0|wangxiao111| 32| wangxiao111|      no|            2|       17|         5|  null|
|      0|     female| 22| wangxiao111|      no|            2|       12|         1|  null|
|      0|       male| 57|          15|     yes|            2|       14|         4|     4|
|      0|     female| 32|          15|     yes|            4|       16|         1|     2|
|      0|       male| 22|         1.5|      no|            4|       14|         4|     5|
+-------+-----------+---+------------+--------+-------------+---------+----------+------+
    
    
val res3=data1.na.fill(Map("gender"->"wangxiao222","yearsmarried"->"wangxiao567") )
res3: org.apache.spark.sql.DataFrame = [affairs: string, gender: string ... 7 more fields]
    
 res3.limit(10).show()
+-------+-----------+---+------------+--------+-------------+---------+----------+------+
|affairs|     gender|age|yearsmarried|children|religiousness|education|occupation|rating|
+-------+-----------+---+------------+--------+-------------+---------+----------+------+
|      0|       male| 37|          10|      no|            3|       18|         7|     4|
|      0|wangxiao222| 27| wangxiao567|      no|            4|       14|         6|  null|
|      0|wangxiao222| 32| wangxiao567|     yes|            1|       12|         1|  null|
|      0|wangxiao222| 57| wangxiao567|     yes|            5|       18|         6|  null|
|      0|wangxiao222| 22| wangxiao567|      no|            2|       17|         6|  null|
|      0|wangxiao222| 32| wangxiao567|      no|            2|       17|         5|  null|
|      0|     female| 22| wangxiao567|      no|            2|       12|         1|  null|
|      0|       male| 57|          15|     yes|            2|       14|         4|     4|
|      0|     female| 32|          15|     yes|            4|       16|         1|     2|
|      0|       male| 22|         1.5|      no|            4|       14|         4|     5|
+-------+-----------+---+------------+--------+-------------+---------+----------+------+
    
 // 查询gender列为空值(null)的行
data1.filter("gender is null").select("gender").limit(10).show
+------+
|gender|
+------+
|  null|
|  null|
|  null|
|  null|
|  null|
+------+
    
    
 data1.filter("gender is not null").select("gender").limit(10).show
+------+
|gender|
+------+
|  male|
|female|
|  male|
|female|
|  male|
|  male|
|  male|
|  male|
|female|
|female|
+------+
    
    
 data1.filter( data1("gender").isNull ).select("gender").limit(10).show
+------+
|gender|
+------+
|  null|
|  null|
|  null|
|  null|
|  null|
+------+
    
    
 data1.filter("gender<>''").select("gender").limit(10).show
+------+
|gender|
+------+
|  male|
|female|
|  male|
|female|
|  male|
|  male|
|  male|
|  male|
|female|
|female|
+------+

标签:null,no,gender,NaN,DataFrame,Dataset,female,yes,male
来源: https://blog.51cto.com/u_15278282/2931962

本站声明: 1. iCode9 技术分享网(下文简称本站)提供的所有内容,仅供技术学习、探讨和分享;
2. 关于本站的所有留言、评论、转载及引用,纯属内容发起人的个人观点,与本站观点和立场无关;
3. 关于本站的所有言论和文字,纯属内容发起人的个人观点,与本站观点和立场无关;
4. 本站文章均是网友提供,不完全保证技术分享内容的完整性、准确性、时效性、风险性和版权归属;如您发现该文章侵犯了您的权益,可联系我们第一时间进行删除;
5. 本站为非盈利性的个人网站,所有内容不会用来进行牟利,也不会利用任何形式的广告来间接获益,纯粹是为了广大技术爱好者提供技术内容和技术思想的分享性交流网站。

专注分享技术,共同学习,共同进步。侵权联系[81616952@qq.com]

Copyright (C)ICode9.com, All Rights Reserved.

ICode9版权所有