当前位置: 代码迷 >> 综合 >> Spark Streaming中序列化问题:org.apache.spark.SparkException: Task not serializable
  详细解决方案

Spark Streaming中序列化问题:org.apache.spark.SparkException: Task not serializable

热度:40   发布时间:2023-12-16 22:49:36.0

利用 Spark Streaming 实时分析数据时遇到了序列化报错,打印的日志如下:

org.apache.spark.SparkException: Task not serializable
    at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:403)
    at org.apache.spark.util.ClosureCleaner$.org$apache$spark$util$ClosureCleaner$$clean(ClosureCleaner.scala:393)
    at org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:162)
    at org.apache.spark.SparkContext.clean(SparkContext.scala:2326)
    at org.apache.spark.rdd.RDD$$anonfun$foreach$1.apply(RDD.scala:926)
    at org.apache.spark.rdd.RDD$$anonfun$foreach$1.apply(RDD.scala:925)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
    at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
    at org.apache.spark.rdd.RDD.foreach(RDD.scala:925)
    at org.apache.spark.api.java.JavaRDDLike$class.foreach(JavaRDDLike.scala:351)
    at org.apache.spark.api.java.AbstractJavaRDDLike.foreach(JavaRDDLike.scala:45)
    at com.myspark.sparkanalysis.web.WebSocketServer.lambda$1(WebSocketServer.java:54)
    at org.apache.spark.streaming.api.java.JavaDStreamLike$$anonfun$foreachRDD$1.apply(JavaDStreamLike.scala:272)
    at org.apache.spark.streaming.api.java.JavaDStreamLike$$anonfun$foreachRDD$1.apply(JavaDStreamLike.scala:272)
    at org.apache.spark.streaming.dstream.DStream$$anonfun$foreachRDD$1$$anonfun$apply$mcV$sp$3.apply(DStream.scala:628)
    at org.apache.spark.streaming.dstream.DStream$$anonfun$foreachRDD$1$$anonfun$apply$mcV$sp$3.apply(DStream.scala:628)
    at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply$mcV$sp(ForEachDStream.scala:51)
    at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:51)
    at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:51)
    at org.apache.spark.streaming.dstream.DStream.createRDDWithLocalProperties(DStream.scala:416)
    at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply$mcV$sp(ForEachDStream.scala:50)
    at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:50)
    at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:50)
    at scala.util.Try$.apply(Try.scala:192)
    at org.apache.spark.streaming.scheduler.Job.run(Job.scala:39)
    at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply$mcV$sp(JobScheduler.scala:257)
    at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:257)
    at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:257)
    at scala.util.DynamicVariable.withValue(DynamicVariable.scala:58)
    at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler.run(JobScheduler.scala:256)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
    at java.lang.Thread.run(Unknown Source)

导致以上报错的原因是:JavaSparkContext 和 JavaStreamingContext 不能被序列化。我的关键 Spark Streaming 类代码如下:

package com.myspark.sparkanalysis.service;import java.io.Serializable;
import java.util.List;import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import org.springframework.stereotype.Service;@Component
public class StreamingConfig implements Serializable,Runnable{private static final long serialVersionUID = 1L;//这里是关键,要加上 transient 关键字,表示不被序列化private transient JavaSparkContext javaSparkContext; //这里是关键,要加上 transient 关键字,表示不被序列化private transient JavaStreamingContext streamingContext = null;public StreamingConfig(@Autowired JavaSparkContext javaSparkContext) {this.javaSparkContext = javaSparkContext;}/*** 开启Stream任务* @param server* @param listenerDirectory 要监听的文件夹*/public void startStreamTask(StreamingConsumer server, String listenerDirectory) {streamingContext = new JavaStreamingContext(javaSparkContext, Durations.seconds(20));JavaDStream<String> lines = streamingContext.textFileStream(listenerDirectory);lines.map(line -> line.split(",")[2]).foreachRDD(rdd -> {//do something....List<String> collect = rdd.collect();for (String d : collect) {server.sendMessageToCient(d);}//rdd.saveAsTextFile("");});streamingContext.start();try {streamingContext.awaitTermination();streamingContext.close();} catch (InterruptedException e) {e.printStackTrace();}}/*** 手动关闭Stream*/public void destroyStreamTask() {if(streamingContext != null) {streamingContext.stop();}	}@Overridepublic void run() {//startStreamTask(StreamingConsumer server, String listenerDirectory)}
}

最后修改完后,项目正确运行起来。

  相关解决方案