Implementing WordCount with an Accumulator
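This post shows how to implement WordCount with a custom Spark accumulator: extend AccumulatorV2 so the accumulator carries a word-to-count HashMap, register it with the SparkContext, and feed words into it from a foreach action.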
package util

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.util.AccumulatorV2

import scala.collection.mutable

// Custom accumulator: the input type is String (one word), the output
// type is mutable.HashMap[String, Int] (word -> count).
class MyAccumulator extends AccumulatorV2[String, mutable.HashMap[String, Int]] {

  private val _hashAcc = new mutable.HashMap[String, Int]()

  // The accumulator is "zero" when no word has been counted yet.
  override def isZero: Boolean = _hashAcc.isEmpty

  // Spark copies the accumulator before shipping it to each task.
  override def copy(): AccumulatorV2[String, mutable.HashMap[String, Int]] = {
    val newAcc = new MyAccumulator()
    newAcc._hashAcc ++= _hashAcc
    newAcc
  }

  override def reset(): Unit = _hashAcc.clear()

  // Runs on the executors: increment the count for one word.
  override def add(v: String): Unit = {
    _hashAcc.get(v) match {
      case None    => _hashAcc += ((v, 1))
      case Some(a) => _hashAcc += ((v, a + 1))
    }
  }

  // Runs on the driver: fold another partition's counts into this one.
  // Matching on MyAccumulator (rather than the erased generic type)
  // avoids an unchecked-match warning.
  override def merge(other: AccumulatorV2[String, mutable.HashMap[String, Int]]): Unit = {
    other match {
      case o: MyAccumulator =>
        for ((k, v) <- o.value) {
          _hashAcc.get(k) match {
            case None    => _hashAcc += ((k, v))
            case Some(a) => _hashAcc += ((k, v + a))
          }
        }
      case _ =>
        throw new UnsupportedOperationException(
          s"Cannot merge ${this.getClass.getName} with ${other.getClass.getName}")
    }
  }

  override def value: mutable.HashMap[String, Int] = _hashAcc
}
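A note on how Spark drives these methods: each task receives a zero-valued copy of the registered accumulator (produced via copy() and reset()), calls add() locally on its partition, and the driver folds the finished copies back into the original instance with merge(). One caveat of the implementation above: value returns the internal mutable map directly, so callers can mutate the accumulator's state from outside; returning a clone would be safer.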
object WordCount {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("demo")
    val sc = new SparkContext(conf)

    val line = sc.parallelize(List("a", "b", "c", "d", "e", "f", "a", "a", "b", "c"))

    // Register the accumulator with the SparkContext before using it.
    val acc = new MyAccumulator()
    sc.register(acc, "test")

    // foreach is an action, so add() runs once per element on the executors.
    line.foreach(acc.add(_))

    // Print the word counts in descending order of frequency.
    for ((k, v) <- acc.value.toList.sortBy(_._2).reverse) {
      println(k + "," + v)
    }

    sc.stop()
  }
}
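Run locally, this prints the counts in descending order; for the sample data that works out to a,3 / b,2 / c,2 / d,1 / e,1 / f,1 (the relative order of equal counts may vary, since HashMap iteration order is unspecified).

To count words from a real file instead of an in-memory list, split each line into words before adding them. A minimal sketch, assuming a hypothetical input.txt and a freshly reset accumulator:

// "input.txt" is a hypothetical path; reset the accumulator
// (or register a fresh one) before reusing it for a second count.
val words = sc.textFile("input.txt")
  .flatMap(_.split("\\s+"))   // split each line on whitespace
  .filter(_.nonEmpty)         // drop empty tokens
words.foreach(acc.add(_))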