/*
 * Spark 3.0 tutorial example (Scala): per-gender head count, tallest and shortest height.
 * Original article by Ma Yumin: http://www.malaoshi.top/show_1GW2Q1cClJWw.html
 *
 * Data format: one record per line, three space-separated columns
 *   1. sequence number
 *   2. gender ("F" or "M")
 *   3. height (centimetres, according to the generator's own comments)
 *
 * Example:
 *   1 F 170
 *   2 M 178
 *   3 M 174
 *   4 F 165
 *
 * How to run: execute GeneratePeopleInfoHDFS first to produce the data set, then run
 * CountPeopleInfo with the data path as its program argument. The article illustrates
 * the IDE run configuration with two screenshots:
 *   https://www.malaoshi.top/upload/0/0/1GW2Q0qFTMCO.png
 *   https://www.malaoshi.top/upload/0/0/1GW2Q0qc8YSR.png
 *
 * Sample result reported by the article:
 *   Number of Male:494
 *   Number of Female:506
 *   Lowest Male:100
 *   Lowest Female:100
 *   HighestMale:199
 *   HighestFemale:188
 */
package top.malaoshi.sex

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

import scala.util.Random
import scala.util.Try

/** Generates a random "id gender height" data set and writes it out as a Spark text file. */
object GeneratePeopleInfoHDFS {

  /** Number of records to generate. */
  private val RecordCount = 1000

  /** Output location used when no path is passed on the command line. */
  private val DefaultOutputPath = "data/person_height"

  // One shared generator instead of allocating a new Random on every call.
  private val rand = new Random()

  /**
   * Picks a gender at random.
   *
   * @return "M" or "F", each with equal probability
   */
  def getRandomGender(): String =
    if (rand.nextBoolean()) "M" else "F"

  /**
   * Draws a random height for the given gender.
   *
   * A raw value in 0..189 is drawn; values below 50 are lifted by 50. Anything still
   * below 100 is then lifted by 100 for "M" and by 50 for any other gender, so every
   * result is at least 100.
   *
   * @param gender "M" or "F"
   * @return the generated height
   */
  private def randomHeight(gender: String): Int = {
    val raw = rand.nextInt(190)
    val lifted = if (raw < 50) raw + 50 else raw
    if (lifted >= 100) lifted
    else if (gender == "M") lifted + 100
    else lifted + 50
  }

  /**
   * Entry point: builds the records, prints them and saves them as a text file.
   *
   * @param args optional; args(0) overrides the output path (defaults to
   *             "data/person_height"), e.g. an hdfs:// URI whose host and port match
   *             the cluster configuration
   */
  def main(args: Array[String]): Unit = {
    val path = args.headOption.getOrElse(DefaultOutputPath)
    val conf = new SparkConf().setAppName("GeneratePersonHeightHDFS").setMaster("local[2]")
    val sc = new SparkContext(conf)
    try {
      val records = (1 to RecordCount).map { id =>
        val gender = getRandomGender()
        s"$id $gender ${randomHeight(gender)}"
      }
      val rdd = sc.parallelize(records)
      rdd.foreach(println)
      // NOTE(review): with default settings Spark is expected to refuse to write into an
      // already existing output directory - remove it before re-running. Confirm.
      rdd.saveAsTextFile(path)
    } finally {
      // Release the context even when the job fails.
      sc.stop()
    }
  }
}

/** Reports, per gender, the number of people and the lowest and highest height. */
object CountPeopleInfo {

  /** Running aggregate for one gender: (head count, lowest height, highest height). */
  private type Stats = (Long, Int, Int)

  /** Neutral element of the aggregation. */
  private val EmptyStats: Stats = (0L, Int.MaxValue, Int.MinValue)

  /**
   * Parses one "id gender height" line.
   *
   * The gender is taken from the second column (not matched as a substring of the whole
   * line) and must be "M" or "F"; the third column must be an integer. Extra trailing
   * columns are ignored.
   *
   * @param line raw input line
   * @return Some((gender, height)) for a well-formed line, None otherwise
   */
  private def parseRecord(line: String): Option[(String, Int)] =
    line.trim.split("\\s+") match {
      case Array(_, gender @ ("M" | "F"), height, _*) =>
        Try(height.toInt).toOption.map(h => (gender, h))
      case _ => None
    }

  /** Folds one height into a running aggregate. */
  private def addHeight(stats: Stats, height: Int): Stats =
    (stats._1 + 1, math.min(stats._2, height), math.max(stats._3, height))

  /** Combines two partial aggregates of the same gender. */
  private def mergeStats(a: Stats, b: Stats): Stats =
    (a._1 + b._1, math.min(a._2, b._2), math.max(a._3, b._3))

  /**
   * Entry point.
   *
   * Reads the file once and computes count, minimum and maximum per gender in a single
   * aggregation instead of six separate actions with four full sorts. Malformed lines
   * are skipped. A gender with no records is reported with count 0 and "N/A" heights
   * instead of failing on an empty RDD.
   *
   * @param args args(0) is the input file path; the program prints a usage message and
   *             exits with status 1 when it is missing
   */
  def main(args: Array[String]): Unit = {
    if (args.length < 1) {
      println("Usage: input file path")
      sys.exit(1)
    }
    val conf = new SparkConf().setAppName("CountPeopleInfo").setMaster("local")
    val sc = new SparkContext(conf)
    try {
      val statsByGender = sc
        .textFile(args(0), 3)
        .flatMap(line => parseRecord(line).toSeq)
        .aggregateByKey(EmptyStats)(addHeight, mergeStats)
        .collectAsMap()

      val male = statsByGender.get("M")
      val female = statsByGender.get("F")

      def headCount(stats: Option[Stats]): Long = stats.fold(0L)(_._1)
      def lowest(stats: Option[Stats]): String = stats.fold("N/A")(_._2.toString)
      def highest(stats: Option[Stats]): String = stats.fold("N/A")(_._3.toString)

      println("Number of Male:" + headCount(male))
      println("Number of Female:" + headCount(female))
      println("Lowest Male:" + lowest(male))
      println("Lowest Female:" + lowest(female))
      println("HighestMale:" + highest(male))
      println("HighestFemale:" + highest(female))
    } finally {
      // Release the context even when the job fails.
      sc.stop()
    }
  }
}