hadoop 中reducer非常慢 算法很简单 求帮忙改进
有一个大数据文本 约 18G左右格式类似是:
12 Followers[13,15,58,62,]
12 Following[15,21,28,58,59,]
13 Followers[28,91,34098,]
15 Following[59,17,572,]
59 Followers[489,2,2398,]
59 Folloing[598,398,194,]
最后要找出比如对于用户12来说 followers 与following中重复的用户
输出为
12 Friendship[15,58]
13 Friendship[]
15 Friendship[]
59 Friendship[]
我的程序在hadoop上对于小文本可以跑出来 但是用我的大文本的话reducer就很慢 希望大牛可以帮忙改进下
代码如下:
package org.myorg;
import java.io.*;
import java.util.*;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.*;
public class friendship{
public static class Map extends MapReduceBase implements Mapper<LongWritable, Text, Text, Text> {
//private final static IntWritable one = new IntWritable(1);
private Text ID = new Text();
private Text information = new Text();
public void map(LongWritable key, Text value, OutputCollector<Text, Text> output, Reporter reporter) throws IOException {
String line = value.toString();
StringTokenizer tokenizer = new StringTokenizer(line);
if (tokenizer.hasMoreTokens()) {
ID.set(tokenizer.nextToken());
}
if (tokenizer.hasMoreTokens()) {
information.set(tokenizer.nextToken());
}
output.collect(ID, information);
}
}
public static class Reduce extends MapReduceBase implements Reducer<Text, Text, Text, Text> {
public void reduce(Text key, Iterator<Text> values, OutputCollector<Text, Text> output, Reporter reporter) throws IOException {
Text orginal= new Text();
StringBuilder friendship= new StringBuilder("Friendship[");
while (values.hasNext()){
Text temp =values.next();
orginal.append(temp.getBytes(),0,temp.getLength());
orginal.append("%".getBytes(),0,1);
}
if (orginal.find("F",1) != -1) {
String[] strs,followerList,followingList;
String follower,following;
strs = orginal.toString().split("%");
follower = strs[0].substring(strs[0].indexOf("[") + 1, strs[0].lastIndexOf("]"));
following = strs[1].substring(strs[1].indexOf("[") + 1, strs[1].lastIndexOf("]"));
followerList = follower.split(",");
followingList = following.split(",");
for (int j = 0; j < followerList.length; j++) {
if ("" == followerList[j]) continue;
for (int p = 0; p < followingList.length; p++) {
if ("" == followingList[p]) continue;
if ((followerList[j].equals(followingList[p]))==true) {
friendship.append(followerList[j]).append(",");
}
}
}
}
friendship.append("]");
Text List=new Text(friendship.toString());
output.collect(key, List);
}
}
public static void main(String[] args) throws Exception {
JobConf conf = new JobConf(friendship.class);
conf.setJobName("Friendship");
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(Text.class);
conf.setMapperClass(Map.class);
//conf.setCombinerClass(Reduce.class);
conf.setReducerClass(Reduce.class);
conf.setInputFormat(TextInputFormat.class);
conf.setOutputFormat(TextOutputFormat.class);
FileInputFormat.setInputPaths(conf, new Path(args[0]));
FileOutputFormat.setOutputPath(conf, new Path(args[1]));
JobClient.runJob(conf);
}
}
--------------------编程问答-------------------- 帮帮忙啊! --------------------编程问答-------------------- 准备学习Hadoop --------------------编程问答-------------------- StringTokenizer tokenizer = new StringTokenizer(line);
if (tokenizer.hasMoreTokens()) {
ID.set(tokenizer.nextToken());
}
if (tokenizer.hasMoreTokens()) {
information.set(tokenizer.nextToken());
}
orginal.append(temp.getBytes(),0,temp.getLength());
orginal.append("%".getBytes(),0,1);
都写的严重有问题,目测你的代码质量 还是转行吧. --------------------编程问答-------------------- if (tokenizer.hasMoreTokens()) {
ID.set(tokenizer.nextToken());
information.set(tokenizer.nextToken());
}
上面的可以合起来写
StringBuilder 的append 是这样用的 append("a").append("b");
你还是 老老实实 的先写几点普通的java程序吧... --------------------编程问答-------------------- Text temp =values.next();
temp用全局变脸 可以减少变量个数
follower = strs[0].substring(strs[0].indexOf("[") + 1, strs[0].lastIndexOf("]"));
写成String s1=strs[0];
用s1进行计算,减少计算个数
followerList = follower.split(",");
followingList = following.split(",");
上面代码为何执行多次
补充:Java , Java相关