The InversIndex class

The job builds an inverted index in three stages: the mapper splits each line on tabs and emits ("word->fileName", "1"), taking the file name from the FileSplit of the current input split; the combiner sums the counts for each word-and-file pair and re-keys the record as ("word", "fileName->count"); the reducer finally joins all per-file counts for a word into a single output line.
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class InversIndex {

    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration());
        job.setJarByClass(InversIndex.class);

        // Map stage: emit ("word->fileName", "1") for every word in the input.
        job.setMapperClass(InversMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));

        // Combine stage: sum the counts for each word-and-file pair.
        job.setCombinerClass(InversCombiner.class);

        // Reduce stage: join the per-file counts for each word into one line.
        job.setReducerClass(InversReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
    public static class InversMapper extends Mapper<LongWritable, Text, Text, Text> {
        private Text k2 = new Text();
        private Text v2 = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // The input split knows which file this line came from; fetch it
            // once per call rather than once per word.
            FileSplit split = (FileSplit) context.getInputSplit();
            String fileName = split.getPath().getName();

            // Words on a line are separated by tabs.
            String[] words = value.toString().split("\t");
            for (String word : words) {
                k2.set(word + "->" + fileName);
                v2.set("1");
                context.write(k2, v2);
            }
        }
    }
    public static class InversCombiner extends Reducer<Text, Text, Text, Text> {
        private Text k22 = new Text();
        private Text v22 = new Text();

        @Override
        protected void reduce(Text k2, Iterable<Text> v2, Context context)
                throws IOException, InterruptedException {
            // The map key is "word->fileName"; split it back apart.
            String[] parts = k2.toString().split("->");
            String word = parts[0];
            String fileName = parts[1];

            // Total occurrences of this word within this one file.
            long sum = 0;
            for (Text text : v2) {
                sum += Long.parseLong(text.toString());
            }

            // Re-key so the reducer groups by word alone.
            k22.set(word);
            v22.set(fileName + "->" + sum);
            context.write(k22, v22);
        }
    }
    public static class InversReducer extends Reducer<Text, Text, Text, Text> {
        private Text v3 = new Text();

        @Override
        protected void reduce(Text k2, Iterable<Text> v2, Context context)
                throws IOException, InterruptedException {
            // Join all "fileName->count" entries for this word, tab-separated.
            StringBuilder postings = new StringBuilder();
            for (Text text : v2) {
                if (postings.length() > 0) {
                    postings.append('\t');
                }
                postings.append(text.toString());
            }
            v3.set(postings.toString());
            context.write(k2, v3);
        }
    }
}
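If you want to check the logic without a cluster, the following is a minimal plain-Java sketch of the same three-stage flow over in-memory data. The class name, the sample inputs, and the variable names are illustrative only, not part of the original job.

import java.util.*;

public class LocalInvertedIndexSketch {
    public static void main(String[] args) {
        // Hypothetical in-memory stand-in for the input files on HDFS.
        Map<String, List<String>> files = new LinkedHashMap<>();
        files.put("a.txt", Arrays.asList("hello\ttom", "hello\tkitty"));
        files.put("b.txt", Arrays.asList("hello\tcat"));

        // Map + combine: count each "word->file" pair, as InversMapper and
        // InversCombiner do together.
        Map<String, Long> counts = new TreeMap<>();
        for (Map.Entry<String, List<String>> file : files.entrySet()) {
            for (String line : file.getValue()) {
                for (String word : line.split("\t")) {
                    counts.merge(word + "->" + file.getKey(), 1L, Long::sum);
                }
            }
        }

        // Reduce: regroup the per-file counts under each word, as InversReducer does.
        Map<String, StringBuilder> index = new TreeMap<>();
        for (Map.Entry<String, Long> e : counts.entrySet()) {
            String[] parts = e.getKey().split("->"); // parts[0] = word, parts[1] = file
            index.computeIfAbsent(parts[0], w -> new StringBuilder())
                 .append(parts[1]).append("->").append(e.getValue()).append('\t');
        }
        for (Map.Entry<String, StringBuilder> e : index.entrySet()) {
            System.out.println(e.getKey() + "\t" + e.getValue().toString().trim());
        }
    }
}

Running it prints, for example, a "hello" line carrying a.txt->2 and b.txt->1, mirroring the format the job writes to HDFS.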
a.txt
hello	tom
hello	kitty
hello	jerry
hello	tom
b.txt
hello	cat
hello	tom
hello	kitty
c.txt
hello	tom
cat	kitty
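To see how the stages fit together, take c.txt as an example. The map stage tags every word with the file it came from and emits:

    hello->c.txt	1
    tom->c.txt	1
    cat->c.txt	1
    kitty->c.txt	1

The combiner collapses each group into a single record such as (hello, c.txt->1), and the reducer then merges those records with the matching ones from a.txt and b.txt to produce one line per word.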
After uploading the three input files to /data on HDFS (e.g. with hdfs dfs -put), run the job:

hadoop jar /ii.jar com.wxkj.ii.action.InversIndex /data /outdata
[root@hadoop01 tmp]# hdfs dfs -cat /outdata/part-r-00000
cat	c.txt->1	b.txt->1
hello	b.txt->3	c.txt->1	a.txt->4
jerry	a.txt->1
kitty	a.txt->1	b.txt->1	c.txt->1
tom	c.txt->1	b.txt->1	a.txt->2

Each line is a word followed by the files that contain it and the word's count in each file.
Source: https://www.cnblogs.com/TiePiHeTao/p/a5b2849db3d9fec57773f55cfa616d9f.html