当前位置:编程学习 > JAVA >>

mahout算法源码分析之Itembased Collaborative Filtering(三)RowSimilarityJob验证

Mahout版本:0.7,hadoop版本:1.0.4,jdk:1.7.0_25 64bit。
 
 本篇验证上篇的分析是否正确,主要是编写程序读取上篇的各个输出文件,并添加log信息打印相关变量。
 
首先,编写下面的测试文件分析所有的输出:
 
[java] 
package mahout.fansy.item;  
  
import java.io.IOException;  
import java.util.Map;  
  
import mahout.fansy.utils.read.ReadArbiKV;  
  
import org.apache.hadoop.conf.Configuration;  
import org.apache.hadoop.fs.Path;  
import org.apache.hadoop.io.Writable;  
import org.apache.mahout.math.Vector;  
import org.apache.mahout.math.hadoop.similarity.cooccurrence.Vectors;  
  
import junit.framework.TestCase;  
  
public class ReadRowSimilarityJobOut extends TestCase {  
    // 测试 weights 输出:  
    public void testWeights() throws IOException{  
        String path="hdfs://ubuntu:9000/user/mahout/item/temp/weights/part-r-00000";  
        Map<Writable,Writable> map= ReadArbiKV.readFromFile(path);  
        System.out.println("weights=================");  
        System.out.println(map);  
    }  
    //normsPath  
    public void testNormsPath() throws IOException{  
        String path="hdfs://ubuntu:9000/user/mahout/item/temp/norms.bin";  
        Vector map=getVector(path);  
        System.out.println("normsPath=================");  
        System.out.println(map);  
    }  
    //maxValues.bin  
    public void testMaxValues() throws IOException{  
        String path="hdfs://ubuntu:9000/user/mahout/item/temp/maxValues.bin";  
        Vector map=getVector(path);  
        System.out.println("maxValues=================");  
        System.out.println(map);  
    }  
    //numNonZeroEntries.bin  
    public void testNumNonZeroEntries() throws IOException{  
        String path="hdfs://ubuntu:9000/user/mahout/item/temp/numNonZeroEntries.bin";  
        Vector map=getVector(path);  
        System.out.println("numNonZeroEntries=================");  
        System.out.println(map);  
    }  
      
    //pairwiseSimilarityPath  
    public void testPairwiseSimilarityPath() throws IOException{  
        String path="hdfs://ubuntu:9000/user/mahout/item/temp/pairwiseSimilarity/part-r-00000";  
          
        Map<Writable,Writable> map= ReadArbiKV.readFromFile(path);  
        System.out.println("pairwiseSimilarityPath=================");  
        System.out.println(map);  
    }  
      
    //similarityMatrix  
    public void testSimilarityMatrix() throws IOException{  
        String path="hdfs://ubuntu:9000/user/mahout/item/temp/similarityMatrix/part-r-00000";  
        Map<Writable,Writable> map= ReadArbiKV.readFromFile(path);  
        System.out.println("similarityMatrix=================");  
        System.out.println(map);  
    }  
      
    // 读取.bin文件  
    public Vector getVector(String path){  
        Configuration conf=new Configuration();  
        conf.set("mapred.job.tracker", "ubuntu:9001");  
        Vector vector=null;  
        try {  
            vector = Vectors.read(new Path(path), conf);  
        } catch (IOException e) {  
            e.printStackTrace();  
        }  
        return vector;  
    }  
}  
运行上面的文件得到下面的输出:
[plain] 
weights=================  
{1={103:2.5,102:3.0,101:5.0}, 2={101:2.0,104:2.0,103:5.0,102:2.5}, 3={101:2.5,107:5.0,105:4.5,104:4.0}, 4={101:5.0,106:4.0,104:4.5,103:3.0}, 5={106:4.0,105:3.5,104:4.0,103:2.0,102:3.0,101:4.0}}  
normsPath=================  
{107:25.0,106:32.0,105:32.5,104:56.25,103:44.25,102:24.25,101:76.25}  
maxValues=================  
{}  
numNonZeroEntries=================  
{}  
pairwiseSimilarityPath=================  
{102={106:0.14972506706560876,105:0.14328432723886902,104:0.12789210656028413,103:0.1975496259559987}, 103={106:0.1424339656566283,105:0.11208890297777215,104:0.14037600977966974}, 101={107:0.10275248635596666,106:0.1424339656566283,105:0.1158457425543559,104:0.16015261286229274,103:0.15548737703860027,102:0.14201473202245876}, 106={}, 107={}, 104={107:0.13472338607037426,106:0.18181818181818182,105:0.16736577623297264}, 105={107:0.2204812092115424,106:0.14201473202245876}}  
similarityMatrix=================  
{102={101:0.14201473202245876,106:0.14972506706560876,105:0.14328432723886902,104:0.12789210656028413,103:0.1975496259559987}, 103={101:0.15548737703860027,106:0.1424339656566283,105:0.11208890297777215,104:0.14037600977966974,102:0.1975496259559987}, 101={107:0.10275248635596666,106:0.1424339656566283,105:0.1158457425543559,104:0.16015261286229274,103:0.15548737703860027,102:0.14201473202245876}, 106={101:0.1424339656566283,105:0.14201473202245876,104:0.18181818181818182,103:0.1424339656566283,102:0.14972506706560876}, 107={105:0.2204812092115424,104:0.13472338607037426,101:0.10275248635596666}, 104={107:0.13472338607037426,106:0.18181818181818182,105:0.16736577623297264,103:0.14037600977966974,102:0.12789210656028413,101:0.16015261286229274}, 105={107:0.2204812092115424,106:0.14201473202245876,104:0.16736577623297264,103:0.11208890297777215,102:0.14328432723886902,101:0.1158457425543559}}  
其中第一个weights就和分析的一模一样,这里就不再详细写了。那就只分析pairwiseSimilarityPath和similarityMatrix了:
补充:软件开发 , Java ,
CopyRight © 2022 站长资源库 编程知识问答 zzzyk.com All Rights Reserved
部分文章来自网络,