本地IDEA跑阿里云服务器Word Count

时间：2020-07-10 16:05:53 阅读：82 评论：0 收藏：0 [点我收藏+]

Maven依赖

<!-- Hadoop client-side dependencies used by the HDFS example below; all four artifacts are pinned to the same version (3.1.3). -->
<dependencies>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>3.1.3</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>3.1.3</version>
        </dependency>
        <!-- Artifact page: https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-mapreduce-client-core -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-mapreduce-client-core</artifactId>
            <version>3.1.3</version>
        </dependency>
        <!-- Artifact page: https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-common -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>3.1.3</version>
        </dependency>
    </dependencies>

代码

import org.apache.hadoop.fs.*;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.*;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;

/**
 * Small collection of static helpers demonstrating basic HDFS client operations
 * (list, mkdir, delete, download, upload, stream read/write) against a remote
 * NameNode.
 *
 * <p>Usage: call {@link #init()} once, invoke any helper, then call
 * {@link #close()}. All helpers operate on the shared static {@link #fs} handle
 * and are not designed for concurrent use.
 */
public class HDFS_API {
    /** Shared file-system handle; assigned by {@link #init()}. */
    public static FileSystem fs;
    /** Client configuration used to create {@link #fs}; assigned by {@link #init()}. */
    public static Configuration conf;

    /**
     * Builds the client configuration and opens the shared {@link FileSystem}.
     *
     * @throws Exception if the file system cannot be created
     */
    public static void init() throws Exception {
        // Set the client identity through a system property ...
        System.setProperty("HADOOP_USER_NAME", "hadoop");
        conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://bigdata:9000");
        // Ask the client to reach DataNodes by hostname rather than by the
        // address reported by the NameNode.
        conf.set("dfs.client.use.datanode.hostname", "true");
        conf.set("fs.hdfs.impl", "org.apache.hadoop.hdfs.DistributedFileSystem");
        fs = FileSystem.get(conf);

        // ... or, alternatively, pass the user name directly:
        // fs = FileSystem.get(new URI("hdfs://bigdata:9000"), conf, "hadoop");
    }

    /**
     * Closes the shared {@link FileSystem} if {@link #init()} has opened one.
     *
     * @throws Exception if closing the file system fails
     */
    public static void close() throws Exception {
        if (fs != null) {
            fs.close();
        }
    }

    /**
     * Prints length, path and permission of every entry directly under {@code path}.
     *
     * @param path HDFS directory (or file) to list
     * @throws Exception if the listing fails
     */
    public static void listFiles(String path) throws Exception {
        // listStatus also has overloads taking a PathFilter to skip unwanted entries.
        FileStatus[] files = fs.listStatus(new Path(path));
        for (FileStatus file : files) {
            // Size in bytes.
            System.out.println(file.getLen());
            // Fully qualified path.
            System.out.println(file.getPath());
            // Permission bits.
            System.out.println(file.getPermission());
            // file.isFile() / file.isDirectory() tell the entry types apart.
        }
    }

    /**
     * Creates the directory {@code path}.
     *
     * @param path HDFS directory to create
     * @throws Exception if the directory cannot be created
     */
    public static void mkDir(String path) throws Exception {
        // An overload of mkdirs additionally accepts the directory permission.
        fs.mkdirs(new Path(path));
    }

    /**
     * Recursively deletes {@code path}.
     *
     * @param path HDFS path to delete
     * @throws Exception if the deletion fails
     */
    public static void deleteDir(String path) throws Exception {
        fs.delete(new Path(path), true);
    }

    /**
     * Downloads an HDFS file to the local file system using streams.
     *
     * <p>The same result can be obtained with
     * {@code fs.copyToLocalFile(new Path(inputPath), new Path(outputPath))}.
     *
     * @param inputPath  HDFS path of the file to download
     * @param outputPath local path to write to
     * @throws Exception if either stream cannot be opened or the copy fails
     */
    public static void getFileToLocal(String inputPath, String outputPath) throws Exception {
        // try-with-resources guarantees the HDFS stream is released even when
        // opening the local file fails.
        try (FSDataInputStream inputStream = fs.open(new Path(inputPath));
             FileOutputStream outputStream = new FileOutputStream(new File(outputPath))) {
            // close=false: the try-with-resources statement owns the streams.
            IOUtils.copyBytes(inputStream, outputStream, conf, false);
        }
    }

    /**
     * Uploads a local file to HDFS using streams.
     *
     * <p>The same result can be obtained with
     * {@code fs.copyFromLocalFile(new Path(inputPath), new Path(outputPath))}.
     *
     * @param inputPath  local path of the file to upload
     * @param outputPath HDFS path to create
     * @throws Exception if either stream cannot be opened or the copy fails
     */
    public static void putFile(String inputPath, String outputPath) throws Exception {
        try (FileInputStream inputStream = new FileInputStream(new File(inputPath));
             FSDataOutputStream outputStream = fs.create(new Path(outputPath))) {
            // close=false: the try-with-resources statement owns the streams.
            IOUtils.copyBytes(inputStream, outputStream, conf, false);
        }
    }

    /**
     * Demonstrates raw stream I/O on HDFS: reads one chunk (at most 1024 bytes)
     * from {@code inputPath}, prints it, and writes exactly those bytes to
     * {@code outputPath}.
     *
     * <p>Only the bytes actually returned by the read are printed and written,
     * so short or empty files are not padded with NUL bytes.
     *
     * @param inputPath  HDFS path to read from
     * @param outputPath HDFS path to create and write to
     * @throws Exception if either stream cannot be opened or the I/O fails
     */
    public static void read_write(String inputPath, String outputPath) throws Exception {
        try (FSDataInputStream inputStream = fs.open(new Path(inputPath));
             FSDataOutputStream outputStream = fs.create(new Path(outputPath))) {
            // The streams work on bytes only.
            byte[] buf = new byte[1024];
            // read() returns the number of bytes filled, or -1 at end of stream.
            int bytesRead = inputStream.read(buf);
            int length = Math.max(bytesRead, 0);
            // buf.toString() would only print the array's type and hash, so decode
            // explicitly. UTF-8 is assumed here instead of the platform default
            // charset so the output does not depend on the client OS.
            System.out.println(
                    new String(buf, 0, length, java.nio.charset.StandardCharsets.UTF_8));
            outputStream.write(buf, 0, length);
        }
    }

    /**
     * Example run: creates {@code input/} and copies the first chunk of
     * {@code test.txt} into {@code input/file.txt}.
     *
     * @param args unused
     * @throws Exception if any HDFS operation fails
     */
    public static void main(String[] args) throws Exception {
        init();
        try {
            mkDir("input");
            read_write("test.txt", "input/file.txt");
        } finally {
            // Release the file system even when one of the operations fails.
            close();
        }
    }
}

遇到的问题

运行后发现出现以下错误。查找资料，了解到需要在本地Windows也下载hadoop和winutils。下面是参考的4篇博客。有篇博客说如果对远程的hadoop进行上传,创建目录,文件改名(写)等操作，那么是不需要在windows本地配置hadoop的,可一旦涉及到下载(读),hadoop内部的缓存机制要求本地也必须有hadoop。

这里我比较疑惑，我前面对HDFS进行操作，读写文件，也没有报这个错。下面的文章是表示Hadoop需要Windows的部分API来实现类似posix 的访问权限。 https://cwiki.apache.org/confluence/display/HADOOP2/WindowsProblems

我只能理解为运行MapReduce的时候，通过Job操作文件系统需要以上所说的访问权限。

Exception in thread "main" java.lang.RuntimeException: java.io.FileNotFoundException: java.io.FileNotFoundException: HADOOP_HOME and hadoop.home.dir are unset. -see https://wiki.apache.org/hadoop/WindowsProblems
	at org.apache.hadoop.util.Shell.getWinUtilsPath(Shell.java:737)
	at org.apache.hadoop.util.Shell.getSetPermissionCommand(Shell.java:272)
	at org.apache.hadoop.util.Shell.getSetPermissionCommand(Shell.java:288)
	at org.apache.hadoop.fs.RawLocalFileSystem.setPermission(RawLocalFileSystem.java:840)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkOneDirWithMode(RawLocalFileSystem.java:522)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkdirsWithOptionalPermission(RawLocalFileSystem.java:562)
	······

解决方法

第一步：下载winutils

下载网址
然后解压到相应目录。根据你自己的hadoop版本，选最接近的。我用的是3.1.3，选了hadoop3.0.0的包，发现它的bin目录比普通的hadoop/bin目录多了一个winutils.exe。

第二步：设置环境变量

需要设置环境变量HADOOP_HOME，就是你解压hadoop目录的地址，再设置环境变量PATH，加上 %HADOOP_HOME%\bin

第三步：拷贝hadoop.dll

将bin下的hadoop.dll拷贝到C:\Windows\System32。如果没有这步，还是会报下面的错误：

org.apache.hadoop.io.nativeio.NativeIO$Windows.access0

然后我就跑通了。
可以参考简书的这篇，但我没有替换这步骤。

总结

以前都是在本地跑，没想到换成本地访问阿里云，HDFS，HBase,MapReduce，都有自己的小问题。庆幸的是目前的学习资料很多，网上每个问题都有很多的解决方法。后面要继续研究Spark了，希望能写出关于Spark的小论文吧。

在查资料的时候，发现了这篇。我觉得下面有个评论说得很好，命途多舛。我觉得他遇到了所有可能遇上的问题，也都解决了。
https://blog.csdn.net/congcong68/article/details/42043093/

参考
https://blog.csdn.net/darkdragonking/article/details/72636917
https://www.cnblogs.com/tele-share/p/9531595.html
https://blog.csdn.net/u013305864/article/details/97191344
https://www.jianshu.com/p/a65a95108620

本地IDEA跑阿里云服务器Word Count

原文：https://www.cnblogs.com/chenshaowei/p/13279206.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年09月23日 (328)
2021年09月24日 (313)
2021年09月17日 (191)
2021年09月15日 (369)
2021年09月16日 (411)
2021年09月13日 (439)
2021年09月11日 (398)
2021年09月12日 (393)
2021年09月10日 (160)
2021年09月08日 (222)