Pivotal Knowledge Base

Follow

Configure R to use with Hive

Environment

  • PHD 2.X
  • CENTOS 6.X

Question:

How to set up R to use with Hive?

Solution:

Once you have set up a Hadoop cluster that includes the Hive server and Hive metastore, follow the steps below on the Hive client:

Step 1: Get the yum repository to install R

[root@hdm1 ~]# rpm -Uvh http://download.fedoraproject.org/pub/epel/6/i386/epel-release-6-8.noarch.rpm
Retrieving http://download.fedoraproject.org/pub/epel/6/i386/epel-release-6-8.noarch.rpm
warning: /var/tmp/rpm-tmp.BONZww: Header V3 RSA/SHA256 Signature, key ID 0608b895: NOKEY
Preparing... ########################################### [100%]
1:epel-release ########################################### [100%]

Step 2: Update the yum repository and install R

[root@hdm1 ~]# yum clean all

If you face below error:
Error: Cannot retrieve metalink for repository: epel. Please verify its path and try again
Change the mirrorlist from epel.repo from https to http
Ex: mirrorlist=http://mirrors.fedoraproject.org/metalink?repo=epel-6&arch=$basearch

[root@hdm1 ~]# yum install R

Step 3: Once the "R" package is installed, make sure the Java path is set correctly; if it is not, run the following so R picks up the right path

As root user: R CMD javareconf
As non-root user: R CMD javareconf -e

Step 4: Install rJava and rJDBC

[root@hdm1 ~]# R
> install.packages("rJava")
> install.packages("RJDBC",dep=TRUE)
> q()
Save workspace image? [y/n/c]: y

Step 5: Start the Hive server

[root@hdm1 ~]# $HIVE_HOME/bin/hive --service hiveserver

Step 6: Use R to use Hive

> library("DBI")
> library("rJava")
> library("RJDBC")
> hive.class.path = list.files(path=c("/usr/lib/gphd/hive/lib"), pattern="jar", full.names=T);
> hadoop.lib.path = list.files(path=c("/usr/lib/gphd/hadoop/lib"), pattern="jar", full.names=T);
> hadoop.class.path = list.files(path=c("/usr/lib/gphd/hadoop"), pattern="jar", full.names=T);
> class.path = c(hive.class.path, hadoop.lib.path, hadoop.class.path);
> drv <- JDBC("org.apache.hadoop.hive.jdbc.HiveDriver", classPath=class.path);
> options(java.parameters = "-Xmx8g");
> hive.master = "hdm1.phd.local:10000";
> url.dbc = paste0("jdbc:hive://", hive.master, "/default");
> conn = dbConnect(drv, url.dbc, "gpadmin", "changeme");
log4j:WARN No appenders could be found for logger (org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe).
log4j:WARN Please initialize the log4j system properly.
log4j:WARN See http://logging.apache.org/log4j/1.2/faq.html#noconfig for more info.
> dbListTables(conn);
[1] "hbase_table_3" "walmart_test"  "page_view"     "weblogs"
> q() 

Step 7: Use R to use Hive when kerberos is enabled

> library("DBI")
> library("rJava")
> library("RJDBC")
> hive.class.path = list.files(path=c("/usr/lib/gphd/hive/lib"), pattern="jar", full.names=T);
> hadoop.lib.path = list.files(path=c("/usr/lib/gphd/hadoop/lib"), pattern="jar", full.names=T);
> hadoop.class.path = list.files(path=c("/usr/lib/gphd/hadoop"), pattern="jar", full.names=T);
> mapred.class.path = list.files(path=c("/usr/lib/gphd/hadoop-mapreduce"), pattern="jar", full.names=T);
> cp = c(hive.class.path, hadoop.lib.path, hadoop.class.path, mapred.class.path, "/usr/lib/gphd/hadoop-mapreduce/hadoop-mapreduce-client-core.jar")
> .jinit(classpath=cp)
> drv <- JDBC("org.apache.hive.jdbc.HiveDriver", classPath=cp);
> url.dbc = paste0("jdbc:hive2://hdm1.phd.local:10002/default;principal=hive/hdm1.phd.local@PHD.LOCAL");
> conn = dbConnect(drv, url.dbc, "hive", "hive");
log4j:WARN No appenders could be found for logger (org.apache.hadoop.util.Shell).
log4j:WARN Please initialize the log4j system properly.
log4j:WARN See http://logging.apache.org/log4j/1.2/faq.html#noconfig for more info.
> dbListTables(conn);
[1] "abctest"
> q()

Comments

Powered by Zendesk