Hadoop commands

# Basic HDFS shell commands (run where the Hadoop client is installed).
# NOTE: commands are case-sensitive — 'hadoop', not 'Hadoop'.
hadoop version                      # print the installed Hadoop version
hadoop fs -mkdir path               # create a directory in HDFS
hadoop fs -ls path                  # list a directory in HDFS
hadoop fs -put source path          # copy a local file into HDFS
hadoop fs -cat file.txt             # print an HDFS file's contents
hadoop fs -touchz file.txt          # create an empty (zero-length) file in HDFS
hadoop fsck path                    # run a file-system health check
hadoop fs -df path                  # show free/used space for a path
hadoop fs                           # show usage for the fs subcommands
echo "text" > file.txt              # create a LOCAL file (not in HDFS)
hadoop fs -moveFromLocal src dest   # move a local file into HDFS (deletes local copy)
hadoop fs -rm path                  # delete a file in HDFS
hadoop fs -cp src dest              # copy within HDFS
hadoop fs -mv src dest              # move/rename within HDFS
hadoop fs -setrep -w 2 file.txt     # set replication factor to 2 and wait until done
hadoop fs -expunge                  # empty the HDFS trash ('Hadoop-expunge' is not a command)

Word Counter

# Word-count walkthrough: start containers, load input into HDFS, run the job.
# NOTE: '--name' uses two ASCII hyphens, not an em-dash; commands are lowercase.
docker run -d --name namenode hadoop-namenode   # start the NameNode container
docker run -d --name datanode hadoop-datanode   # start the DataNode container
docker exec -it namenode bash                   # open a shell inside the namenode container
cd /root                                        # inside the container
docker cp input.txt namenode:/root              # NOTE(review): run from the HOST shell, not inside the container
hadoop fs -put input.txt /input                 # inside the container: upload input to HDFS
hadoop jar wordcounter.jar /input /output       # run the MapReduce word-count job
hadoop fs -ls /output                           # list job output files
hadoop fs -cat /output/part-00000               # show the word-count results

Max Temperature

from pyspark.sql import Row, SparkSession

# Build (or reuse) the Spark session for this job.
spark = (
    SparkSession.builder
    .appName("Max Temperature with Year")
    .getOrCreate()
)

# Load the CSV; the header row supplies column names, column types are inferred.
# assumes the file has 'year' and 'temperature' columns — TODO confirm schema
temperature_df = spark.read.csv(
    "temperature_data.csv", header=True, inferSchema=True
)

# Row holding the highest temperature; None when the dataset is empty.
hottest = temperature_df.orderBy(temperature_df["temperature"].desc()).first()

if hottest is None:
    print("No temperature data available")
else:
    # Wrap the single winning row back into a DataFrame so it can be displayed.
    result = spark.createDataFrame(
        [Row(year=hottest["year"], max_temperature=hottest["temperature"])]
    )
    result.show()

spark.stop()

Hive server setup

docker exec -it hive-hive-server hive

docker-compose.yml (Hadoop + Hive cluster definition)

version: "3"

services:
  # HDFS NameNode — cluster metadata and the entry point for HDFS clients.
  namenode:
    image: bde2020/hadoop-namenode:2.0.0-hadoop3.2.1-java8
    container_name: namenode
    environment:
      - CLUSTER_NAME=hadoop-cluster
      - CORE_CONF_fs_defaultFS=hdfs://namenode:8020
      - HDFS_CONF_dfs_replication=2
    ports:
      - "9870:9870"   # NameNode web UI
      - "8020:8020"   # HDFS RPC (fs.defaultFS)
    volumes:
      - namenode_data:/hadoop/dfs/name

  # HDFS DataNode — stores the actual file blocks.
  datanode:
    image: bde2020/hadoop-datanode:2.0.0-hadoop3.2.1-java8
    container_name: datanode
    environment:
      - CLUSTER_NAME=hadoop-cluster
      - CORE_CONF_fs_defaultFS=hdfs://namenode:8020
      - HDFS_CONF_dfs_replication=2
    ports:
      - "9864:9864"   # DataNode web UI
    depends_on:
      - namenode
    volumes:
      - datanode_data:/hadoop/dfs/data

  # Backing database for the Hive metastore.
  hive-metastore-postgresql:
    # Use official Postgres to avoid the broken init scripts in bde image
    image: postgres:12
    container_name: hive-metastore-postgresql
    environment:
      POSTGRES_DB: metastore
      POSTGRES_USER: hive
      POSTGRES_PASSWORD: hive
    ports:
      - "5432:5432"
    volumes:
      # Use the named volume declared at the bottom of this file
      # (it was previously declared but unused, shadowed by a
      # ./metastore_data bind mount) — consistent with the other services.
      - hive_postgres_data:/var/lib/postgresql/data
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U hive -d metastore"]
      interval: 5s
      timeout: 5s
      retries: 12

  # Hive metastore service (thrift on 9083).
  hive-metastore:
    image: bde2020/hive:2.3.2-postgresql-metastore
    container_name: hive-metastore
    environment:
      # SERVICE_PRECONDITION makes the entrypoint wait for these host:port
      # pairs before starting (compose v3 depends_on has no health condition).
      - SERVICE_PRECONDITION=namenode:8020 datanode:9864 hive-metastore-postgresql:5432
      - CORE_CONF_fs_defaultFS=hdfs://namenode:8020
      # NOTE(review): bde2020/hive conventionally reads HIVE_SITE_CONF_* for
      # hive-site.xml — confirm this image honors the HIVE_CORE_CONF_* prefix.
      - HIVE_CORE_CONF_fs_defaultFS=hdfs://namenode:8020
      - HIVE_CORE_CONF_javax_jdo_option_ConnectionURL=jdbc:postgresql://hive-metastore-postgresql:5432/metastore
      - HIVE_CORE_CONF_javax_jdo_option_ConnectionDriverName=org.postgresql.Driver
      - HIVE_CORE_CONF_javax_jdo_option_ConnectionUserName=hive
      - HIVE_CORE_CONF_javax_jdo_option_ConnectionPassword=hive
    command: /opt/hive/bin/hive --service metastore
    ports:
      - "9083:9083"   # metastore thrift endpoint
    depends_on:
      - hive-metastore-postgresql
      - namenode

  # HiveServer2 — JDBC/ODBC endpoint for clients (port 10000).
  hive-server:
    image: bde2020/hive:2.3.2-postgresql-metastore
    container_name: hive-server
    environment:
      - SERVICE_PRECONDITION=hive-metastore:9083
      - CORE_CONF_fs_defaultFS=hdfs://namenode:8020
      - HIVE_CORE_CONF_fs_defaultFS=hdfs://namenode:8020
      - HIVE_CORE_CONF_javax_jdo_option_ConnectionURL=jdbc:postgresql://hive-metastore-postgresql:5432/metastore
      - HIVE_CORE_CONF_javax_jdo_option_ConnectionDriverName=org.postgresql.Driver
      - HIVE_CORE_CONF_javax_jdo_option_ConnectionUserName=hive
      - HIVE_CORE_CONF_javax_jdo_option_ConnectionPassword=hive
    ports:
      - "10000:10000"  # HiveServer2 JDBC
    depends_on:
      - hive-metastore
      - namenode

volumes:
  namenode_data:
  datanode_data:
  hive_postgres_data:

Start hadoop

# Start the whole stack (detached), then open a shell in the NameNode container.
docker-compose up -d
docker exec -it namenode bash