HBase data

hbase

1. Set up the HBase client

Before running this cell, you need to create a .env file in the same directory as this notebook, and set the following environment variables in the .env file:

HBASE_FETCH_API =
HBASE_SEND_API =
HBASE_TOKEN =
[1]:
import logging
import os

import geopandas as gpd
import nest_asyncio
from dotenv import load_dotenv

from h3_toolkit.core import H3Toolkit

logging.basicConfig(level=logging.INFO)

# Load the variables defined in the .env file into the process environment.
load_dotenv()
nest_asyncio.apply()   # needed for jupyter notebook to run asyncio

FETCH_API = os.getenv("HBASE_FETCH_API")
SEND_API = os.getenv("HBASE_SEND_API")
TOKEN = os.getenv("HBASE_TOKEN")

# Fail fast with an actionable message: os.getenv returns None for unset
# variables, which would otherwise be passed silently to the HBase client.
_missing_env = [
    name
    for name, value in (
        ("HBASE_FETCH_API", FETCH_API),
        ("HBASE_SEND_API", SEND_API),
        ("HBASE_TOKEN", TOKEN),
    )
    if not value
]
if _missing_env:
    raise RuntimeError(
        "Missing environment variable(s): "
        + ", ".join(_missing_env)
        + ". Set them in the .env file before running this notebook."
    )
[2]:
from h3_toolkit.hbase import HBaseClient

toolkit = H3Toolkit()

# Build the client separately so its settings are easy to scan,
# then attach it to the toolkit.
hbase_client = HBaseClient(
    fetch_url=FETCH_API,
    send_url=SEND_API,
    token=TOKEN,
    max_concurrent_requests=5,  # don't change this
    chunk_size=1000,            # don't change this
)
toolkit.set_hbase_client(hbase_client)
[2]:
<h3_toolkit.core.H3Toolkit at 0x1063b54b0>
[3]:
# Display the attached client to confirm the HBase settings were applied
toolkit.hbase_client
[3]:
HBaseClient(
 fetch_url = http://10.100.1.64:2891/api/hbase/v1/test/filterdata2,
 send_url = http://10.100.1.64:2891/api/hbase/v1/test/putdata,
 token = eyJhb*********************************************************************************************X-H5I,
 max_concurrent_requests = 5,
 chunk_size = 1000
)

2. Select boundary of the data

[4]:
# Read the test geometries and merge them into a single geometry.
gdf = gpd.read_file('data/test_geom.geojson')
merged_geometry = gdf['geometry'].union_all()

# Wrap the merged geometry in a one-row GeoDataFrame that keeps the source CRS,
# then draw only its outline.
new_gdf = gpd.GeoDataFrame(
    geometry=[merged_geometry],
    crs=gdf.crs,
)
new_gdf.plot(facecolor='none', edgecolor='red')
[4]:
<Axes: >
../_images/usage_03_hbase_data_6_1.png
[5]:
# Convert the boundary GeoDataFrame into H3 cells at resolution 12.
result = toolkit.process_from_vector(
    data=new_gdf,
    resolution=12,
    geometry_col='geometry',
)

result.get_result().head()
INFO:h3_toolkit.core:2025-06-18 09:51:06 - `process_from_vector` - Start converting data to h3 cells in resolution 12
INFO:h3_toolkit.core:2025-06-18 09:51:06 - `process_from_vector` - Finish converting data to h3 cells in resolution 12 with shape (42080, 1)
[5]:
shape: (5, 1)
hex_id
str
"8c4ba0a406403ff"
"8c4ba0a406411ff"
"8c4ba0a406413ff"
"8c4ba0a406415ff"
"8c4ba0a406417ff"
[6]:
import polars as pl


def str_columns_to_float(df):
    """Cast every string column except ``hex_id`` to Float64, rounded to 3 decimals."""
    text_columns = pl.col(pl.Utf8).exclude('hex_id')
    return df.with_columns([text_columns.cast(pl.Float64).round(3)])


# Fetch the three demographic qualifiers, then convert the string values
# to rounded floats.
result = toolkit.fetch_from_hbase(
    table_name='res12_pre_data',
    column_family='demographic',
    column_qualifier=['p_cnt', 'f_cnt', 'm_cnt'],
).apply(str_columns_to_float)

result.get_result(return_geometry=True).head()
INFO:h3_toolkit.hbase:2025-06-18 09:51:06 - `fetch_from_hbase` - Start fetching data from HBase
Fetching data from Hbase ... : 100%|██████████| 43/43 [00:00<00:00, 7413.78chunk/s]
INFO:h3_toolkit.hbase:2025-06-18 09:51:08 - `fetch_from_hbase` - Finish fetching data from HBase
[6]:
hex_id f_cnt m_cnt p_cnt geometry
0 8c4ba0a406403ff 0.567 0.567 1.133 POLYGON ((121.52403 25.03519, 121.52401 25.035...
1 8c4ba0a406411ff 0.567 0.567 1.133 POLYGON ((121.52431 25.03532, 121.5243 25.0352...
2 8c4ba0a406413ff 0.567 0.567 1.133 POLYGON ((121.52449 25.03531, 121.52447 25.035...
3 8c4ba0a406415ff 0.567 0.567 1.133 POLYGON ((121.5242 25.03518, 121.52419 25.0350...
4 8c4ba0a406417ff 0.567 0.567 1.133 POLYGON ((121.52438 25.03518, 121.52436 25.035...

3. Chain all steps together

[7]:
from h3_toolkit import H3Toolkit
from h3_toolkit.hbase import HBaseClient

toolkit = H3Toolkit()

# One fluent pipeline: vector -> H3 cells -> HBase fetch -> numeric cleanup.
# Relies on `new_gdf`, `FETCH_API`, `SEND_API`, `TOKEN` and `pl` (polars)
# defined in earlier cells.
result = (
    toolkit
    .process_from_vector(
        data = new_gdf,
        resolution = 12,
        geometry_col = 'geometry'
    )
    .set_hbase_client(
        HBaseClient(
            fetch_url=FETCH_API,
            send_url=SEND_API,
            token = TOKEN,
            max_concurrent_requests=5, # don't change this
            # Kept identical to the client configured in step 1; the recorded
            # progress bar (43 chunks for 42,080 rows) matches this value.
            chunk_size=1000,           # don't change this
        )
    )
    .fetch_from_hbase(
        table_name = 'res12_pre_data',
        column_family = 'demographic',
        column_qualifier = ['p_cnt'],
    )
    # Convert str to float and round to 3 decimal places
    .apply(lambda df: df.with_columns([pl.col(pl.Utf8).exclude('hex_id').cast(pl.Float64).round(3)]))
    .get_result(return_geometry=True)
)

result.head()
INFO:h3_toolkit.core:2025-06-18 09:51:08 - `process_from_vector` - Start converting data to h3 cells in resolution 12
INFO:h3_toolkit.core:2025-06-18 09:51:08 - `process_from_vector` - Finish converting data to h3 cells in resolution 12 with shape (42080, 1)
INFO:h3_toolkit.hbase:2025-06-18 09:51:08 - `fetch_from_hbase` - Start fetching data from HBase
Fetching data from Hbase ... : 100%|██████████| 43/43 [00:00<00:00, 9300.01chunk/s]
INFO:h3_toolkit.hbase:2025-06-18 09:51:09 - `fetch_from_hbase` - Finish fetching data from HBase
[7]:
hex_id p_cnt geometry
0 8c4ba0a406403ff 1.133 POLYGON ((121.52403 25.03519, 121.52401 25.035...
1 8c4ba0a406411ff 1.133 POLYGON ((121.52431 25.03532, 121.5243 25.0352...
2 8c4ba0a406413ff 1.133 POLYGON ((121.52449 25.03531, 121.52447 25.035...
3 8c4ba0a406415ff 1.133 POLYGON ((121.5242 25.03518, 121.52419 25.0350...
4 8c4ba0a406417ff 1.133 POLYGON ((121.52438 25.03518, 121.52436 25.035...

4. Fetch data from HBase, aggregate it, and send the result back to HBase

[8]:
from h3_toolkit import H3Toolkit
from h3_toolkit.aggregation import Mean, SumUp
from h3_toolkit.hbase import HBaseClient

toolkit = H3Toolkit()

# Pipeline: boundary -> resolution-12 cells -> fetch from HBase -> aggregate to
# resolution 10 -> round -> write back to HBase -> return the resulting frame.
result = (
    toolkit
    .process_from_vector(data=new_gdf, resolution=12, geometry_col='geometry')
    .set_hbase_client(
        HBaseClient(fetch_url=FETCH_API, send_url=SEND_API, token=TOKEN)
    )
    .fetch_from_hbase(
        table_name='res12_pre_data',
        column_family='demographic',
        column_qualifier=['p_cnt', 'f_cnt', 'm_cnt'],
    )
    # Per-column aggregation strategy used when changing resolution.
    .set_aggregation_strategy({
        ('f_cnt', 'p_cnt'): SumUp(),
        'm_cnt': Mean(),
    })
    .process_from_h3(target_resolution=10)
    .apply(lambda df: df.with_columns([pl.col(pl.Float64).round(3)]))
    # NOTE(review): only `p_cnt` is listed here although three columns were
    # aggregated -- presumably intentional for this demo; confirm.
    .send_to_hbase(
        table_name='res10_test_data',
        column_family='demographic',
        column_qualifier=['p_cnt'],
    )
    .get_result()
)

result.head()
INFO:h3_toolkit.core:2025-06-18 09:51:09 - `process_from_vector` - Start converting data to h3 cells in resolution 12
INFO:h3_toolkit.core:2025-06-18 09:51:09 - `process_from_vector` - Finish converting data to h3 cells in resolution 12 with shape (42080, 1)
INFO:h3_toolkit.hbase:2025-06-18 09:51:09 - `fetch_from_hbase` - Start fetching data from HBase
Fetching data from Hbase ... : 100%|██████████| 43/43 [00:00<00:00, 9797.11chunk/s]
INFO:h3_toolkit.hbase:2025-06-18 09:51:10 - `fetch_from_hbase` - Finish fetching data from HBase
INFO:h3_toolkit.core:2025-06-18 09:51:10 - `process_from_h3` - Start converting data to h3 cells in resolution 10
INFO:h3_toolkit.core:2025-06-18 09:51:10 - `process_from_h3` - Finish converting data to h3 cells in resolution 10 with shape (937, 4)
INFO:h3_toolkit.hbase:2025-06-18 09:51:10 - `send_to_hbase` - Start sending data from HBase
Sending data to Hbase ... : 100%|██████████| 1/1 [00:00<00:00, 587.85chunk/s]
INFO:h3_toolkit.hbase:2025-06-18 09:51:10 - `send_to_hbase` - Finish sending data from HBase
[8]:
shape: (5, 4)
hex_id f_cnt p_cnt m_cnt
str f64 f64 f64
"8a4ba0a4330ffff" 464.757 898.606 8.854
"8a4ba0a412d7fff" 271.692 484.174 4.336
"8a4ba0a4ecf7fff" 283.727 542.568 5.282
"8a4ba0a4e0e7fff" 329.451 626.595 6.064
"8a4ba0a4155ffff" 0.0 0.0 0.0

5. Visualization

Visualize the aggregated H3 data from step 4 (the `result` frame) using pydeck with automatic map boundary calculation.

[ ]:
from h3_toolkit.visualization import show_h3

# Plot the `p_cnt` column of `result`, classified with the
# Quantiles classifier (k=5) and coloured with the viridis colormap.
show_h3(result, 'p_cnt', classifier='Quantiles', k=5, cmap='viridis')