# Python Client

MyScale is compatible with ClickHouse, so you can use the official Clickhouse client (opens new window) to access MyScale from your python applications.

To install the required dependencies, use the following command:

pip install -U clickhouse-connect

# Creating Connection

To learn how to establish a connection to the cluster, please refer to the Connection Details section.

# Creating Table

Use the Python SDK to create a Table with id, data, date, and label columns. Note that the SQL statements for creating tables differ for different vector types. Currently, MyScale supports float and binary vector types:

# Float Vectors

# Create a table with a 128-dimensional float vector.
client.command("""
CREATE TABLE default.myscale_categorical_vector_search
(
    id    UInt32,
    data  Array(Float32),
    CONSTRAINT check_length CHECK length(data) = 128,
    date  Date,
    label Enum8('person' = 1, 'building' = 2, 'animal' = 3)
)
ORDER BY id""")
# Get and print the names of all tables in the current database.
res = client.query("SHOW TABLES").named_results()
print([r['name'] for r in res])

Example output:

['myscale_categorical_vector_search']

# Binary Vectors

If your vector data is binary, you can create a table named myscale_categorical_binary_search with id, data, date, and label columns as follows: Note that the binary data length imported into the Table should be consistent. Assuming we want to import binary data of length 128, we need to set the FixedString length to 128/8=16.

# Create a Table for storing binary data
client.command("""
CREATE TABLE default.myscale_categorical_binary_search
(
    id    UInt32,
    data  FixedString(16),
    date  Date,
    label Enum8('person' = 1, 'building' = 2, 'animal' = 3)
)
ORDER BY id""")
# Get and print the names of all tables in the current database.
res = client.query("SHOW TABLES").named_results()
print([r['name'] for r in res])

Example output:

['myscale_categorical_binary_search']

# Importing Data

# Float Vectors

Importing float vectors is relatively straightforward. Assuming we have a Pandas DataFrame with the following values:

import pandas as pd
# create the data dictionary
data = {
    'id': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
    'data': [
        [0,0,0,1,8,7,3,2,5,0,0,3,5,7,11,31,13,0,0,0,0,29,106,107,13,0,0,0,1,61,70,42,0,0,0,0,1,23,28,16,63,4,0,0,0,6,83,81,117,86,25,15,17,50,84,117,31,23,18,35,97,117,49,24,68,27,0,0,0,4,29,71,81,47,13,10,32,87,117,117,45,76,40,22,60,70,41,9,7,21,29,39,53,21,4,1,55,72,3,0,0,0,0,9,65,117,73,37,28,23,17,34,11,11,27,61,64,25,4,0,42,13,1,1,1,14,10,6],
        [65,35,8,0,0,0,1,63,48,27,31,19,16,34,96,114,3,1,8,21,27,43,57,21,11,8,37,8,0,0,1,23,101,104,11,0,0,0,0,29,83,114,114,77,23,14,18,52,28,8,46,75,39,24,59,60,2,0,18,10,20,52,52,16,12,28,4,0,0,3,5,8,102,79,58,3,0,0,0,11,114,112,78,50,17,14,45,104,19,31,53,114,73,44,34,26,3,2,0,0,0,1,8,9,34,20,0,0,0,0,1,23,30,75,87,36,0,0,0,2,0,17,66,73,3,0,0,0],
        [0,0,0,0,0,0,4,1,15,0,0,0,0,0,10,49,27,0,0,0,0,29,113,114,9,0,0,0,3,69,71,42,14,0,0,0,0,1,56,79,63,2,0,0,0,38,118,77,118,60,8,8,18,48,59,104,27,16,7,13,80,118,34,21,118,47,4,0,0,1,32,99,61,40,31,57,46,118,118,61,80,64,16,21,20,33,23,27,6,22,16,14,51,33,0,0,76,40,8,0,2,14,42,94,19,42,57,67,23,34,22,10,9,52,15,21,5,1,3,3,1,38,12,5,18,1,0,0],
        [3,9,45,22,28,11,4,3,77,10,4,1,1,4,3,11,23,0,0,0,26,49,6,7,5,3,3,1,11,50,8,9,11,7,15,21,12,17,21,25,121,12,4,7,4,7,4,41,28,2,0,1,10,42,22,20,1,1,4,9,31,79,16,3,23,4,6,26,31,121,87,40,121,82,16,12,15,41,6,10,76,48,5,3,21,42,41,50,5,17,18,64,86,54,17,6,43,62,56,84,116,108,38,26,58,63,20,87,105,37,2,2,121,121,38,25,44,33,24,46,3,16,27,74,121,55,9,4],
        [6,4,3,7,80,122,62,19,2,0,0,0,32,60,10,19,4,0,0,0,0,10,69,66,0,0,0,0,8,58,49,5,5,31,59,67,122,37,1,2,50,1,0,16,99,48,3,27,122,38,6,7,11,31,87,122,9,8,6,23,122,122,69,21,0,11,31,55,28,0,0,0,61,4,0,37,43,2,0,15,122,122,55,32,6,1,0,12,5,22,52,122,122,9,2,0,2,0,0,5,28,20,2,2,19,3,0,2,12,12,3,16,25,18,34,35,5,4,1,13,21,2,22,51,9,20,57,59],
        [6,2,19,22,22,81,31,12,72,15,12,10,3,6,1,37,30,17,4,2,9,4,2,21,1,0,1,3,11,9,5,2,7,11,17,61,127,127,28,13,49,36,26,45,28,17,4,16,111,46,11,2,7,25,40,89,2,0,8,31,63,60,28,12,0,18,82,127,50,1,0,0,94,28,11,88,15,0,0,4,127,127,34,23,25,18,18,69,6,16,26,90,127,42,12,8,0,3,46,29,0,0,0,0,22,35,15,12,0,0,0,0,46,127,83,17,1,0,0,0,0,14,67,115,45,0,0,0],
        [19,35,5,6,40,23,18,4,21,109,120,23,5,12,24,5,0,5,87,108,47,14,32,8,0,0,0,27,36,30,43,0,29,12,10,15,6,7,17,12,34,9,14,65,20,23,28,14,120,34,14,14,9,34,120,120,7,6,7,27,56,120,120,23,9,5,4,7,2,6,46,13,29,5,5,32,12,20,99,19,120,120,107,38,13,7,24,36,6,24,120,120,55,26,4,3,5,1,0,0,1,5,19,18,2,2,0,1,18,12,30,7,0,5,33,29,66,50,26,2,0,0,49,45,12,28,10,0],
        [28,28,28,27,13,5,4,12,4,8,29,118,69,19,21,7,3,0,0,14,14,10,105,60,0,0,0,0,11,69,76,9,5,2,18,59,17,6,1,5,42,9,16,75,31,21,17,13,118,44,18,16,17,30,78,118,4,4,8,61,118,110,54,25,10,6,21,54,5,5,6,5,38,17,11,31,6,24,64,15,115,118,117,61,13,13,22,25,2,11,66,118,87,25,10,2,10,11,3,2,9,28,4,5,21,18,35,17,6,10,4,30,20,2,13,13,7,30,71,118,0,0,3,12,50,103,44,5],
        [41,38,21,17,42,71,60,50,11,1,2,11,109,115,8,4,27,8,5,22,11,9,8,14,20,10,4,33,12,7,4,1,18,115,95,42,17,1,0,0,19,6,46,115,91,16,0,7,66,7,4,15,12,32,91,109,12,3,1,8,21,115,96,17,1,51,78,14,0,0,0,0,50,40,62,53,0,0,0,3,115,115,40,12,6,13,25,65,7,30,51,65,110,92,25,9,0,1,13,0,0,0,0,0,4,22,11,1,0,0,0,0,13,115,48,1,0,0,0,0,0,36,102,63,11,0,0,0],
        [0,0,0,0,0,2,6,4,0,0,0,0,0,1,44,57,0,0,0,0,0,15,125,52,0,0,0,0,6,57,44,2,23,1,0,0,0,6,20,23,125,30,5,2,1,3,73,125,16,10,11,46,61,97,125,93,0,0,0,31,111,96,21,0,20,6,0,0,9,114,63,5,125,125,83,8,2,26,5,23,14,56,125,125,37,10,7,10,11,2,17,87,42,5,8,19,0,0,7,32,56,91,8,0,1,17,17,3,14,71,15,5,7,9,35,10,2,5,24,39,14,16,4,9,22,6,13,11]
    ],
    'date': ["2030-09-26", "1996-06-22", "1975-10-07", "2024-08-11", "1970-01-31", "2025-04-02", "2007-06-29", "1970-09-10", "2007-10-26", "1971-02-02"],
    'label': ["person", "building", "animal", "animal", "animal", "building", "animal", "building", "person", "building"]
}
# create the dataframe
df = pd.DataFrame(data)

We can insert data using client.insert:

# Query to count the number of rows in the 'default.myscale_categorical_vector_search' table.
db_count_sql="SELECT count(*) FROM default.myscale_categorical_vector_search"
# Get and print the count of rows in the 'default.myscale_categorical_vector_search' table before any inserts.
print(f"before insert, db_count is {client.command(db_count_sql)}")
# Insert data into the 'myscale_categorical_vector_search' table.
df_records = df.to_records(index=False)
df_records['date'] = pd.to_datetime(df_records['date'])
client.insert("default.myscale_categorical_vector_search", df_records.tolist(),
              column_names=df.columns.tolist())
# Get and print the count of rows in the 'default.myscale_categorical_vector_search' table after the insert.
print(f"after insert, db_count is {client.command(db_count_sql)}")

Example output:

before insert, db_count is 0
after insert, db_count is 10

# Binary Vectors

Importing binary vectors requires packing the data into byte values. Assuming we have a Pandas DataFrame with the following values, and we want to import 10 binary data entries:

import pandas as pd
# Create a dict containing the data to be imported
data = {
    'id': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
    'data': [
        [0,0,1,0,1,1,0,1,1,1,0,0,0,0,0,1,0,1,1,0,0,1,1,0,1,1,0,1,0,0,0,1,0,0,1,0,1,1,1,1,1,0,0,1,0,1,0,0,1,1,1,0,1,1,1,0,1,0,0,0,1,1,0,0,0,1,1,1,1,0,0,1,0,0,1,1,1,1,0,0,0,1,0,1,0,1,1,0,0,1,0,1,0,1,0,1,0,0,0,0,0,0,1,1,0,0,0,0,1,0,1,1,1,0,0,1,1,0,0,0,1,0,0,0,0,0,1,1],
        [1,1,1,0,0,1,0,1,1,0,0,1,0,1,1,1,0,1,1,1,0,0,1,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,1,0,1,1,1,0,1,1,1,0,1,1,1,0,1,1,0,1,1,1,0,1,0,1,0,1,0,0,1,1,1,1,0,0,0,0,0,1,0,1,0,0,1,0,1,1,1,1,0,1,0,0,0,1,1,1,1,0,0,0,0,0,0,0,1,1,1,1,1,1,0,1,0,1,0,1,0,1,0,0,1,0,1,1,0,1],
        [0,1,0,0,0,1,0,1,0,1,1,1,0,1,1,0,0,0,1,0,0,1,0,0,1,0,1,1,0,1,0,0,0,1,0,1,0,0,0,0,1,0,0,0,1,0,1,1,0,0,0,0,1,0,0,1,1,0,0,1,0,0,0,1,1,1,1,0,0,0,1,1,1,1,0,0,0,0,1,1,0,0,1,1,1,1,0,1,0,0,1,0,1,0,1,1,0,0,1,1,0,1,1,0,1,0,0,1,0,1,1,1,0,0,0,1,0,0,1,0,1,0,1,0,1,1,0,1],
        [1,1,1,1,1,1,1,0,1,0,0,1,0,1,0,0,0,0,1,1,1,0,0,1,1,0,1,0,1,1,1,0,1,0,1,1,0,0,0,0,1,1,1,1,1,1,1,0,0,1,1,1,1,1,0,0,1,0,0,0,1,0,1,1,0,0,1,1,1,1,0,1,1,1,0,0,0,1,0,0,0,1,0,1,1,0,1,0,0,0,0,1,1,0,1,1,1,0,1,1,0,1,0,0,0,0,1,1,0,0,0,0,0,1,0,0,1,1,1,1,0,0,0,1,0,1,1,1],
        [1,0,1,0,1,1,0,1,1,1,0,0,1,1,0,0,1,1,1,0,1,0,1,1,0,0,1,1,0,1,1,1,0,0,1,1,1,0,0,0,0,1,0,1,1,1,1,0,0,1,0,1,1,1,0,1,1,0,0,1,0,0,0,1,0,0,1,1,1,1,1,0,0,0,1,1,0,1,0,0,0,1,0,1,1,0,0,0,0,0,0,1,1,0,0,0,1,0,1,0,1,0,0,0,0,0,1,1,0,0,0,1,1,1,1,0,0,1,0,0,0,0,0,0,1,1,0,0],
        [0,1,1,0,0,1,1,1,1,1,0,0,0,1,1,0,0,1,1,0,1,1,1,0,1,0,1,0,1,1,0,1,1,0,1,1,0,0,0,0,1,0,1,1,0,1,1,1,1,0,0,0,1,1,0,0,1,1,1,1,1,0,0,0,1,1,1,0,1,0,1,1,1,0,0,0,1,0,0,1,1,1,1,0,1,0,1,1,1,1,1,1,0,0,0,1,0,0,1,1,1,1,1,1,0,1,0,0,0,0,1,1,0,0,1,0,0,1,0,1,0,0,0,1,0,0,1,1],
        [1,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,1,1,1,1,0,1,1,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,1,1,0,0,0,1,1,0,1,0,0,1,1,1,0,1,1,0,1,1,0,0,0,0,1,0,0,1,1,1,1,0,1,0,0,1,1,0,1,0,0,1,1,1,0,1,1,1,1,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,1,0,1,1,0,0,1,0,0,1,1,0,1,1,1],
        [1,0,1,1,0,1,1,1,1,1,1,0,0,0,1,1,0,1,1,1,0,1,1,0,0,1,0,1,0,1,0,0,0,1,1,1,0,1,0,0,1,0,1,0,1,0,1,0,0,0,1,1,1,1,1,0,0,1,1,1,1,1,1,0,1,0,1,0,0,1,0,0,1,1,0,0,1,0,1,1,0,1,0,1,1,0,0,0,1,0,0,1,1,1,1,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,1,0,0,1,1,0],
        [1,1,0,1,0,1,1,1,1,0,0,0,0,0,1,1,1,1,1,1,0,1,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,1,1,1,1,1,0,0,0,1,0,1,1,1,1,1,0,1,1,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,1,1,1,0,1,0,1,0,1,1,1,0,1,1,0,1,1,0,1,1,0,0,0,0,0,1,0,1,1,0,1,1,0,1,0,0,1,0,0,0,1,1,0,0,1],
        [1,1,1,0,1,1,0,1,0,1,0,1,0,1,1,0,1,1,1,0,1,1,1,1,1,0,1,1,1,1,0,1,0,0,1,1,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,1,0,0,1,1,1,1,0,0,0,1,1,1,1,1,1,1,0,1,1,1,0,0,1,1,0,1,0,1,0,1,1,1,1,0,0,1,1,0,1,1,0,1,1,1,1,1,0,1,1,1,1,0,1,0,1,1,0,1,0,0,0,1,1,1,0,1,0,0,0,1,1,0,1,1]
    ],
    'date': ["2030-09-26", "1996-06-22", "1975-10-07", "2024-08-11", "1970-01-31", "2025-04-02", "2007-06-29", "1970-09-10", "2007-10-26", "1971-02-02"],
    'label': ["person", "building", "animal", "animal", "animal", "building", "animal", "building", "person", "building"]
}
# Create DataFrame
df = pd.DataFrame(data)

Before using client.insert to insert the data, we need to convert the binary arrays to bytearray format.

# Query the row count of 'default.myscale_categorical_binary_search'
db_count_sql="SELECT count(*) FROM default.myscale_categorical_binary_search"
# Before inserting any data, get and print the row count of 'default.myscale_categorical_binary_search'
print(f"Before insertion, db_count is {client.command(db_count_sql)}")
# Insert data into the 'myscale_categorical_binary_search' table
df_records = df.to_records(index=False)
# Format date data
df_records['date'] = pd.to_datetime(df_records['date'])
# Convert binary data to bytearray format
for idx, vector in enumerate(df_records['data']):
    byte_array = bytearray()
    vector_str = "".join(str(it) for it in vector)
    for i in range(0, len(vector_str), 8):
        byte_value = int(vector_str[i:i + 8], 2)
        byte_array.append(byte_value)
    df_records['data'][idx] = byte_array
# Import data into the database
client.insert("default.myscale_categorical_binary_search", df_records.tolist(),
              column_names=df.columns.tolist())
# After inserting data, get and print the row count of 'default.myscale_categorical_binary_search'
print(f"After insertion, db_count is {client.command(db_count_sql)}")

Example output:

Before insertion, db_count is 0
After insertion, db_count is 10

# Creating Vector Index

MyScale executes the index creation command asynchronously, meaning that the database will not be blocked during index creation. However, if the table is very large, creating the index may still take a considerable amount of time. Therefore, it is important to check in the code whether the index has been successfully created.

Here's an example code that demonstrates how to check if an index has been created:

# Float Vectors

# 1. Create an index for vector<float32>
client.command("""
ALTER TABLE default.myscale_categorical_vector_search
    ADD VECTOR INDEX categorical_vector_idx data
    TYPE MSTG
""")
# 2. Query the 'vector_indices' system table to check the status of index creation
get_index_status="SELECT status FROM system.vector_indices WHERE table='myscale_categorical_vector_search'"
# 3. Print the index creation status; if the index is successfully created, the status will be 'Built'
print(f"Index creation status is {client.command(get_index_status)}")

# Binary Vectors

# 1. Create an index for binary data type
client.command("""
ALTER TABLE default.myscale_categorical_binary_search
    ADD VECTOR INDEX categorical_binary_idx data
    TYPE BinaryMSTG('metric_type=Hamming')
""")
# 2. Query the 'vector_indices' system table to check the status of index creation
get_index_status="SELECT status FROM system.vector_indices WHERE table='myscale_categorical_binary_search'"
# 3. Print the index creation status; if the index is successfully created, the status will be 'Built'
print(f"Index creation status is {client.command(get_index_status)}")

In this example, we execute an SQL query to select id, date, label, and the distance between data and a sample vector data calculated using the distance function. The LIMIT 10 clause specifies that the function should return the 10 closest vectors.

# Float Vectors

# Randomly select a row from the table as the target
random_row = client.query("SELECT * FROM default.myscale_categorical_vector_search ORDER BY rand() LIMIT 1")
assert random_row.row_count == 1
target_row_id = random_row.first_item["id"]
target_row_label = random_row.first_item["label"]
target_row_date = random_row.first_item["date"]
target_row_data = random_row.first_item["data"]
print("Current selected item id={}, label={}, date={}".format(target_row_id, target_row_label, target_row_date))
# Get the query result
result = client.query(f"""
SELECT id, date, label, 
    distance(data, {target_row_data}) as dist FROM default.myscale_categorical_vector_search ORDER BY dist LIMIT 10
""")
# 3. Iterate through the query result rows and print the 'id', 'date', 'label', and distance score
print("Top 10 candidates:")
for row in result.named_results():
    print(row["id"], row["date"], row["label"], row["dist"])

Example output:

Current selected item id=3, label=animal, date=2024-08-11
Top 10 candidates:
3   2024-08-11  animal      0.0
5   2025-04-02  building    211995.0
9   1971-02-02  building    214219.0
2   1975-10-07  animal      247505.0
0   2030-09-26  person      252941.0
1   1996-06-22  building    255835.0
7   1970-09-10  building    266691.0
4   1970-01-31  animal      276685.0
8   2007-10-26  person      284773.0
6   2007-06-29  animal      298423.0

# Binary Vectors

# 1. Randomly select a row from the table as the target
random_row = client.query("SELECT * FROM default.myscale_categorical_binary_search ORDER BY rand() LIMIT 1")
assert random_row.row_count == 1
target_row_id = random_row.first_item["id"]
target_row_label = random_row.first_item["label"]
target_row_date = random_row.first_item["date"]
target_row_data = random_row.first_item["data"]
vector_str = ""
for byte in target_row_data:
    binary_str = bin(byte)[2:].zfill(8)
    vector_str += binary_str
print("Current selected item id={}, label={}, data={}, date={}".format(target_row_id, target_row_label, vector_str, target_row_date))
# 2. Get the query result
result = client.query(f"""
SELECT id, date, label, 
    distance(data, unbin('{vector_str}')) as dist FROM default.myscale_categorical_binary_search ORDER BY dist LIMIT 10
""")
# 3. Iterate through the query result rows and print the 'id', 'date', 'label', and distance score
print("Top 10 candidates:")
for row in result.named_results():
    print(row["id"], row["date"], row["label"], row["dist"])

Example output:

Current selected item id=1, label=building, data=11100101100101110111001100100100000100000110111011101110110111010101001111000001010010111101000111100000001111110101010100101101, date=1996-06-22
Top 10 candidates:
1 1996-06-22 building 0.0
3 2024-08-11 animal 50.0
8 2007-10-26 person 50.0
9 1971-02-02 building 52.0
4 1970-01-31 animal 54.0
5 2025-04-02 building 54.0
6 2007-06-29 animal 54.0
2 1975-10-07 animal 56.0
7 1970-09-10 building 58.0
0 2030-09-26 person 65.0
Last Updated: Tue Sep 10 2024 03:19:52 GMT+0000