# Python Client
MyScale is compatible with ClickHouse, so you can use the official ClickHouse client to access MyScale from your Python applications.
To install the required dependencies, use the following command:

```bash
pip install -U clickhouse-connect
```
# Creating Connection
To learn how to establish a connection to the cluster, please refer to the Connection Details section.
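For convenience, a minimal connection sketch is shown below; the host, username, and password are placeholders that you should replace with the values from your cluster's connection details.

```python
import clickhouse_connect

# Placeholder connection details -- replace with the values shown
# on your cluster's Connection Details page.
client = clickhouse_connect.get_client(
    host='YOUR_CLUSTER_HOST',
    port=8443,
    username='YOUR_USERNAME',
    password='YOUR_PASSWORD'
)
```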
# Creating Table
Use the Python SDK to create a table with `id`, `data`, `date`, and `label` columns.
Note that the SQL statement for creating a table differs depending on the vector type. MyScale currently supports float and binary vector types:
# Float Vectors
```python
# Create a table with a 128-dimensional float vector.
client.command("""
CREATE TABLE default.myscale_categorical_vector_search
(
    id UInt32,
    data Array(Float32),
    CONSTRAINT check_length CHECK length(data) = 128,
    date Date,
    label Enum8('person' = 1, 'building' = 2, 'animal' = 3)
)
ORDER BY id""")
# Get and print the names of all tables in the current database.
res = client.query("SHOW TABLES").named_results()
print([r['name'] for r in res])
```
Example output:

```
['myscale_categorical_vector_search']
```
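As an optional check, you can also inspect the schema of the new table with a standard `DESCRIBE TABLE` query:

```python
# Optional: print the column names and types of the new table.
res = client.query("DESCRIBE TABLE default.myscale_categorical_vector_search").named_results()
for r in res:
    print(r['name'], r['type'])
```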
# Binary Vectors
If your vector data is binary, you can create a table named `myscale_categorical_binary_search` with `id`, `data`, `date`, and `label` columns as follows:
Note that all binary vectors imported into the table must have the same length. Assuming we want to import binary vectors of length 128, we need to set the `FixedString` length to 128 / 8 = 16.
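As a quick illustrative check (not required by MyScale), the `FixedString` length can be derived from the vector dimension:

```python
# The binary vector dimension must be divisible by 8.
dim = 128
assert dim % 8 == 0
fixed_string_length = dim // 8  # 128 / 8 = 16
```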
```python
# Create a table for storing binary data
client.command("""
CREATE TABLE default.myscale_categorical_binary_search
(
    id UInt32,
    data FixedString(16),
    date Date,
    label Enum8('person' = 1, 'building' = 2, 'animal' = 3)
)
ORDER BY id""")
# Get and print the names of all tables in the current database.
res = client.query("SHOW TABLES").named_results()
print([r['name'] for r in res])
```
Example output:

```
['myscale_categorical_binary_search']
```
# Importing Data
# Float Vectors
Importing float vectors is relatively straightforward. Assuming we have a Pandas DataFrame with the following values:
```python
import pandas as pd
# create the data dictionary
data = {
'id': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
'data': [
[0,0,0,1,8,7,3,2,5,0,0,3,5,7,11,31,13,0,0,0,0,29,106,107,13,0,0,0,1,61,70,42,0,0,0,0,1,23,28,16,63,4,0,0,0,6,83,81,117,86,25,15,17,50,84,117,31,23,18,35,97,117,49,24,68,27,0,0,0,4,29,71,81,47,13,10,32,87,117,117,45,76,40,22,60,70,41,9,7,21,29,39,53,21,4,1,55,72,3,0,0,0,0,9,65,117,73,37,28,23,17,34,11,11,27,61,64,25,4,0,42,13,1,1,1,14,10,6],
[65,35,8,0,0,0,1,63,48,27,31,19,16,34,96,114,3,1,8,21,27,43,57,21,11,8,37,8,0,0,1,23,101,104,11,0,0,0,0,29,83,114,114,77,23,14,18,52,28,8,46,75,39,24,59,60,2,0,18,10,20,52,52,16,12,28,4,0,0,3,5,8,102,79,58,3,0,0,0,11,114,112,78,50,17,14,45,104,19,31,53,114,73,44,34,26,3,2,0,0,0,1,8,9,34,20,0,0,0,0,1,23,30,75,87,36,0,0,0,2,0,17,66,73,3,0,0,0],
[0,0,0,0,0,0,4,1,15,0,0,0,0,0,10,49,27,0,0,0,0,29,113,114,9,0,0,0,3,69,71,42,14,0,0,0,0,1,56,79,63,2,0,0,0,38,118,77,118,60,8,8,18,48,59,104,27,16,7,13,80,118,34,21,118,47,4,0,0,1,32,99,61,40,31,57,46,118,118,61,80,64,16,21,20,33,23,27,6,22,16,14,51,33,0,0,76,40,8,0,2,14,42,94,19,42,57,67,23,34,22,10,9,52,15,21,5,1,3,3,1,38,12,5,18,1,0,0],
[3,9,45,22,28,11,4,3,77,10,4,1,1,4,3,11,23,0,0,0,26,49,6,7,5,3,3,1,11,50,8,9,11,7,15,21,12,17,21,25,121,12,4,7,4,7,4,41,28,2,0,1,10,42,22,20,1,1,4,9,31,79,16,3,23,4,6,26,31,121,87,40,121,82,16,12,15,41,6,10,76,48,5,3,21,42,41,50,5,17,18,64,86,54,17,6,43,62,56,84,116,108,38,26,58,63,20,87,105,37,2,2,121,121,38,25,44,33,24,46,3,16,27,74,121,55,9,4],
[6,4,3,7,80,122,62,19,2,0,0,0,32,60,10,19,4,0,0,0,0,10,69,66,0,0,0,0,8,58,49,5,5,31,59,67,122,37,1,2,50,1,0,16,99,48,3,27,122,38,6,7,11,31,87,122,9,8,6,23,122,122,69,21,0,11,31,55,28,0,0,0,61,4,0,37,43,2,0,15,122,122,55,32,6,1,0,12,5,22,52,122,122,9,2,0,2,0,0,5,28,20,2,2,19,3,0,2,12,12,3,16,25,18,34,35,5,4,1,13,21,2,22,51,9,20,57,59],
[6,2,19,22,22,81,31,12,72,15,12,10,3,6,1,37,30,17,4,2,9,4,2,21,1,0,1,3,11,9,5,2,7,11,17,61,127,127,28,13,49,36,26,45,28,17,4,16,111,46,11,2,7,25,40,89,2,0,8,31,63,60,28,12,0,18,82,127,50,1,0,0,94,28,11,88,15,0,0,4,127,127,34,23,25,18,18,69,6,16,26,90,127,42,12,8,0,3,46,29,0,0,0,0,22,35,15,12,0,0,0,0,46,127,83,17,1,0,0,0,0,14,67,115,45,0,0,0],
[19,35,5,6,40,23,18,4,21,109,120,23,5,12,24,5,0,5,87,108,47,14,32,8,0,0,0,27,36,30,43,0,29,12,10,15,6,7,17,12,34,9,14,65,20,23,28,14,120,34,14,14,9,34,120,120,7,6,7,27,56,120,120,23,9,5,4,7,2,6,46,13,29,5,5,32,12,20,99,19,120,120,107,38,13,7,24,36,6,24,120,120,55,26,4,3,5,1,0,0,1,5,19,18,2,2,0,1,18,12,30,7,0,5,33,29,66,50,26,2,0,0,49,45,12,28,10,0],
[28,28,28,27,13,5,4,12,4,8,29,118,69,19,21,7,3,0,0,14,14,10,105,60,0,0,0,0,11,69,76,9,5,2,18,59,17,6,1,5,42,9,16,75,31,21,17,13,118,44,18,16,17,30,78,118,4,4,8,61,118,110,54,25,10,6,21,54,5,5,6,5,38,17,11,31,6,24,64,15,115,118,117,61,13,13,22,25,2,11,66,118,87,25,10,2,10,11,3,2,9,28,4,5,21,18,35,17,6,10,4,30,20,2,13,13,7,30,71,118,0,0,3,12,50,103,44,5],
[41,38,21,17,42,71,60,50,11,1,2,11,109,115,8,4,27,8,5,22,11,9,8,14,20,10,4,33,12,7,4,1,18,115,95,42,17,1,0,0,19,6,46,115,91,16,0,7,66,7,4,15,12,32,91,109,12,3,1,8,21,115,96,17,1,51,78,14,0,0,0,0,50,40,62,53,0,0,0,3,115,115,40,12,6,13,25,65,7,30,51,65,110,92,25,9,0,1,13,0,0,0,0,0,4,22,11,1,0,0,0,0,13,115,48,1,0,0,0,0,0,36,102,63,11,0,0,0],
[0,0,0,0,0,2,6,4,0,0,0,0,0,1,44,57,0,0,0,0,0,15,125,52,0,0,0,0,6,57,44,2,23,1,0,0,0,6,20,23,125,30,5,2,1,3,73,125,16,10,11,46,61,97,125,93,0,0,0,31,111,96,21,0,20,6,0,0,9,114,63,5,125,125,83,8,2,26,5,23,14,56,125,125,37,10,7,10,11,2,17,87,42,5,8,19,0,0,7,32,56,91,8,0,1,17,17,3,14,71,15,5,7,9,35,10,2,5,24,39,14,16,4,9,22,6,13,11]
],
'date': ["2030-09-26", "1996-06-22", "1975-10-07", "2024-08-11", "1970-01-31", "2025-04-02", "2007-06-29", "1970-09-10", "2007-10-26", "1971-02-02"],
'label': ["person", "building", "animal", "animal", "animal", "building", "animal", "building", "person", "building"]
}
# create the dataframe
df = pd.DataFrame(data)
```
We can insert the data using `client.insert`:
```python
# Query to count the number of rows in the 'default.myscale_categorical_vector_search' table.
db_count_sql="SELECT count(*) FROM default.myscale_categorical_vector_search"
# Get and print the count of rows in the 'default.myscale_categorical_vector_search' table before any inserts.
print(f"before insert, db_count is {client.command(db_count_sql)}")
# Insert data into the 'myscale_categorical_vector_search' table.
df_records = df.to_records(index=False)
df_records['date'] = pd.to_datetime(df_records['date'])
client.insert("default.myscale_categorical_vector_search", df_records.tolist(),
column_names=df.columns.tolist())
# Get and print the count of rows in the 'default.myscale_categorical_vector_search' table after the insert.
print(f"after insert, db_count is {client.command(db_count_sql)}")
Example output:

```
before insert, db_count is 0
after insert, db_count is 10
```
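As an alternative, clickhouse-connect also offers `client.insert_df`, which inserts a pandas DataFrame directly. A sketch assuming the same `df` as above (the exact dtype conversion may vary with your data):

```python
# Alternative: insert the DataFrame directly with insert_df.
df_alt = df.copy()
df_alt['date'] = pd.to_datetime(df_alt['date']).dt.date  # convert date strings to dates
client.insert_df("default.myscale_categorical_vector_search", df_alt)
```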
# Binary Vectors
Importing binary vectors requires packing the data into byte values. Assuming we have a Pandas DataFrame with the following values, and we want to import 10 binary data entries:
```python
import pandas as pd
# Create a dict containing the data to be imported
data = {
'id': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
'data': [
[0,0,1,0,1,1,0,1,1,1,0,0,0,0,0,1,0,1,1,0,0,1,1,0,1,1,0,1,0,0,0,1,0,0,1,0,1,1,1,1,1,0,0,1,0,1,0,0,1,1,1,0,1,1,1,0,1,0,0,0,1,1,0,0,0,1,1,1,1,0,0,1,0,0,1,1,1,1,0,0,0,1,0,1,0,1,1,0,0,1,0,1,0,1,0,1,0,0,0,0,0,0,1,1,0,0,0,0,1,0,1,1,1,0,0,1,1,0,0,0,1,0,0,0,0,0,1,1],
[1,1,1,0,0,1,0,1,1,0,0,1,0,1,1,1,0,1,1,1,0,0,1,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,1,0,1,1,1,0,1,1,1,0,1,1,1,0,1,1,0,1,1,1,0,1,0,1,0,1,0,0,1,1,1,1,0,0,0,0,0,1,0,1,0,0,1,0,1,1,1,1,0,1,0,0,0,1,1,1,1,0,0,0,0,0,0,0,1,1,1,1,1,1,0,1,0,1,0,1,0,1,0,0,1,0,1,1,0,1],
[0,1,0,0,0,1,0,1,0,1,1,1,0,1,1,0,0,0,1,0,0,1,0,0,1,0,1,1,0,1,0,0,0,1,0,1,0,0,0,0,1,0,0,0,1,0,1,1,0,0,0,0,1,0,0,1,1,0,0,1,0,0,0,1,1,1,1,0,0,0,1,1,1,1,0,0,0,0,1,1,0,0,1,1,1,1,0,1,0,0,1,0,1,0,1,1,0,0,1,1,0,1,1,0,1,0,0,1,0,1,1,1,0,0,0,1,0,0,1,0,1,0,1,0,1,1,0,1],
[1,1,1,1,1,1,1,0,1,0,0,1,0,1,0,0,0,0,1,1,1,0,0,1,1,0,1,0,1,1,1,0,1,0,1,1,0,0,0,0,1,1,1,1,1,1,1,0,0,1,1,1,1,1,0,0,1,0,0,0,1,0,1,1,0,0,1,1,1,1,0,1,1,1,0,0,0,1,0,0,0,1,0,1,1,0,1,0,0,0,0,1,1,0,1,1,1,0,1,1,0,1,0,0,0,0,1,1,0,0,0,0,0,1,0,0,1,1,1,1,0,0,0,1,0,1,1,1],
[1,0,1,0,1,1,0,1,1,1,0,0,1,1,0,0,1,1,1,0,1,0,1,1,0,0,1,1,0,1,1,1,0,0,1,1,1,0,0,0,0,1,0,1,1,1,1,0,0,1,0,1,1,1,0,1,1,0,0,1,0,0,0,1,0,0,1,1,1,1,1,0,0,0,1,1,0,1,0,0,0,1,0,1,1,0,0,0,0,0,0,1,1,0,0,0,1,0,1,0,1,0,0,0,0,0,1,1,0,0,0,1,1,1,1,0,0,1,0,0,0,0,0,0,1,1,0,0],
[0,1,1,0,0,1,1,1,1,1,0,0,0,1,1,0,0,1,1,0,1,1,1,0,1,0,1,0,1,1,0,1,1,0,1,1,0,0,0,0,1,0,1,1,0,1,1,1,1,0,0,0,1,1,0,0,1,1,1,1,1,0,0,0,1,1,1,0,1,0,1,1,1,0,0,0,1,0,0,1,1,1,1,0,1,0,1,1,1,1,1,1,0,0,0,1,0,0,1,1,1,1,1,1,0,1,0,0,0,0,1,1,0,0,1,0,0,1,0,1,0,0,0,1,0,0,1,1],
[1,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,1,1,1,1,0,1,1,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,1,1,0,0,0,1,1,0,1,0,0,1,1,1,0,1,1,0,1,1,0,0,0,0,1,0,0,1,1,1,1,0,1,0,0,1,1,0,1,0,0,1,1,1,0,1,1,1,1,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,1,0,1,1,0,0,1,0,0,1,1,0,1,1,1],
[1,0,1,1,0,1,1,1,1,1,1,0,0,0,1,1,0,1,1,1,0,1,1,0,0,1,0,1,0,1,0,0,0,1,1,1,0,1,0,0,1,0,1,0,1,0,1,0,0,0,1,1,1,1,1,0,0,1,1,1,1,1,1,0,1,0,1,0,0,1,0,0,1,1,0,0,1,0,1,1,0,1,0,1,1,0,0,0,1,0,0,1,1,1,1,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,1,0,0,1,1,0],
[1,1,0,1,0,1,1,1,1,0,0,0,0,0,1,1,1,1,1,1,0,1,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,1,1,1,1,1,0,0,0,1,0,1,1,1,1,1,0,1,1,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,1,1,1,0,1,0,1,0,1,1,1,0,1,1,0,1,1,0,1,1,0,0,0,0,0,1,0,1,1,0,1,1,0,1,0,0,1,0,0,0,1,1,0,0,1],
[1,1,1,0,1,1,0,1,0,1,0,1,0,1,1,0,1,1,1,0,1,1,1,1,1,0,1,1,1,1,0,1,0,0,1,1,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,1,0,0,1,1,1,1,0,0,0,1,1,1,1,1,1,1,0,1,1,1,0,0,1,1,0,1,0,1,0,1,1,1,1,0,0,1,1,0,1,1,0,1,1,1,1,1,0,1,1,1,1,0,1,0,1,1,0,1,0,0,0,1,1,1,0,1,0,0,0,1,1,0,1,1]
],
'date': ["2030-09-26", "1996-06-22", "1975-10-07", "2024-08-11", "1970-01-31", "2025-04-02", "2007-06-29", "1970-09-10", "2007-10-26", "1971-02-02"],
'label': ["person", "building", "animal", "animal", "animal", "building", "animal", "building", "person", "building"]
}
# Create DataFrame
df = pd.DataFrame(data)
```
Before inserting the data with `client.insert`, we need to convert the binary arrays to `bytearray` format.
```python
# Query the row count of 'default.myscale_categorical_binary_search'
db_count_sql="SELECT count(*) FROM default.myscale_categorical_binary_search"
# Before inserting any data, get and print the row count of 'default.myscale_categorical_binary_search'
print(f"Before insertion, db_count is {client.command(db_count_sql)}")
# Insert data into the 'myscale_categorical_binary_search' table
df_records = df.to_records(index=False)
# Format date data
df_records['date'] = pd.to_datetime(df_records['date'])
# Convert binary data to bytearray format
for idx, vector in enumerate(df_records['data']):
    byte_array = bytearray()
    vector_str = "".join(str(it) for it in vector)
    for i in range(0, len(vector_str), 8):
        byte_value = int(vector_str[i:i + 8], 2)
        byte_array.append(byte_value)
    df_records['data'][idx] = byte_array
# Import data into the database
client.insert("default.myscale_categorical_binary_search", df_records.tolist(),
column_names=df.columns.tolist())
# After inserting data, get and print the row count of 'default.myscale_categorical_binary_search'
print(f"After insertion, db_count is {client.command(db_count_sql)}")
Example output:

```
Before insertion, db_count is 0
After insertion, db_count is 10
```
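If NumPy is already a dependency, the same bit-to-byte packing can be done with `np.packbits`, which uses big-endian bit order and therefore matches the manual string-based packing above; this is just an optional alternative:

```python
import numpy as np

# Pack each list of 0/1 values into 16 bytes (big-endian bit order).
packed = [bytearray(np.packbits(vector).tobytes()) for vector in data['data']]
```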
# Creating Vector Index
MyScale executes the index creation command asynchronously, meaning that the database will not be blocked during index creation. However, if the table is very large, creating the index may still take a considerable amount of time. Therefore, it is important to check in the code whether the index has been successfully created.
Here's example code that demonstrates how to check whether an index has been created:
# Float Vectors
```python
# 1. Create an index for vector<float32>
client.command("""
ALTER TABLE default.myscale_categorical_vector_search
ADD VECTOR INDEX categorical_vector_idx data
TYPE MSTG
""")
# 2. Query the 'vector_indices' system table to check the status of index creation
get_index_status="SELECT status FROM system.vector_indices WHERE table='myscale_categorical_vector_search'"
# 3. Print the index creation status; if the index is successfully created, the status will be 'Built'
print(f"Index creation status is {client.command(get_index_status)}")
# Binary Vectors
```python
# 1. Create an index for binary data type
client.command("""
ALTER TABLE default.myscale_categorical_binary_search
ADD VECTOR INDEX categorical_binary_idx data
TYPE BinaryMSTG('metric_type=Hamming')
""")
# 2. Query the 'vector_indices' system table to check the status of index creation
get_index_status="SELECT status FROM system.vector_indices WHERE table='myscale_categorical_binary_search'"
# 3. Print the index creation status; if the index is successfully created, the status will be 'Built'
print(f"Index creation status is {client.command(get_index_status)}")
# Vector Search
In this example, we execute an SQL query that selects `id`, `date`, `label`, and the distance between `data` and a sample vector, computed with the `distance` function. The `LIMIT 10` clause specifies that the query should return the 10 closest vectors.
# Float Vectors
```python
# 1. Randomly select a row from the table as the target
random_row = client.query("SELECT * FROM default.myscale_categorical_vector_search ORDER BY rand() LIMIT 1")
assert random_row.row_count == 1
target_row_id = random_row.first_item["id"]
target_row_label = random_row.first_item["label"]
target_row_date = random_row.first_item["date"]
target_row_data = random_row.first_item["data"]
print("Current selected item id={}, label={}, date={}".format(target_row_id, target_row_label, target_row_date))
# 2. Get the query result
result = client.query(f"""
SELECT id, date, label,
distance(data, {target_row_data}) as dist FROM default.myscale_categorical_vector_search ORDER BY dist LIMIT 10
""")
# 3. Iterate through the query result rows and print the 'id', 'date', 'label', and distance score
print("Top 10 candidates:")
for row in result.named_results():
    print(row["id"], row["date"], row["label"], row["dist"])
```
Example output:

```
Current selected item id=3, label=animal, date=2024-08-11
Top 10 candidates:
3 2024-08-11 animal 0.0
5 2025-04-02 building 211995.0
9 1971-02-02 building 214219.0
2 1975-10-07 animal 247505.0
0 2030-09-26 person 252941.0
1 1996-06-22 building 255835.0
7 1970-09-10 building 266691.0
4 1970-01-31 animal 276685.0
8 2007-10-26 person 284773.0
6 2007-06-29 animal 298423.0
```
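In practice the query vector usually comes from outside the table (for example, from an embedding model). The same query works with a literal 128-dimensional list; the vector below is only a hypothetical placeholder:

```python
# Hypothetical 128-dimensional query vector -- replace with your own embedding.
query_vector = [0.0] * 128

result = client.query(f"""
SELECT id, date, label,
distance(data, {query_vector}) as dist FROM default.myscale_categorical_vector_search ORDER BY dist LIMIT 10
""")
for row in result.named_results():
    print(row["id"], row["date"], row["label"], row["dist"])
```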
# Binary Vectors
```python
# 1. Randomly select a row from the table as the target
random_row = client.query("SELECT * FROM default.myscale_categorical_binary_search ORDER BY rand() LIMIT 1")
assert random_row.row_count == 1
target_row_id = random_row.first_item["id"]
target_row_label = random_row.first_item["label"]
target_row_date = random_row.first_item["date"]
target_row_data = random_row.first_item["data"]
vector_str = ""
for byte in target_row_data:
    binary_str = bin(byte)[2:].zfill(8)
    vector_str += binary_str
print("Current selected item id={}, label={}, data={}, date={}".format(target_row_id, target_row_label, vector_str, target_row_date))
# 2. Get the query result
result = client.query(f"""
SELECT id, date, label,
distance(data, unbin('{vector_str}')) as dist FROM default.myscale_categorical_binary_search ORDER BY dist LIMIT 10
""")
# 3. Iterate through the query result rows and print the 'id', 'date', 'label', and distance score
print("Top 10 candidates:")
for row in result.named_results():
    print(row["id"], row["date"], row["label"], row["dist"])
```
Example output:

```
Current selected item id=1, label=building, data=11100101100101110111001100100100000100000110111011101110110111010101001111000001010010111101000111100000001111110101010100101101, date=1996-06-22
Top 10 candidates:
1 1996-06-22 building 0.0
3 2024-08-11 animal 50.0
8 2007-10-26 person 50.0
9 1971-02-02 building 52.0
4 1970-01-31 animal 54.0
5 2025-04-02 building 54.0
6 2007-06-29 animal 54.0
2 1975-10-07 animal 56.0
7 1970-09-10 building 58.0
0 2030-09-26 person 65.0
```
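If you need the stored `FixedString` bytes as a bit string on the client side, `np.unpackbits` is the inverse of the packing step used during import. An optional sketch reusing `target_row_data` and `vector_str` from the example above:

```python
import numpy as np

# Decode the 16 FixedString bytes back into a 128-character bit string.
bits = np.unpackbits(np.frombuffer(bytes(target_row_data), dtype=np.uint8))
decoded_str = "".join(str(b) for b in bits)
assert decoded_str == vector_str
```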