# Cliente de Python

MyScale es compatible con ClickHouse, por lo que puedes usar el cliente oficial de Clickhouse (opens new window) para acceder a MyScale desde tus aplicaciones de Python.

Para instalar las dependencias necesarias, utiliza el siguiente comando:

pip install -U clickhouse-connect

# Crear conexión

Para aprender cómo establecer una conexión con el clúster, consulta la sección de Detalles de conexión.

# Crear tabla

Utilice el SDK de Python para crear una Tabla que contenga las columnas id, data, date y label. Tenga en cuenta que las sentencias SQL para crear tablas difieren según el tipo de vector. Actualmente, MyScale admite vectores de coma flotante y vectores binarios:

# Vectores de coma flotante

Si sus datos de vector son una matriz Float32, siga la siguiente SQL para crear la tabla, asumiendo que los vectores que se importarán posteriormente tendrán una longitud de 128.

# Crear una tabla que contenga vectores de coma flotante de 128 dimensiones.
client.command("""
CREATE TABLE default.myscale_categorical_vector_search
(
    id    UInt32,
    data  Array(Float32),
    CONSTRAINT check_length CHECK length(data) = 128,
    date  Date,
    label Enum8('person' = 1, 'building' = 2, 'animal' = 3)
)
ORDER BY id""")
# Obtener e imprimir los nombres de todas las tablas en la base de datos actual.
res = client.query("SHOW TABLES").named_results()
print([r['name'] for r in res])

Resultado del código de ejemplo:

['myscale_categorical_vector_search']

# Vectores binarios

Si sus datos de vector son datos binarios, puede crear una tabla llamada myscale_categorical_binary_search que contenga las columnas id, data, date y label de la siguiente manera. Tenga en cuenta que los datos binarios importados a la tabla deben tener una longitud consistente. Supongamos que la longitud de los datos binarios que se importarán es 128, entonces la longitud de FixedString debe establecerse en 128/8=16.

# Crear una tabla para almacenar datos binarios
client.command("""
CREATE TABLE default.myscale_categorical_binary_search
(
    id    UInt32,
    data  FixedString(16),
    date  Date,
    label Enum8('person' = 1, 'building' = 2, 'animal' = 3)
)
ORDER BY id""")
# Obtener e imprimir los nombres de todas las tablas en la base de datos actual.
res = client.query("SHOW TABLES").named_results()
print([r['name'] for r in res])

Resultado del código de ejemplo:

['myscale_categorical_binary_search']

# Importar datos

# Vectores de coma flotante

La importación de vectores de coma flotante es relativamente directa. Supongamos que tenemos un DataFrame de Pandas con los siguientes valores:

import pandas as pd
# Crear un diccionario que contenga los datos a importar
data = {
    'id': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
    'data': [
        [0,0,0,1,8,7,3,2,5,0,0,3,5,7,11,31,13,0,0,0,0,29,106,107,13,0,0,0,1,61,70,42,0,0,0,0,1,23,28,16,63,4,0,0,0,6,83,81,117,86,25,15,17,50,84,117,31,23,18,35,97,117,49,24,68,27,0,0,0,4,29,71,81,47,13,10,32,87,117,117,45,76,40,22,60,70,41,9,7,21,29,39,53,21,4,1,55,72,3,0,0,0,0,9,65,117,73,37,28,23,17,34,11,11,27,61,64,25,4,0,42,13,1,1,1,14,10,6],
        [65,35,8,0,0,0,1,63,48,27,31,19,16,34,96,114,3,1,8,21,27,43,57,21,11,8,37,8,0,0,1,23,101,104,11,0,0,0,0,29,83,114,114,77,23,14,18,52,28,8,46,75,39,24,59,60,2,0,18,10,20,52,52,16,12,28,4,0,0,3,5,8,102,79,58,3,0,0,0,11,114,112,78,50,17,14,45,104,19,31,53,114,73,44,34,26,3,2,0,0,0,1,8,9,34,20,0,0,0,0,1,23,30,75,87,36,0,0,0,2,0,17,66,73,3,0,0,0],
        [0,0,0,0,0,0,4,1,15,0,0,0,0,0,10,49,27,0,0,0,0,29,113,114,9,0,0,0,3,69,71,42,14,0,0,0,0,1,56,79,63,2,0,0,0,38,118,77,118,60,8,8,18,48,59,104,27,16,7,13,80,118,34,21,118,47,4,0,0,1,32,99,61,40,31,57,46,118,118,61,80,64,16,21,20,33,23,27,6,22,16,14,51,33,0,0,76,40,8,0,2,14,42,94,19,42,57,67,23,34,22,10,9,52,15,21,5,1,3,3,1,38,12,5,18,1,0,0],
        [3,9,45,22,28,11,4,3,77,10,4,1,1,4,3,11,23,0,0,0,26,49,6,7,5,3,3,1,11,50,8,9,11,7,15,21,12,17,21,25,121,12,4,7,4,7,4,41,28,2,0,1,10,42,22,20,1,1,4,9,31,79,16,3,23,4,6,26,31,121,87,40,121,82,16,12,15,41,6,10,76,48,5,3,21,42,41,50,5,17,18,64,86,54,17,6,43,62,56,84,116,108,38,26,58,63,20,87,105,37,2,2,121,121,38,25,44,33,24,46,3,16,27,74,121,55,9,4],
        [6,4,3,7,80,122,62,19,2,0,0,0,32,60,10,19,4,0,0,0,0,10,69,66,0,0,0,0,8,58,49,5,5,31,59,67,122,37,1,2,50,1,0,16,99,48,3,27,122,38,6,7,11,31,87,122,9,8,6,23,122,122,69,21,0,11,31,55,28,0,0,0,61,4,0,37,43,2,0,15,122,122,55,32,6,1,0,12,5,22,52,122,122,9,2,0,2,0,0,5,28,20,2,2,19,3,0,2,12,12,3,16,25,18,34,35,5,4,1,13,21,2,22,51,9,20,57,59],
        [6,2,19,22,22,81,31,12,72,15,12,10,3,6,1,37,30,17,4,2,9,4,2,21,1,0,1,3,11,9,5,2,7,11,17,61,127,127,28,13,49,36,26,45,28,17,4,16,111,46,11,2,7,25,40,89,2,0,8,31,63,60,28,12,0,18,82,127,50,1,0,0,94,28,11,88,15,0,0,4,127,127,34,23,25,18,18,69,6,16,26,90,127,42,12,8,0,3,46,29,0,0,0,0,22,35,15,12,0,0,0,0,46,127,83,17,1,0,0,0,0,14,67,115,45,0,0,0],
        [19,35,5,6,40,23,18,4,21,109,120,23,5,12,24,5,0,5,87,108,47,14,32,8,0,0,0,27,36,30,43,0,29,12,10,15,6,7,17,12,34,9,14,65,20,23,28,14,120,34,14,14,9,34,120,120,7,6,7,27,56,120,120,23,9,5,4,7,2,6,46,13,29,5,5,32,12,20,99,19,120,120,107,38,13,7,24,36,6,24,120,120,55,26,4,3,5,1,0,0,1,5,19,18,2,2,0,1,18,12,30,7,0,5,33,29,66,50,26,2,0,0,49,45,12,28,10,0],
        [28,28,28,27,13,5,4,12,4,8,29,118,69,19,21,7,3,0,0,14,14,10,105,60,0,0,0,0,11,69,76,9,5,2,18,59,17,6,1,5,42,9,16,75,31,21,17,13,118,44,18,16,17,30,78,118,4,4,8,61,118,110,54,25,10,6,21,54,5,5,6,5,38,17,11,31,6,24,64,15,115,118,117,61,13,13,22,25,2,11,66,118,87,25,10,2,10,11,3,2,9,28,4,5,21,18,35,17,6,10,4,30,20,2,13,13,7,30,71,118,0,0,3,12,50,103,44,5],
        [41,38,21,17,42,71,60,50,11,1,2,11,109,115,8,4,27,8,5,22,11,9,8,14,20,10,4,33,12,7,4,1,18,115,95,42,17,1,0,0,19,6,46,115,91,16,0,7,66,7,4,15,12,32,91,109,12,3,1,8,21,115,96,17,1,51,78,14,0,0,0,0,50,40,62,53,0,0,0,3,115,115,40,12,6,13,25,65,7,30,51,65,110,92,25,9,0,1,13,0,0,0,0,0,4,22,11,1,0,0,0,0,13,115,48,1,0,0,0,0,0,36,102,63,11,0,0,0],
        [0,0,0,0,0,2,6,4,0,0,0,0,0,1,44,57,0,0,0,0,0,15,125,52,0,0,0,0,6,57,44,2,23,1,0,0,0,6,20,23,125,30,5,2,1,3,73,125,16,10,11,46,61,97,125,93,0,0,0,31,111,96,21,0,20,6,0,0,9,114,63,5,125,125,83,8,2,26,5,23,14,56,125,125,37,10,7,10,11,2,17,87,42,5,8,19,0,0,7,32,56,91,8,0,1,17,17,3,14,71,15,5,7,9,35,10,2,5,24,39,14,16,4,9,22,6,13,11]
    ],
    'date': ["2030-09-26", "1996-06-22", "1975-10-07", "2024-08-11", "1970-01-31", "2025-04-02", "2007-06-29", "1970-09-10", "2007-10-26", "1971-02-02"],
    'label': ["person", "building", "animal", "animal", "animal", "building", "animal", "building", "person", "building"]
}
# crear el dataframe
df = pd.DataFrame(data)

Podemos insertar datos usando client.insert:

# Consultar el número de filas en la tabla 'default.myscale_categorical_vector_search'
db_count_sql="SELECT count(*) FROM default.myscale_categorical_vector_search"
# Antes de insertar cualquier dato, obtener e imprimir el número de filas en la tabla 'default.myscale_categorical_vector_search'
print(f"Antes de insertar, db_count es {client.command(db_count_sql)}")
# Insertar los datos en la tabla 'myscale_categorical_vector_search'
df_records = df.to_records(index=False)
df_records['date'] = pd.to_datetime(df_records['date'])
client.insert("default.myscale_categorical_vector_search", df_records.tolist(),
              column_names=df.columns.tolist())
# Después de insertar los datos, obtener e imprimir el número de filas en la tabla 'default.myscale_categorical_vector_search'
print(f"Después de insertar, db_count es {client.command(db_count_sql)}")

Resultado de la ejecución del código de ejemplo:

antes de insertar, db_count es 0
después de insertar, db_count es 10

# Vectores binarios

La importación de vectores binarios es relativamente más compleja. Supongamos que tenemos un DataFrame de Pandas y necesitamos importar 10 datos binarios, con los siguientes valores:

import pandas as pd
# Crear un diccionario que contenga los datos a importar
data = {
    'id': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
    'data': [
        [0,0,1,0,1,1,0,1,1,1,0,0,0,0,0,1,0,1,1,0,0,1,1,0,1,1,0,1,0,0,0,1,0,0,1,0,1,1,1,1,1,0,0,1,0,1,0,0,1,1,1,0,1,1,1,0,1,0,0,0,1,1,0,0,0,1,1,1,1,0,0,1,0,0,1,1,1,1,0,0,0,1,0,1,0,1,1,0,0,1,0,1,0,1,0,1,0,0,0,0,0,0,1,1,0,0,0,0,1,0,1,1,1,0,0,1,1,0,0,0,1,0,0,0,0,0,1,1],
        [1,1,1,0,0,1,0,1,1,0,0,1,0,1,1,1,0,1,1,1,0,0,1,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,1,0,1,1,1,0,1,1,1,0,1,1,1,0,1,1,0,1,1,1,0,1,0,1,0,1,0,0,1,1,1,1,0,0,0,0,0,1,0,1,0,0,1,0,1,1,1,1,0,1,0,0,0,1,1,1,1,0,0,0,0,0,0,0,1,1,1,1,1,1,0,1,0,1,0,1,0,1,0,0,1,0,1,1,0,1],
        [0,1,0,0,0,1,0,1,0,1,1,1,0,1,1,0,0,0,1,0,0,1,0,0,1,0,1,1,0,1,0,0,0,1,0,1,0,0,0,0,1,0,0,0,1,0,1,1,0,0,0,0,1,0,0,1,1,0,0,1,0,0,0,1,1,1,1,0,0,0,1,1,1,1,0,0,0,0,1,1,0,0,1,1,1,1,0,1,0,0,1,0,1,0,1,1,0,0,1,1,0,1,1,0,1,0,0,1,0,1,1,1,0,0,0,1,0,0,1,0,1,0,1,0,1,1,0,1],
        [1,1,1,1,1,1,1,0,1,0,0,1,0,1,0,0,0,0,1,1,1,0,0,1,1,0,1,0,1,1,1,0,1,0,1,1,0,0,0,0,1,1,1,1,1,1,1,0,0,1,1,1,1,1,0,0,1,0,0,0,1,0,1,1,0,0,1,1,1,1,0,1,1,1,0,0,0,1,0,0,0,1,0,1,1,0,1,0,0,0,0,1,1,0,1,1,1,0,1,1,0,1,0,0,0,0,1,1,0,0,0,0,0,1,0,0,1,1,1,1,0,0,0,1,0,1,1,1],
        [1,0,1,0,1,1,0,1,1,1,0,0,1,1,0,0,1,1,1,0,1,0,1,1,0,0,1,1,0,1,1,1,0,0,1,1,1,0,0,0,0,1,0,1,1,1,1,0,0,1,0,1,1,1,0,1,1,0,0,1,0,0,0,1,0,0,1,1,1,1,1,0,0,0,1,1,0,1,0,0,0,1,0,1,1,0,0,0,0,0,0,1,1,0,0,0,1,0,1,0,1,0,0,0,0,0,1,1,0,0,0,1,1,1,1,0,0,1,0,0,0,0,0,0,1,1,0,0],
        [0,1,1,0,0,1,1,1,1,1,0,0,0,1,1,0,0,1,1,0,1,1,1,0,1,0,1,0,1,1,0,1,1,0,1,1,0,0,0,0,1,0,1,1,0,1,1,1,1,0,0,0,1,1,0,0,1,1,1,1,1,0,0,0,1,1,1,0,1,0,1,1,1,0,0,0,1,0,0,1,1,1,1,0,1,0,1,1,1,1,1,1,0,0,0,1,0,0,1,1,1,1,1,1,0,1,0,0,0,0,1,1,0,0,1,0,0,1,0,1,0,0,0,1,0,0,1,1],
        [1,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,1,1,1,1,0,1,1,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,1,1,0,0,0,1,1,0,1,0,0,1,1,1,0,1,1,0,1,1,0,0,0,0,1,0,0,1,1,1,1,0,1,0,0,1,1,0,1,0,0,1,1,1,0,1,1,1,1,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,1,0,1,1,0,0,1,0,0,1,1,0,1,1,1],
        [1,0,1,1,0,1,1,1,1,1,1,0,0,0,1,1,0,1,1,1,0,1,1,0,0,1,0,1,0,1,0,0,0,1,1,1,0,1,0,0,1,0,1,0,1,0,1,0,0,0,1,1,1,1,1,0,0,1,1,1,1,1,1,0,1,0,1,0,0,1,0,0,1,1,0,0,1,0,1,1,0,1,0,1,1,0,0,0,1,0,0,1,1,1,1,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,1,0,0,1,1,0],
        [1,1,0,1,0,1,1,1,1,0,0,0,0,0,1,1,1,1,1,1,0,1,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,1,1,1,1,1,0,0,0,1,0,1,1,1,1,1,0,1,1,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,1,1,1,0,1,0,1,0,1,1,1,0,1,1,0,1,1,0,1,1,0,0,0,0,0,1,0,1,1,0,1,1,0,1,0,0,1,0,0,0,1,1,0,0,1],
        [1,1,1,0,1,1,0,1,0,1,0,1,0,1,1,0,1,1,1,0,1,1,1,1,1,0,1,1,1,1,0,1,0,0,1,1,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,1,0,0,1,1,1,1,0,0,0,1,1,1,1,1,1,1,0,1,1,1,0,0,1,1,0,1,0,1,0,1,1,1,1,0,0,1,1,0,1,1,0,1,1,1,1,1,0,1,1,1,1,0,1,0,1,1,0,1,0,0,0,1,1,1,0,1,0,0,0,1,1,0,1,1]
    ],
    'date': ["2030-09-26", "1996-06-22", "1975-10-07", "2024-08-11", "1970-01-31", "2025-04-02", "2007-06-29", "1970-09-10", "2007-10-26", "1971-02-02"],
    'label': ["person", "building", "animal", "animal", "animal", "building", "animal", "building", "person", "building"]
}
# crear el dataframe
df = pd.DataFrame(data)

Podemos insertar datos usando client.insert:

# Consultar el número de filas en la tabla 'default.myscale_categorical_binary_search'
db_count_sql="SELECT count(*) FROM default.myscale_categorical_binary_search"
# Antes de insertar cualquier dato, obtener e imprimir el número de filas en la tabla 'default.myscale_categorical_binary_search'
print(f"Antes de insertar, db_count es {client.command(db_count_sql)}")
# Insertar los datos en la tabla 'myscale_categorical_binary_search'
df_records = df.to_records(index=False)
# Formatear los datos de fecha
df_records['date'] = pd.to_datetime(df_records['date'])
# Convertir los datos binarios a formato bytearray
for idx, vector in enumerate(df_records['data']):
    byte_array = bytearray()
    vector_str = "".join(str(it) for it in vector)
    for i in range(0, len(vector_str), 8):
        byte_value = int(vector_str[i:i + 8], 2)
        byte_array.append(byte_value)
    df_records['data'][idx] = byte_array
# Importar datos a la base de datos
client.insert("default.myscale_categorical_binary_search", df_records.tolist(),
              column_names=df.columns.tolist())
# Después de insertar los datos, obtener e imprimir el número de filas en la tabla 'default.myscale_categorical_binary_search'
print(f"Después de insertar, db_count es {client.command(db_count_sql)}")

Resultado de la ejecución del código de ejemplo:

antes de insertar, db_count es 0
después de insertar, db_count es 10

# Creando Índice Vectorial

MyScale ejecuta los comandos de creación de índice de forma asíncrona, lo que significa que no bloqueará la base de datos durante la creación del índice. Sin embargo, si la tabla es muy grande, la creación del índice aún puede llevar bastante tiempo. Por lo tanto, es importante verificar en el código si el índice se creó correctamente. Aquí hay un código de ejemplo que muestra cómo verificar si se ha creado el índice:

# Vectores de coma flotante

# 1. Crear un índice para vector<float32>
client.command("""
ALTER TABLE default.myscale_categorical_vector_search
    ADD VECTOR INDEX categorical_vector_idx data
    TYPE MSTG
""")
# 2. Consultar la tabla del sistema 'vector_indices' para verificar el estado de creación del índice
get_index_status="SELECT status FROM system.vector_indices WHERE table='myscale_categorical_vector_search'"
# 3. Imprimir el estado de creación del índice, si el índice se creó correctamente, el estado será 'Built'
print(f"El estado de creación del índice es {client.command(get_index_status)}")

# Vectores binarios

# 1. Crear un índice para datos de tipo binario
client.command("""
ALTER TABLE default.myscale_categorical_binary_search
    ADD VECTOR INDEX categorical_binary_idx data
    TYPE BinaryMSTG('metric_type=Hamming')
""")
# 2. Consultar la tabla del sistema 'vector_indices' para verificar el estado de creación del índice
get_index_status="SELECT status FROM system.vector_indices WHERE table='myscale_categorical_binary_search'"
# 3. Imprimir el estado de creación del índice, si el índice se creó correctamente, el estado será 'Built'
print(f"El estado de creación del índice es {client.command(get_index_status)}")

# Búsqueda Vectorial

En este ejemplo, ejecutamos una consulta SQL para seleccionar id, date, label y la distancia calculada entre data y un vector de muestra utilizando la función distance. La cláusula LIMIT 10 especifica que la función debe devolver los 10 vectores más cercanos.

# Vectores de coma flotante

 # Seleccionar una fila aleatoria de la tabla como objetivo
random_row = client.query("SELECT * FROM default.myscale_categorical_vector_search ORDER BY rand() LIMIT 1")
assert random_row.row_count == 1
target_row_id = random_row.first_item["id"]
target_row_label = random_row.first_item["label"]
target_row_date = random_row.first_item["date"]
target_row_data = random_row.first_item["data"]
print("El elemento seleccionado actualmente es id={}, label={}, date={}".format(target_row_id, target_row_label, target_row_date))
# Obtener los resultados de la consulta
result = client.query(f"""
SELECT id, date, label, 
    distance(data, {target_row_data}) as dist FROM default.myscale_categorical_vector_search ORDER BY dist LIMIT 10
""")
# 3. Iterar sobre las filas del resultado de la consulta e imprimir el 'id', 'date', 'label' y la puntuación de distancia 'distance' de cada fila
print("Los 10 principales candidatos:")
for row in result.named_results():
    print(row["id"], row["date"], row["label"], row["dist"])

Resultado del código de ejemplo:

El elemento seleccionado actualmente es id=3, label=animal, date=2024-08-11
Los 10 principales candidatos:
3   2024-08-11  animal      0.0
5   2025-04-02  building    211995.0
9   1971-02-02  building    214219.0
2   1975-10-07  animal      247505.0
0   2030-09-26  person      252941.0
1   1996-06-22  building    255835.0
7   1970-09-10  building    266691.0
4   1970-01-31  animal      276685.0
8   2007-10-26  person      284773.0
6   2007-06-29  animal      298423.0

# Vectores binarios

 # 1. Seleccionar una fila aleatoria de la tabla como objetivo
random_row = client.query("SELECT * FROM default.myscale_categorical_binary_search ORDER BY rand() LIMIT 1")
assert random_row.row_count == 1
target_row_id = random_row.first_item["id"]
target_row_label = random_row.first_item["label"]
target_row_date = random_row.first_item["date"]
target_row_data = random_row.first_item["data"]
vector_str = ""
for byte in target_row_data:
    binary_str = bin(byte)[2:].zfill(8)
    vector_str += binary_str
print("El elemento seleccionado actualmente es id={}, label={}, data={}, date={}".format(target_row_id, target_row_label, vector_str, target_row_date))
# 2. Obtener los resultados de la consulta
result = client.query(f"""
SELECT id, date, label, 
    distance(data, unbin('{vector_str}')) as dist FROM default.myscale_categorical_binary_search ORDER BY dist LIMIT 10
""")
# 3. Iterar sobre las filas del resultado de la consulta e imprimir el 'id', 'date', 'label' y la puntuación de distancia 'distance' de cada fila
print("Los 10 principales candidatos:")
for row in result.named_results():
    print(row["id"], row["date"], row["label"], row["dist"])

Resultado del código de ejemplo:

El elemento seleccionado actualmente es id=1, label=building, data=11100101100101110111001100100100000100000110111011101110110111010101001111000001010010111101000111100000001111110101010100101101, date=1996-06-22
Los 10 principales candidatos:
1 1996-06-22 building 0.0
3 2024-08-11 animal 50.0
8 2007-10-26 person 50.0
9 1971-02-02 building 52.0
4 1970-01-31 animal 54.0
5 2025-04-02 building 54.0
6 2007-06-29 animal 54.0
2 1975-10-07 animal 56.0
7 1970-09-10 building 58.0
0 2030-09-26 person 65.0
Last Updated: Fri Nov 01 2024 09:02:06 GMT+0000