首页 > 脚本专栏 > python > python chroma向量数据库安装

python之chroma向量数据库安装方式

2026-01-22 08:36:48 作者：像风一样的男人@

Chroma 的 hnsw:space 支持三种距离函数配置：平方欧氏距离（l2）、余弦距离（cosine）和内积（ip）；曼哈顿距离（l1）不在可选值之内。在人脸比对场景中，余弦距离是最优选择

环境

# python3.10.3
chromadb==1.0.4

服务方式启动

# Start the server with the chroma command directly (recommended: the environment's bin directory is already on PATH)
chroma run --host 0.0.0.0 --port 8000 --path ./chroma_data

# Or start it via the full path to the executable (for when the environment is not activated)
# NOTE(review): no --path is given here, so presumably the CLI's default data directory is used — confirm
/home/your_username/miniconda3/envs/chroma-env/bin/chroma run --host 0.0.0.0 --port 8000

连接本地文件模式（无需启动服务器）

import chromadb
from chromadb.config import Settings

# 1. Initialise a Chroma client in local (embedded) file mode.
# Since chromadb 0.4 (this article targets 1.0.4) on-disk storage is provided by
# PersistentClient; Client(Settings(persist_directory=...)) does not enable
# persistence on its own.
client = chromadb.PersistentClient(
    path="./chroma_local_data",  # directory where the data is stored
    settings=Settings(anonymized_telemetry=False),  # disable anonymous telemetry
)

# 2. Create the collection, or fetch it if it already exists.
# A collection is the container Chroma stores vectors in.
collection = client.get_or_create_collection(
    name="local_demo_collection",
    metadata={"description": "本地文件模式测试集合"},  # optional collection description
)

# 3. Add records to the collection.
# Each record has a document (raw text), metadata, a unique id and, optionally,
# a vector; when no vector is supplied Chroma embeds the document itself.
collection.add(
    documents=[
        "Python 是一种解释型编程语言",
        "Chroma 是一个轻量级向量数据库",
        "向量数据库常用于存储和检索嵌入向量",
    ],
    metadatas=[
        {"category": "programming"},
        {"category": "database"},
        {"category": "ai"},
    ],
    ids=["doc1", "doc2", "doc3"],  # one unique id per document
)

# 4. No manual persist step is needed: PersistentClient writes to disk
# automatically, and Client.persist() no longer exists (removed in chromadb 0.4).

# 5. Similarity search.
# Ask for the 2 documents closest to the query text; Chroma embeds the query
# text itself.
results = collection.query(
    query_texts=["什么是向量数据库？"],
    n_results=2,
)

# Results are nested per query text; index 0 selects the single query above.
matched_docs = results["documents"][0]
matched_distances = results["distances"][0]
matched_metadatas = results["metadatas"][0]

print("本地文件模式检索结果：")
for i, doc in enumerate(matched_docs):
    print(f"\n相似文档 {i+1}：{doc}")
    print(f"相似度分数：{matched_distances[i]}")  # lower means more similar
    print(f"元数据：{matched_metadatas[i]}")

通过端口连接（客户端 - 服务器模式）

import chromadb
from chromadb.config import Settings

# 1. Connect to a Chroma server (use localhost for a local server, or the
# server's IP for a remote one; the port must match the one it was started on).
client = chromadb.HttpClient(
    host="localhost",
    port=8000,
    settings=Settings(anonymized_telemetry=False),
)

# 2. Create or fetch the collection (same API as local mode).
collection = client.get_or_create_collection(
    name="server_demo_collection",
    metadata={"description": "服务器模式测试集合"},
)

# 3. Add records (same API as local mode). Each row is (id, document, country).
capitals = [
    ("city1", "北京是中国的首都", "中国"),
    ("city2", "巴黎是法国的首都", "法国"),
    ("city3", "东京是日本的首都", "日本"),
]
collection.add(
    documents=[text for _, text, _ in capitals],
    metadatas=[{"country": country} for _, _, country in capitals],
    ids=[city_id for city_id, _, _ in capitals],
)

# 4. Similarity search for the three closest documents.
results = collection.query(
    query_texts=["哪些城市是国家首都？"],
    n_results=3,
)

# Results are nested per query text; index 0 selects the single query above.
matched_docs = results["documents"][0]
matched_distances = results["distances"][0]
matched_metadatas = results["metadatas"][0]

print("\n服务器模式检索结果：")
for i, doc in enumerate(matched_docs):
    print(f"\n相似文档 {i+1}：{doc}")
    print(f"相似度分数：{matched_distances[i]}")
    print(f"元数据：{matched_metadatas[i]}")

创建连接

# Open the HTTP connection to the Chroma server with telemetry reporting off.
chroma_client = chromadb.HttpClient(
    host=CHROMA_HOST,  # server IP
    port=CHROMA_PORT,  # server port
    settings=Settings(anonymized_telemetry=False),
)

# Create or fetch the face collection.
face_collection = chroma_client.get_or_create_collection(
    name=FACE_TABEL_NAME,  # collection name
    metadata={
        "description": "人脸向量和id",  # collection description
        "hnsw:space": "cosine",  # use cosine distance for this collection
    },
)

调用 query 方法时，distances 字段的含义和范围因距离函数而异：

“l2”：返回平方欧氏距离，范围 [0, +∞)，值越小越相似。
“cosine”：返回余弦距离（1 − 余弦相似度），范围 [0, 2]，值越小越相似（0 代表完全匹配）。
“ip”：返回内积距离（1 − 内积），未归一化时没有固定范围，向量归一化后范围为 [0, 2]（此时与余弦距离等价），值越小越相似。
“l1”：曼哈顿距离，范围 [0, +∞)，值越小越相似；但它不是 hnsw:space 的可选值，在 Chroma 中无法直接配置。

总结：

Chroma 支持 3 种距离函数配置（“l2”、“cosine”、“ip”），核心通过 hnsw:space 指定。
人脸比对场景，“cosine” 是最优选择，其次是 “ip”（归一化后等价），避免使用默认的 “l2”（平方欧氏距离）；“l1”（曼哈顿距离）不在 hnsw:space 的可选值之内。

新增数据(修改数据)

import logging

face_vector = face.embedding.tolist()  # face embedding converted to a plain list

metadata = {
    "face_id": face_id,  # record id
    "people_type": people_type,  # person category
}
try:
    # upsert() inserts the record, or replaces its vector and metadata when
    # face_id already exists. add() never overwrites an existing id, so it
    # cannot be used for the "modify" case.
    face_collection.upsert(
        ids=[face_id],  # the caller-supplied face_id is the unique key
        embeddings=[face_vector],
        metadatas=[metadata],
    )
except Exception:
    # Best-effort write: keep running, but log the failure with its traceback
    # instead of discarding it silently.
    logging.getLogger(__name__).exception("failed to upsert face %s", face_id)

根据id查数据

# Look up a record by id. Embeddings are only returned when "embeddings" is
# listed in include.
face_collection.get(
    ids=["21121"],
    include=["embeddings", "metadatas"],
)
# Calling get() with no arguments returns every record in the collection.

根据id删除数据

# Fetch every record; include=[] asks for no payload fields, so only ids come back.
all_data = r.face_collection.get(include=[])
all_ids = all_data["ids"]
# chromadb's id validation rejects an empty id list, so skip the delete when
# the collection is already empty.
if all_ids:
    r.face_collection.delete(ids=all_ids)

删除库(集合)

# Drop the whole collection named "face".
r.chroma_client.delete_collection(name="face")

查询全部集合

# List every collection and print its name and metadata on separate lines.
collections = r.chroma_client.list_collections()
for coll in collections:
    print(f'集合名：{coll.name}', f'描述(元数据):{coll.metadata} ', sep="\n")

# Read the metadata of a single collection.
r.chroma_client.get_collection(name="face").metadata

元数据查询

# 1. Fetch every record whose metadata has people_type == 0 (visitor).
result = face_collection.get(
    where={"people_type": 0},  # metadata filter
    # ids are always part of the result; "ids" is not an accepted include
    # value and is rejected, so only the optional fields are listed here.
    include=["metadatas", "embeddings"],
)

# Unpack the result.
print("符合条件的ID：", result["ids"])  # matching ids, e.g. ["1_aug0", "2_front"]
print("符合条件的元数据：", result["metadatas"])  # metadata for each match
print("符合条件的向量：", result["embeddings"])  # vectors (returned because include lists them)

where 参数支持多种条件，满足复杂查询需求：

等于（==）：

# All records whose main_face_id equals "user_001".
face_collection.get(
    where={"main_face_id": "user_001"},
)

不等于（!=）：

# Records whose people_type is not 1 (i.e. not an employee).
face_collection.get(
    where={"people_type": {"$ne": 1}},
)

范围查询（>/</>=/<=）：

# Records whose numeric metadata field (here: score) is greater than 0.8.
face_collection.get(
    where={"score": {"$gt": 0.8}},
)

包含（in）：

# Records whose pose_type is either "front" or "left_side".
face_collection.get(
    where={"pose_type": {"$in": ["front", "left_side"]}},
)

总结

以上为个人经验，希望能给大家一个参考，也希望大家多多支持脚本之家。