Skip to content

Indexing Examples

This guide demonstrates practical indexing patterns using the Titanic dataset as an example.

Titanic Dataset

The Titanic dataset contains passenger information from the RMS Titanic. Each record includes:

  • name: Passenger name
  • sex: Gender (male/female)
  • age: Age in years
  • survived: Survival status (0=No, 1=Yes)
  • pclass: Passenger class (1=1st, 2=2nd, 3=3rd)
  • fare: Ticket fare
  • embarked: Port of embarkation (C=Cherbourg, Q=Queenstown, S=Southampton)
  • sibsp: Number of siblings/spouses aboard
  • parch: Number of parents/children aboard

Synchronous Indexing

Basic Example

from taiyo import SolrClient, SolrDocument

# Three sample passenger records from the Titanic dataset, one dict each.
passenger_rows = [
    {
        "name": "Braund, Mr. Owen Harris",
        "sex": "male",
        "age": 22.0,
        "survived": 0,
        "pclass": 3,
        "fare": 7.25,
        "embarked": "S",
        "sibsp": 1,
        "parch": 0,
    },
    {
        "name": "Cumings, Mrs. John Bradley",
        "sex": "female",
        "age": 38.0,
        "survived": 1,
        "pclass": 1,
        "fare": 71.2833,
        "embarked": "C",
        "sibsp": 1,
        "parch": 0,
    },
    {
        "name": "Heikkinen, Miss. Laina",
        "sex": "female",
        "age": 26.0,
        "survived": 1,
        "pclass": 3,
        "fare": 7.925,
        "embarked": "S",
        "sibsp": 0,
        "parch": 0,
    },
]

with SolrClient("http://localhost:8983/solr") as client:
    client.set_collection("titanic")

    # Wrap each raw record in a SolrDocument and index with a single commit.
    docs = [SolrDocument(**row) for row in passenger_rows]
    client.add(docs, commit=True)

    print(f"Indexed {len(docs)} passengers")

Loading from CSV

import csv
from taiyo import SolrClient, SolrDocument


def index_titanic_csv(csv_path: str):
    """Index Titanic passenger records from a CSV file into Solr.

    Reads the CSV at *csv_path* with ``csv.DictReader``, converts each row
    to a ``SolrDocument``, sends the documents in batches of 100 without
    intermediate commits, and issues one commit at the end.
    """
    with SolrClient("http://localhost:8983/solr") as client:
        client.set_collection("titanic")

        # newline="" is required by the csv module so quoted fields that
        # contain embedded newlines are parsed correctly; utf-8 avoids
        # depending on the platform's default encoding.
        with open(csv_path, "r", newline="", encoding="utf-8") as f:
            reader = csv.DictReader(f)

            batch = []
            batch_size = 100

            for row in reader:
                doc = SolrDocument(
                    name=row["name"],
                    sex=row["sex"],
                    # Optional numeric columns may be blank; store None then.
                    age=float(row["age"]) if row["age"] else None,
                    survived=int(row["survived"]),
                    pclass=int(row["pclass"]),
                    fare=float(row["fare"]) if row["fare"] else None,
                    embarked=row.get("embarked", ""),
                    sibsp=int(row["sibsp"]),
                    parch=int(row["parch"]),
                )

                batch.append(doc)

                # Flush a full batch without committing.
                if len(batch) >= batch_size:
                    client.add(batch, commit=False)
                    print(f"Indexed {len(batch)} passengers")
                    batch = []

            # Flush the final partial batch, if any.
            if batch:
                client.add(batch, commit=False)
                print(f"Indexed {len(batch)} passengers")

            client.commit()
            print("Indexing complete")


index_titanic_csv("titanic.csv")

With Type-Safe Model

from typing import Optional
from taiyo import SolrDocument, SolrClient


class Passenger(SolrDocument):
    """Typed Solr document model for one Titanic passenger record."""

    name: str  # full passenger name
    sex: str  # "male" or "female"
    age: Optional[float] = None  # age in years; None when unknown
    survived: int  # 0 = did not survive, 1 = survived
    pclass: int  # passenger class: 1, 2 or 3
    fare: Optional[float] = None  # ticket fare; None when unknown
    embarked: Optional[str] = None  # port code: C, Q or S
    sibsp: int = 0  # siblings/spouses aboard
    parch: int = 0  # parents/children aboard


# Raw passenger rows, validated by the Passenger model below.
passenger_rows = [
    {
        "name": "Braund, Mr. Owen Harris",
        "sex": "male",
        "age": 22.0,
        "survived": 0,
        "pclass": 3,
        "fare": 7.25,
        "embarked": "S",
        "sibsp": 1,
        "parch": 0,
    },
    {
        "name": "Cumings, Mrs. John Bradley",
        "sex": "female",
        "age": 38.0,
        "survived": 1,
        "pclass": 1,
        "fare": 71.2833,
        "embarked": "C",
        "sibsp": 1,
        "parch": 0,
    },
]
passengers = [Passenger(**row) for row in passenger_rows]

with SolrClient("http://localhost:8983/solr") as client:
    client.set_collection("titanic")
    client.add(passengers, commit=True)

Asynchronous Indexing

Basic Async Example

import asyncio
from taiyo import AsyncSolrClient, SolrDocument


async def index_passengers():
    """Index two sample passengers using the asynchronous Solr client."""
    docs = [
        SolrDocument(
            name="Braund, Mr. Owen Harris",
            sex="male",
            age=22.0,
            survived=0,
            pclass=3,
            fare=7.25,
            embarked="S",
        ),
        SolrDocument(
            name="Cumings, Mrs. John Bradley",
            sex="female",
            age=38.0,
            survived=1,
            pclass=1,
            fare=71.2833,
            embarked="C",
        ),
    ]

    async with AsyncSolrClient("http://localhost:8983/solr") as client:
        client.set_collection("titanic")
        # Single batched add with an immediate commit.
        await client.add(docs, commit=True)
        print(f"Indexed {len(docs)} passengers")


asyncio.run(index_passengers())

Async CSV Loading

import asyncio
import csv
from taiyo import AsyncSolrClient, SolrDocument


async def index_titanic_async(csv_path: str):
    """Asynchronously index Titanic passenger records from a CSV file.

    Streams the CSV at *csv_path* into Solr in batches of 100 without
    intermediate commits, then issues a single commit at the end.
    """
    async with AsyncSolrClient("http://localhost:8983/solr") as client:
        client.set_collection("titanic")

        batch = []
        batch_size = 100

        # newline="" is required by the csv module so quoted fields that
        # contain embedded newlines are parsed correctly; utf-8 avoids
        # depending on the platform's default encoding.
        with open(csv_path, "r", newline="", encoding="utf-8") as f:
            reader = csv.DictReader(f)

            for row in reader:
                doc = SolrDocument(
                    name=row["name"],
                    sex=row["sex"],
                    # Optional numeric columns may be blank; store None then.
                    age=float(row["age"]) if row["age"] else None,
                    survived=int(row["survived"]),
                    pclass=int(row["pclass"]),
                    fare=float(row["fare"]) if row["fare"] else None,
                    embarked=row.get("embarked", ""),
                    sibsp=int(row["sibsp"]),
                    parch=int(row["parch"]),
                )

                batch.append(doc)

                # Flush a full batch without committing.
                if len(batch) >= batch_size:
                    await client.add(batch, commit=False)
                    print(f"Indexed {len(batch)} passengers")
                    batch = []

            # Flush the final partial batch, if any.
            if batch:
                await client.add(batch, commit=False)
                print(f"Indexed {len(batch)} passengers")

            await client.commit()
            print("Indexing complete")


asyncio.run(index_titanic_async("titanic.csv"))

Concurrent Indexing from Multiple Sources

import asyncio
from taiyo import AsyncSolrClient, SolrDocument


async def load_male_passengers():
    """Simulate fetching male passenger records from one data source."""
    braund = SolrDocument(
        name="Braund, Mr. Owen Harris", sex="male", age=22.0, survived=0, pclass=3
    )
    allen = SolrDocument(
        name="Allen, Mr. William Henry", sex="male", age=35.0, survived=0, pclass=3
    )
    return [braund, allen]


async def load_female_passengers():
    """Simulate fetching female passenger records from another data source."""
    cumings = SolrDocument(
        name="Cumings, Mrs. John Bradley",
        sex="female",
        age=38.0,
        survived=1,
        pclass=1,
    )
    heikkinen = SolrDocument(
        name="Heikkinen, Miss. Laina", sex="female", age=26.0, survived=1, pclass=3
    )
    return [cumings, heikkinen]


async def index_from_source(client, source_func, name):
    """Load documents from *source_func* and add them via *client* (no commit)."""
    documents = await source_func()
    await client.add(documents, commit=False)
    print(f"Indexed {len(documents)} passengers from {name}")


async def index_concurrently():
    """Index passengers from two sources concurrently, then commit once."""
    async with AsyncSolrClient("http://localhost:8983/solr") as client:
        client.set_collection("titanic")

        # Each entry pairs a loader coroutine with a label for logging.
        sources = [
            (load_male_passengers, "male source"),
            (load_female_passengers, "female source"),
        ]
        await asyncio.gather(
            *(index_from_source(client, loader, label) for loader, label in sources)
        )

        await client.commit()
        print("All sources indexed")


asyncio.run(index_concurrently())

Pandas Integration

import pandas as pd
from taiyo import SolrClient, SolrDocument

df = pd.read_csv("titanic.csv")

# Replace NaN in the optional columns with neutral defaults before indexing.
df = df.fillna({"age": 0.0, "fare": 0.0, "embarked": ""})

with SolrClient("http://localhost:8983/solr") as client:
    client.set_collection("titanic")

    batch = []
    batch_size = 100

    # to_dict("records") yields one plain dict per row. It replaces
    # DataFrame.iterrows(), which constructs a Series per row and upcasts
    # dtypes across columns (a documented pandas caveat) and is slow.
    for row in df.to_dict("records"):
        doc = SolrDocument(
            name=row["name"],
            sex=row["sex"],
            age=float(row["age"]),
            survived=int(row["survived"]),
            pclass=int(row["pclass"]),
            fare=float(row["fare"]),
            embarked=row["embarked"],
            sibsp=int(row["sibsp"]),
            parch=int(row["parch"]),
        )

        batch.append(doc)

        # Send full batches without committing; one commit at the end.
        if len(batch) >= batch_size:
            client.add(batch, commit=False)
            batch = []

    # Send the final partial batch, if any.
    if batch:
        client.add(batch, commit=False)

    client.commit()
    print(f"Indexed {len(df)} passengers")

Schema Setup for Titanic Dataset

from taiyo import SolrClient
from taiyo.schema import SolrField

with SolrClient("http://localhost:8983/solr") as client:
    client.set_collection("titanic")

    # Field name -> Solr field type for the Titanic collection; every
    # field is both indexed and stored.
    field_types = {
        "name": "text_general",
        "sex": "string",
        "age": "pfloat",
        "survived": "pint",
        "pclass": "pint",
        "fare": "pfloat",
        "embarked": "string",
        "sibsp": "pint",
        "parch": "pint",
    }

    for field_name, field_type in field_types.items():
        client.add_field(
            SolrField(name=field_name, type=field_type, indexed=True, stored=True)
        )

    print("Schema configured")

Searching Indexed Data

from taiyo import SolrClient
from taiyo.parsers import ExtendedDisMaxQueryParser

with SolrClient("http://localhost:8983/solr") as client:
    client.set_collection("titanic")

    # Boost name-field matches and facet on three categorical fields.
    parser = ExtendedDisMaxQueryParser(
        query="Mrs", query_fields={"name": 2.0}
    ).facet(fields=["sex", "pclass", "survived"], mincount=1)

    results = client.search(parser)
    print(f"Found {results.num_found} passengers")

    for doc in results.docs:
        print(f"{doc.name} - Class {doc.pclass}, Survived: {doc.survived}")

    facets = results.facets
    if facets:
        print("\nFacets:")
        for field_name, facet in facets.fields.items():
            print(f"\n{field_name}:")
            for bucket in facet.buckets:
                print(f"  {bucket.value}: {bucket.count}")

Error Handling

from taiyo import SolrClient, SolrDocument, SolrError


def safe_index(passengers):
    """Index passenger dicts one at a time, collecting those that fail.

    Validation errors and Solr errors are reported per passenger rather
    than aborting the run; a single commit is attempted at the end.
    Returns the list of passenger dicts that could not be indexed.
    """
    failed = []

    with SolrClient("http://localhost:8983/solr") as client:
        client.set_collection("titanic")

        for passenger in passengers:
            try:
                client.add(SolrDocument(**passenger), commit=False)
            except ValueError as exc:
                print(f"Invalid passenger data: {exc}")
                failed.append(passenger)
            except SolrError as exc:
                print(f"Solr error: {exc}")
                failed.append(passenger)

        try:
            client.commit()
        except SolrError as exc:
            print(f"Commit failed: {exc}")

    return failed

See Also