Indexing Documents

Learn how to add, update, and manage documents in LumoSearch indexes.

Initial Indexing

Create an index by passing documents to the constructor:

import { LumoSearch } from '@lumosearch/search'

// Sample collection: three documents, each with an id, a title and a category
const docs = [
  { id: 1, title: 'JavaScript Guide', category: 'tutorials' },
  { id: 2, title: 'TypeScript Handbook', category: 'docs' },
  { id: 3, title: 'React Patterns', category: 'tutorials' }
]

// Search the `title` field only, with a weight of 3
const searchOptions = {
  keys: [{ name: 'title', weight: 3 }]
}

const search = new LumoSearch(docs, searchOptions)

// Indexes are built automatically

Adding Documents

Add new documents without rebuilding the entire index:

// Add a single document
const nodeBook = {
  id: 4,
  title: 'Node.js in Action',
  category: 'tutorials'
}
search.add(nodeBook)

// The document is indexed immediately, so it is searchable right away
const results = search.search('node')
// => Returns the new document

Removing Documents

By Index Position

// Remove the document currently stored at position 0
search.removeAt(0)

// Note: positions shift after a removal.
// If the collection was [A, B, C], after removeAt(0) it is
// [B, C] (B is now at position 0)

By Predicate

// Remove all archived documents
const isArchived = (doc) => doc.archived === true
search.remove(isArchived)

// Remove documents by category
const isDeprecated = (doc) => doc.category === 'deprecated'
search.remove(isDeprecated)

// Remove old documents (published before the cutoff date)
const cutoff = new Date('2020-01-01')
search.remove((doc) => new Date(doc.publishedAt) < cutoff)

Replacing the Collection

Replace all documents and rebuild indexes:

// Build the replacement collection: one document per title
const newDocs = ['Updated Doc 1', 'Updated Doc 2'].map((title) => ({ title }))

// Replace entire collection
search.setCollection(newDocs)

// All indexes are rebuilt
// Previous documents are gone

Incremental Updates Pattern

// Real-time updates from API
/**
 * Fetches recently changed documents from the API and applies them to the
 * `search` index: a document whose id is already indexed replaces the old
 * copy, any other document is added as new.
 *
 * @returns {Promise<void>} Resolves once every fetched document is applied.
 * @throws {Error} If the request fails or the server answers with a
 *   non-2xx status.
 */
async function syncDocuments() {
  const response = await fetch('/api/documents/recent')

  // fetch() only rejects on network failure. Check the status explicitly so
  // an HTTP error response is never parsed and indexed as documents.
  if (!response.ok) {
    throw new Error(`Failed to fetch recent documents: HTTP ${response.status}`)
  }

  const newDocs = await response.json()

  for (const doc of newDocs) {
    // Check if exists
    const existing = search.search(doc.id, {
      filters: { id: doc.id },
      limit: 1
    })

    // Update: remove the old copy first. New documents skip straight to add.
    if (existing.length > 0) {
      search.removeAt(existing[0].refIndex)
    }
    search.add(doc)
  }
}

// Poll every minute. syncDocuments() returns a promise, so a failed sync is
// caught and logged here instead of becoming an unhandled rejection; the
// next tick simply tries again.
setInterval(() => {
  syncDocuments().catch((err) => {
    console.error('Document sync failed:', err)
  })
}, 60000)

Batch Operations

// Add multiple documents
const newDocs = [
  { title: 'Doc 1' },
  { title: 'Doc 2' },
  { title: 'Doc 3' }
]

// Option 1: add them one at a time
for (const doc of newDocs) {
  search.add(doc)
}

// Option 2 (an alternative to option 1, for larger batches): merge the new
// documents with the current ones and hand the result to setCollection
const { docs: currentDocs } = search.exportSnapshot()
const allDocs = [...currentDocs, ...newDocs]
search.setCollection(allDocs)

Document ID Management

// Track documents by ID for easier updates
/**
 * Wraps a LumoSearch index and maintains an id -> position map, so that
 * documents can be updated or deleted by id rather than by index position.
 */
class SearchManager {
  /**
   * @param {Array<Object>} docs - Initial documents; each is keyed by its `id`.
   * @param {Object} options - Options passed through to LumoSearch.
   */
  constructor(docs, options) {
    // Stored as `engine`, not `search`: an instance property named `search`
    // would shadow the search() method below and make it uncallable.
    this.engine = new LumoSearch(docs, options)
    this.idMap = new Map()
    docs.forEach((doc, index) => {
      this.idMap.set(doc.id, index)
    })
  }

  /**
   * Adds a document and records its position under `doc.id`.
   * @param {Object} doc - Document to add.
   */
  addDocument(doc) {
    this.engine.add(doc)
    // The position recorded is the last slot of the exported snapshot
    const newIndex = this.engine.exportSnapshot().docs.length - 1
    this.idMap.set(doc.id, newIndex)
  }

  /**
   * Replaces the document tracked under `id` with `newDoc`.
   * Does nothing when `id` is not tracked.
   * @param {*} id - Id of the document to replace.
   * @param {Object} newDoc - Replacement document.
   */
  updateDocument(id, newDoc) {
    const index = this.idMap.get(id)
    if (index === undefined) return

    this.engine.removeAt(index)
    this.engine.add(newDoc)
    // Positions shift after a removal, so every entry must be refreshed
    this.rebuildIdMap()
  }

  /**
   * Removes the document tracked under `id`.
   * Does nothing when `id` is not tracked.
   * @param {*} id - Id of the document to remove.
   */
  deleteDocument(id) {
    const index = this.idMap.get(id)
    if (index === undefined) return

    this.engine.removeAt(index)
    // The rebuild clears the map first, which also drops the deleted id
    this.rebuildIdMap()
  }

  /** Recomputes the id -> position map from the index's current snapshot. */
  rebuildIdMap() {
    const docs = this.engine.exportSnapshot().docs
    this.idMap.clear()
    docs.forEach((doc, index) => {
      this.idMap.set(doc.id, index)
    })
  }

  /**
   * Runs a query against the wrapped index.
   * @param {*} query - Query forwarded to LumoSearch's search().
   * @param {Object} [options] - Search options forwarded unchanged.
   * @returns {*} Whatever the wrapped index's search() returns.
   */
  search(query, options) {
    return this.engine.search(query, options)
  }
}

Document Requirements

  • Documents must be plain JavaScript objects
  • Fields specified in keys must exist
  • Field values should be strings or convertible to strings
  • Nested fields are supported with dot notation
  • Array fields are flattened and indexed as space-separated strings

Array Fields

// One document whose `tags` field holds an array of strings
const docs = [
  { title: 'JavaScript Tutorial', tags: ['javascript', 'programming', 'web'] }
]

// Index both the title and the tags, weighting the title higher
const keys = [
  { name: 'title', weight: 3 },
  { name: 'tags', weight: 2 }
]
const search = new LumoSearch(docs, { keys })

// Array is indexed as: "javascript programming web"
const results = search.search('programming')
// => Matches the document via tags array

Large Dataset Strategies

Lazy Loading

// Number of documents requested per fetch
const CHUNK_SIZE = 1000

// Load initial subset
const initialDocs = await fetchDocuments({ limit: CHUNK_SIZE })
const search = new LumoSearch(initialDocs, { keys: ['title'] })

// Offset of the first document that has not been fetched yet. It advances on
// every loadMore() call so repeated calls never re-add the same page.
let nextOffset = initialDocs.length

/**
 * Load more on demand: fetches the next chunk of documents and adds each one
 * to the `search` index.
 *
 * @returns {Promise<number>} How many documents were added; 0 means the
 *   fetch returned nothing further.
 */
async function loadMore() {
  const moreDocs = await fetchDocuments({ offset: nextOffset, limit: CHUNK_SIZE })
  // Advance by what was actually returned, so a short page is not skipped over
  nextOffset += moreDocs.length

  for (const doc of moreDocs) {
    search.add(doc)
  }
  return moreDocs.length
}

Pagination

// For very large datasets, fetch and index in chunks
/**
 * Fetches every document page by page, then builds a single LumoSearch
 * index over the `title` and `body` fields.
 *
 * @returns {Promise<LumoSearch>} The index built from all fetched documents.
 */
async function indexAllDocuments() {
  const PAGE_SIZE = 500
  // Appended to in place: re-spreading the whole array for every page would
  // copy all previously fetched documents again on each iteration.
  const allDocs = []

  for (let page = 0; ; page++) {
    const docs = await fetchDocuments({
      offset: page * PAGE_SIZE,
      limit: PAGE_SIZE
    })

    // An empty page means there is nothing left to fetch
    if (docs.length === 0) break

    for (const doc of docs) {
      allDocs.push(doc)
    }

    // Show progress (running total of documents collected so far)
    console.log(`Indexed ${allDocs.length} documents`)
  }

  return new LumoSearch(allDocs, {
    keys: ['title', 'body']
  })
}

Performance Tips

  • Use add() for single documents — it's fast
  • For bulk additions (>100 docs), use setCollection()
  • Avoid frequent rebuilds — batch your updates
  • Keep document objects lean — only include searchable fields
  • For huge datasets (>100k docs), consider web workers

Handling Updates

// WebSocket updates

/**
 * Looks up the document with the given id and removes it from the index.
 * The position is searched for on every call because index positions
 * change after removals.
 *
 * @param {*} id - Id used both as the query and as the `id` filter.
 * @returns {boolean} true if a matching document was found and removed.
 */
function removeDocumentById(id) {
  const matches = search.search(id, {
    filters: { id },
    limit: 1
  })

  if (matches.length === 0) {
    return false
  }
  search.removeAt(matches[0].refIndex)
  return true
}

socket.on('document:created', (doc) => {
  search.add(doc)
})

socket.on('document:updated', ({ id, data }) => {
  // Only re-add when the old copy was found; unknown ids are ignored
  if (removeDocumentById(id)) {
    search.add(data)
  }
})

socket.on('document:deleted', ({ id }) => {
  removeDocumentById(id)
})

Note: Index positions change after removals. Always look up a document's current position before removing it by ID, or use the SearchManager pattern above for easier ID-based management.

Related