Files
assistant-storefront/spec/enterprise/jobs/captain/documents/crawl_job_spec.rb

134 lines
4.7 KiB
Ruby
Raw Normal View History

require 'rails_helper'
RSpec.describe Captain::Documents::CrawlJob, type: :job do
let(:document) { create(:captain_document, external_link: 'https://example.com/page') }
let(:assistant_id) { document.assistant_id }
let(:webhook_url) { Rails.application.routes.url_helpers.enterprise_webhooks_firecrawl_url }
describe '#perform' do
context 'when CAPTAIN_FIRECRAWL_API_KEY is configured' do
let(:firecrawl_service) { instance_double(Captain::Tools::FirecrawlService) }
let(:account) { document.account }
let(:token) { Digest::SHA256.hexdigest("-key#{document.assistant_id}#{document.account_id}") }
before do
allow(Captain::Tools::FirecrawlService).to receive(:new).and_return(firecrawl_service)
allow(firecrawl_service).to receive(:perform)
create(:installation_config, name: 'CAPTAIN_FIRECRAWL_API_KEY', value: 'test-key')
end
context 'with account usage limits' do
before do
allow(account).to receive(:usage_limits).and_return({ captain: { documents: { current_available: 20 } } })
end
it 'uses FirecrawlService with the correct crawl limit' do
expect(firecrawl_service).to receive(:perform).with(
document.external_link,
"#{webhook_url}?assistant_id=#{assistant_id}&token=#{token}",
20
)
described_class.perform_now(document)
end
end
context 'when crawl limit exceeds maximum' do
before do
allow(account).to receive(:usage_limits).and_return({ captain: { documents: { current_available: 1000 } } })
end
it 'caps the crawl limit at 500' do
expect(firecrawl_service).to receive(:perform).with(
document.external_link,
"#{webhook_url}?assistant_id=#{assistant_id}&token=#{token}",
500
)
described_class.perform_now(document)
end
end
context 'with no usage limits configured' do
before do
allow(account).to receive(:usage_limits).and_return({})
end
it 'uses default crawl limit of 10' do
expect(firecrawl_service).to receive(:perform).with(
document.external_link,
"#{webhook_url}?assistant_id=#{assistant_id}&token=#{token}",
10
)
described_class.perform_now(document)
end
end
end
context 'when CAPTAIN_FIRECRAWL_API_KEY is not configured' do
let(:page_links) { ['https://example.com/page1', 'https://example.com/page2'] }
let(:simple_crawler) { instance_double(Captain::Tools::SimplePageCrawlService) }
before do
allow(Captain::Tools::SimplePageCrawlService)
.to receive(:new)
.with(document.external_link)
.and_return(simple_crawler)
allow(simple_crawler).to receive(:page_links).and_return(page_links)
end
it 'enqueues SimplePageCrawlParserJob for each discovered link' do
page_links.each do |link|
expect(Captain::Tools::SimplePageCrawlParserJob)
.to receive(:perform_later)
.with(
assistant_id: assistant_id,
page_link: link
)
end
# Should also crawl the original link
expect(Captain::Tools::SimplePageCrawlParserJob)
.to receive(:perform_later)
.with(
assistant_id: assistant_id,
page_link: document.external_link
)
described_class.perform_now(document)
end
it 'uses SimplePageCrawlService to discover page links' do
expect(simple_crawler).to receive(:page_links)
described_class.perform_now(document)
end
end
context 'when document is a PDF' do
let(:pdf_document) do
doc = create(:captain_document, external_link: 'https://example.com/document')
allow(doc).to receive(:pdf_document?).and_return(true)
allow(doc).to receive(:update!).and_return(true)
doc
end
it 'processes PDF using PdfProcessingService' do
pdf_service = instance_double(Captain::Llm::PdfProcessingService)
expect(Captain::Llm::PdfProcessingService).to receive(:new).with(pdf_document).and_return(pdf_service)
expect(pdf_service).to receive(:process)
expect(pdf_document).to receive(:update!).with(status: :available)
described_class.perform_now(pdf_document)
end
it 'handles PDF processing errors' do
allow(Captain::Llm::PdfProcessingService).to receive(:new).and_raise(StandardError, 'Processing failed')
expect { described_class.perform_now(pdf_document) }.to raise_error(StandardError, 'Processing failed')
end
end
end
end