Coverage for apps / recipes / services / search.py: 86%
215 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-02-14 19:13 +0000
1"""
2Async multi-site recipe search service.
3"""
5import asyncio
6import logging
7import re
8from dataclasses import dataclass
9from datetime import datetime
10from typing import Optional
11from urllib.parse import quote_plus, urljoin, urlparse
13from asgiref.sync import sync_to_async
14from bs4 import BeautifulSoup
15from curl_cffi.requests import AsyncSession
16from django.utils import timezone
18from apps.recipes.services.fingerprint import (
19 BROWSER_PROFILES,
20 get_random_delay,
21)
23logger = logging.getLogger(__name__)
@dataclass
class SearchResult:
    """A single search result from a recipe site."""

    # Absolute URL of the recipe detail page (deduplication key in search()).
    url: str
    # Recipe title; the parser truncates this to 200 characters.
    title: str
    # Host of the source site the result came from, e.g. "allrecipes.com".
    host: str
    # Absolute thumbnail URL, or "" when no image was found.
    image_url: str = ""
    # Short plain-text description (at most 200 characters), or "".
    description: str = ""
    # Rating count stripped out of titles like "Name1,392Ratings", if present.
    rating_count: Optional[int] = None
class RecipeSearch:
    """
    Async recipe search service that queries multiple sites concurrently.

    Uses curl_cffi with browser impersonation to fetch search pages,
    then parses results using BeautifulSoup with site-specific selectors.

    Browser profiles are centralized in fingerprint.py for maintainability.
    """

    MAX_CONCURRENT = 10  # max simultaneous site fetches
    DEFAULT_TIMEOUT = 30  # per-request timeout, in seconds

    def __init__(self):
        self.timeout = self.DEFAULT_TIMEOUT

    async def search(
        self,
        query: str,
        sources: Optional[list[str]] = None,
        page: int = 1,
        per_page: int = 20,
    ) -> dict:
        """
        Search for recipes across multiple sites.

        Args:
            query: Search query string
            sources: Optional list of hosts to search (None = all enabled)
            page: Page number (1-indexed)
            per_page: Results per page

        Returns:
            dict with keys:
                - results: List of SearchResult dicts
                - total: Total result count
                - page: Current page
                - has_more: Whether more results exist
                - sites: Dict mapping host to result count
        """
        from apps.recipes.models import SearchSource

        # Get enabled sources (ORM access wrapped for the async context)
        get_sources = sync_to_async(lambda: list(SearchSource.objects.filter(is_enabled=True)))
        enabled_sources = await get_sources()

        # Filter by requested sources if specified
        if sources:
            enabled_sources = [s for s in enabled_sources if s.host in sources]

        if not enabled_sources:
            return {
                "results": [],
                "total": 0,
                "page": page,
                "has_more": False,
                "sites": {},
            }

        # Semaphore caps how many sites are being fetched at once
        semaphore = asyncio.Semaphore(self.MAX_CONCURRENT)

        # Search all sources concurrently with the primary browser profile
        primary_profile = BROWSER_PROFILES[0]

        async with AsyncSession(impersonate=primary_profile) as session:
            tasks = [self._search_source(session, semaphore, source, query) for source in enabled_sources]
            # return_exceptions=True so one failing site cannot sink the batch
            results_by_source = await asyncio.gather(*tasks, return_exceptions=True)

        # Aggregate results, recording per-source health for maintenance
        all_results: list[SearchResult] = []
        site_counts: dict[str, int] = {}

        for source, result in zip(enabled_sources, results_by_source):
            if isinstance(result, Exception):
                logger.warning(f"Search failed for {source.host}: {result}")
                await self._record_failure(source)
                continue

            site_counts[source.host] = len(result)
            all_results.extend(result)
            await self._record_success(source)

        # Deduplicate by URL; first occurrence wins
        seen_urls = set()
        unique_results = []
        for r in all_results:
            if r.url not in seen_urls:
                seen_urls.add(r.url)
                unique_results.append(r)

        # Convert to dict format for ranking
        result_dicts = [
            {
                "url": r.url,
                "title": r.title,
                "host": r.host,
                "image_url": r.image_url,
                "description": r.description,
                "rating_count": r.rating_count,
            }
            for r in unique_results
        ]

        # Apply AI ranking (optional, skips if unavailable)
        result_dicts = await self._apply_ai_ranking(query, result_dicts)

        # Paginate
        total = len(result_dicts)
        start = (page - 1) * per_page
        end = start + per_page
        paginated = result_dicts[start:end]

        return {
            "results": paginated,
            "total": total,
            "page": page,
            "has_more": end < total,
            "sites": site_counts,
        }

    async def _apply_ai_ranking(self, query: str, results: list[dict]) -> list[dict]:
        """Apply AI ranking to search results (non-blocking).

        Skips ranking if AI is unavailable or if it fails; the original
        ordering is returned unchanged in that case.
        """
        try:
            from apps.ai.services.ranking import rank_results

            ranked = await sync_to_async(rank_results)(query, results)
            return ranked
        except Exception as e:
            logger.warning(f"AI ranking failed: {e}")
            return results

    async def _search_source(
        self,
        session: AsyncSession,
        semaphore: asyncio.Semaphore,
        source,
        query: str,
    ) -> list[SearchResult]:
        """
        Search a single source for recipes.

        Uses randomized delays to avoid bot detection patterns.

        Raises:
            Exception: on non-200 responses or timeouts; search() catches
                these via gather(return_exceptions=True).
        """
        async with semaphore:
            # Add randomized delay to avoid predictable request patterns
            await asyncio.sleep(get_random_delay())

            # Build search URL from the source's template
            search_url = source.search_url_template.replace("{query}", quote_plus(query))

            try:
                response = await asyncio.wait_for(
                    session.get(
                        search_url,
                        timeout=self.timeout,
                        allow_redirects=True,
                    ),
                    timeout=self.timeout + 5,  # Extra buffer for asyncio
                )

                if response.status_code != 200:
                    raise Exception(f"HTTP {response.status_code}")

                return self._parse_search_results(
                    response.text,
                    source.host,
                    source.result_selector,
                    search_url,
                )

            except asyncio.TimeoutError:
                raise Exception("Request timed out")

    def _parse_search_results(
        self,
        html: str,
        host: str,
        selector: str,
        base_url: str,
    ) -> list[SearchResult]:
        """
        Parse search results from HTML.

        Uses the site-specific CSS selector if available,
        otherwise falls back to common patterns.
        """
        soup = BeautifulSoup(html, "html.parser")
        results = []

        # Try site-specific selector first
        if selector:
            elements = soup.select(selector)
            if elements:
                for el in elements[:20]:  # Limit per site
                    result = self._extract_result_from_element(el, host, base_url)
                    if result:
                        results.append(result)
                return results

        # Fallback: Look for common recipe link patterns
        results = self._fallback_parse(soup, host, base_url)
        return results[:20]  # Limit per site

    def _find_link(self, element) -> Optional[tuple]:
        """Find recipe link in an HTML element.

        Returns:
            Tuple of (link_element, url) if found, None otherwise.
        """
        link = element.find("a", href=True)
        if not link:
            # The element itself may be the anchor
            link = element if element.name == "a" and element.get("href") else None
        if not link:
            return None

        url = link.get("href", "")
        if not url:
            return None

        return link, url

    def _extract_title(self, element, link) -> str:
        """Extract title from element with multiple fallback strategies.

        Tries: heading/title-classed elements, link text,
        then title/aria-label attributes.
        """
        # NOTE: BeautifulSoup's find() only matches tag *names*, so CSS
        # selectors like '.title' / '[class*="title"]' were silently dead
        # when passed in a find() list; select_one() honors them.
        title_el = element.select_one('h2, h3, h4, .title, [class*="title"]')
        if title_el:
            title = title_el.get_text(strip=True)
            if title:
                return title

        title = link.get_text(strip=True)
        if title:
            return title

        return link.get("title", "") or link.get("aria-label", "")

    def _extract_rating(self, title: str) -> tuple[str, Optional[int]]:
        """Extract and strip rating count from title.

        Handles patterns like "Recipe Name1,392Ratings".

        Returns:
            Tuple of (cleaned_title, rating_count).
        """
        rating_match = re.search(r"([\d,]+)\s*[Rr]atings?\s*$", title)
        if not rating_match:
            return title, None

        rating_str = rating_match.group(1).replace(",", "")
        try:
            rating_count = int(rating_str)
            cleaned_title = title[: rating_match.start()].strip()
            return cleaned_title, rating_count
        except ValueError:
            return title, None

    def _extract_image(self, element, base_url: str) -> str:
        """Extract image URL with multiple fallback strategies.

        Tries: src, data-src, data-lazy-src attributes.
        """
        img = element.find("img")
        if not img:
            return ""

        image_url = img.get("src") or img.get("data-src") or img.get("data-lazy-src", "")
        if image_url:
            return urljoin(base_url, image_url)
        return ""

    def _extract_description(self, element) -> str:
        """Extract a description (truncated to 200 chars) from element."""
        # select_one() so the class-based selectors actually apply; find()
        # would treat '.description' as a (nonexistent) tag name.
        desc_el = element.select_one('p, .description, [class*="description"]')
        if desc_el:
            return desc_el.get_text(strip=True)[:200]
        return ""

    def _extract_result_from_element(
        self,
        element,
        host: str,
        base_url: str,
    ) -> Optional[SearchResult]:
        """Extract search result data from an HTML element."""
        # Find and validate link
        link_result = self._find_link(element)
        if not link_result:
            return None
        link, url = link_result

        # Make URL absolute and validate
        url = urljoin(base_url, url)
        if not self._looks_like_recipe_url(url, host):
            return None

        # Extract title
        title = self._extract_title(element, link)
        if not title:
            return None

        # Extract and strip rating from title
        title, rating_count = self._extract_rating(title)

        # Title may have become empty after stripping rating (QA-053)
        if not title:
            return None

        return SearchResult(
            url=url,
            title=title[:200],
            host=host,
            image_url=self._extract_image(element, base_url),
            description=self._extract_description(element),
            rating_count=rating_count,
        )

    def _fallback_parse(
        self,
        soup: BeautifulSoup,
        host: str,
        base_url: str,
    ) -> list[SearchResult]:
        """
        Fallback parser for sites without a specific selector.

        Looks for common patterns in recipe search results.
        """
        results = []

        # Strategy 1: Look for article elements with links
        for article in soup.find_all("article")[:30]:
            result = self._extract_result_from_element(article, host, base_url)
            if result:
                results.append(result)

        if results:
            return results

        # Strategy 2: Look for card-like divs
        card_selectors = [
            '[class*="recipe-card"]',
            '[class*="card"]',
            '[class*="result"]',
            '[class*="item"]',
        ]
        for selector in card_selectors:
            for card in soup.select(selector)[:30]:
                result = self._extract_result_from_element(card, host, base_url)
                if result:
                    results.append(result)
            if results:
                return results

        # Strategy 3: Look for links that look like recipe URLs
        for link in soup.find_all("a", href=True)[:100]:
            url = urljoin(base_url, link.get("href", ""))
            if self._looks_like_recipe_url(url, host):
                title = link.get_text(strip=True)
                if title and len(title) > 5:
                    results.append(
                        SearchResult(
                            url=url,
                            title=title[:200],
                            host=host,
                        )
                    )

        return results

    def _looks_like_recipe_url(self, url: str, host: str) -> bool:
        """
        Check if a URL looks like a recipe detail page.

        A URL qualifies when it belongs to the expected host, avoids the
        known non-recipe paths, and either matches a recipe URL pattern or
        passes the slug-length heuristics at the end.
        """
        parsed = urlparse(url)

        # Must be from the expected host: exact match or a subdomain.
        # A plain substring test would also accept spoofed hosts such as
        # "allrecipes.com.evil.example" or "notallrecipes.com".
        netloc = parsed.netloc.lower()
        expected_host = host.lower()
        if netloc != expected_host and not netloc.endswith("." + expected_host):
            return False

        path = parsed.path.lower()

        # Common recipe URL patterns
        recipe_patterns = [
            r"/recipe[s]?/",
            r"/dish/",
            r"/food/",
            r"/cooking/",
            r"/\d+/",  # Numeric ID in path
            r"-recipe/?$",  # URL ending with -recipe
            r"/a\d+/",  # Alphanumeric IDs like /a69912280/
            r"/food-cooking/",  # Pioneer Woman style
        ]

        # Exclude non-recipe paths
        exclude_patterns = [
            r"/search",
            r"/tag/",
            r"/category/",
            r"/author/",
            r"/profile/",
            r"/user/",
            r"/about",
            r"/contact",
            r"/privacy",
            r"/terms",
            r"/newsletter",
            r"/subscribe",
            # Article/blog paths (QA-053)
            r"/article/",
            r"/articles/",
            r"/blog/",
            r"/post/",
            r"/posts/",
            r"/news/",
            r"/story/",
            r"/stories/",
            r"/feature/",
            r"/features/",
            r"/guide/",
            r"/guides/",
            r"/review/",
            r"/reviews/",
            r"/roundup/",
            r"/list/",
            r"/listicle/",
            # Video paths (QA-053)
            r"/video/",
            r"/videos/",
            r"/watch/",
            r"/watch\?",
            r"/embed/",
            r"/player/",
            r"/clip/",
            r"/clips/",
            r"/episode/",
            r"/episodes/",
            r"/series/",
            r"/show/",
            r"/shows/",
            r"/gallery/",
            r"/galleries/",
            r"/slideshow/",
            r"/photo-gallery/",
            # Index/listing pages (QA-053)
            r"/seasons?(?:/|$)",
            r"/cuisines?(?:/|$)",
            r"/ingredients?(?:/|$)",
            r"/collections?(?:/|$)",
            r"/occasions?(?:/|$)",
            r"/courses?(?:/|$)",
            r"/diets?(?:/|$)",
            r"/techniques?(?:/|$)",
            r"/chefs?(?:/|$)",
            r"/dishes(?:/|$)",
            r"/menus?(?:/|$)",
            r"/meal-plans?(?:/|$)",
        ]

        # urlparse() strips the query string from .path, so a pattern like
        # r"/watch\?" could never fire against the bare path. When a query
        # string exists, also test "<path>?" so such patterns work.
        exclusion_targets = [path]
        if parsed.query:
            exclusion_targets.append(path + "?")

        for pattern in exclude_patterns:
            if any(re.search(pattern, target) for target in exclusion_targets):
                return False

        # Site-specific requirements (QA-058)
        # AllRecipes has article pages at root that look like recipes but aren't
        # Real recipes are always under /recipe/ path
        if "allrecipes.com" in host and "/recipe/" not in path:
            return False

        # Check for recipe patterns
        for pattern in recipe_patterns:
            if re.search(pattern, path):
                return True

        # Heuristic: URL path has enough segments and isn't too short
        segments = [s for s in path.split("/") if s]
        if len(segments) >= 2 and len(path) > 20:
            return True

        # Also accept single-segment slug-style URLs (common for food blogs)
        # e.g., /30-cloves-garlic-chicken/
        if len(segments) == 1 and len(path) > 15 and path.count("-") >= 2:
            return True

        return False

    async def _record_failure(self, source) -> None:
        """Record a search failure for maintenance tracking.

        Three consecutive failures flag the source for human attention.
        """

        @sync_to_async
        def update():
            source.consecutive_failures += 1
            if source.consecutive_failures >= 3:
                source.needs_attention = True
            source.save(update_fields=["consecutive_failures", "needs_attention"])

        await update()

    async def _record_success(self, source) -> None:
        """Record a successful search, clearing any failure flags."""

        @sync_to_async
        def update():
            source.consecutive_failures = 0
            source.needs_attention = False
            source.last_validated_at = timezone.now()
            source.save(
                update_fields=[
                    "consecutive_failures",
                    "needs_attention",
                    "last_validated_at",
                ]
            )

        await update()