Coverage for apps/recipes/services/search.py: 86%

215 statements  

coverage.py v7.13.4, created at 2026-02-14 19:13 +0000

1""" 

2Async multi-site recipe search service. 

3""" 

4 

5import asyncio 

6import logging 

7import re 

8from dataclasses import dataclass 

9from datetime import datetime 

10from typing import Optional 

11from urllib.parse import quote_plus, urljoin, urlparse 

12 

13from asgiref.sync import sync_to_async 

14from bs4 import BeautifulSoup 

15from curl_cffi.requests import AsyncSession 

16from django.utils import timezone 

17 

18from apps.recipes.services.fingerprint import ( 

19 BROWSER_PROFILES, 

20 get_random_delay, 

21) 

22 

23logger = logging.getLogger(__name__) 

24 

25 

26@dataclass 

27class SearchResult: 

28 """A single search result from a recipe site.""" 

29 

30 url: str 

31 title: str 

32 host: str 

33 image_url: str = "" 

34 description: str = "" 

35 rating_count: Optional[int] = None 
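    # Illustrative instance (field values are made up):
    #     SearchResult(url="https://example.com/recipe/123/garlic-chicken/",
    #                  title="Garlic Chicken", host="example.com", rating_count=1392)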


class RecipeSearch:
    """
    Async recipe search service that queries multiple sites concurrently.

    Uses curl_cffi with browser impersonation to fetch search pages,
    then parses results using BeautifulSoup with site-specific selectors.

    Browser profiles are centralized in fingerprint.py for maintainability.
    """

    MAX_CONCURRENT = 10
    DEFAULT_TIMEOUT = 30

    def __init__(self):
        self.timeout = self.DEFAULT_TIMEOUT

    async def search(
        self,
        query: str,
        sources: Optional[list[str]] = None,
        page: int = 1,
        per_page: int = 20,
    ) -> dict:
        """
        Search for recipes across multiple sites.

        Args:
            query: Search query string
            sources: Optional list of hosts to search (None = all enabled)
            page: Page number (1-indexed)
            per_page: Results per page

        Returns:
            dict with keys:
                - results: List of SearchResult dicts
                - total: Total result count
                - page: Current page
                - has_more: Whether more results exist
                - sites: Dict mapping host to result count
        """
        from apps.recipes.models import SearchSource

        # Get enabled sources
        get_sources = sync_to_async(lambda: list(SearchSource.objects.filter(is_enabled=True)))
        enabled_sources = await get_sources()
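        # list() inside the wrapper forces queryset evaluation in the sync
        # thread; returning the lazy queryset would defer DB access into async
        # code and raise SynchronousOnlyOperation.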

        # Filter by requested sources if specified
        if sources:
            enabled_sources = [s for s in enabled_sources if s.host in sources]

        if not enabled_sources:
            return {
                "results": [],
                "total": 0,
                "page": page,
                "has_more": False,
                "sites": {},
            }

        # Create semaphore for concurrency control
        semaphore = asyncio.Semaphore(self.MAX_CONCURRENT)
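        # At most MAX_CONCURRENT fetches run at once; the semaphore is acquired
        # inside _search_source, so surplus tasks simply wait their turn.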

        # Search all sources concurrently with primary browser profile
        # If all sources fail, we try fallback profiles
        primary_profile = BROWSER_PROFILES[0]

        async with AsyncSession(impersonate=primary_profile) as session:
            tasks = [self._search_source(session, semaphore, source, query) for source in enabled_sources]
            results_by_source = await asyncio.gather(*tasks, return_exceptions=True)
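            # results_by_source holds, per source, either a list[SearchResult]
            # or the exception that source raised; return_exceptions=True keeps
            # one failing site from cancelling the whole gather.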

        # Aggregate results
        all_results: list[SearchResult] = []
        site_counts: dict[str, int] = {}

        for source, result in zip(enabled_sources, results_by_source):
            if isinstance(result, Exception):
                logger.warning(f"Search failed for {source.host}: {result}")
                await self._record_failure(source)
                continue

            site_counts[source.host] = len(result)
            all_results.extend(result)
            await self._record_success(source)

        # Deduplicate by URL
        seen_urls = set()
        unique_results = []
        for r in all_results:
            if r.url not in seen_urls:
                seen_urls.add(r.url)
                unique_results.append(r)

        # Convert to dict format for ranking
        result_dicts = [
            {
                "url": r.url,
                "title": r.title,
                "host": r.host,
                "image_url": r.image_url,
                "description": r.description,
                "rating_count": r.rating_count,
            }
            for r in unique_results
        ]

        # Apply AI ranking (optional, skips if unavailable)
        result_dicts = await self._apply_ai_ranking(query, result_dicts)

        # Paginate
        total = len(result_dicts)
        start = (page - 1) * per_page
        end = start + per_page
        paginated = result_dicts[start:end]
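        # e.g. page=2, per_page=20 -> slice [20:40]; has_more is True while end < total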

        return {
            "results": paginated,
            "total": total,
            "page": page,
            "has_more": end < total,
            "sites": site_counts,
        }

    async def _apply_ai_ranking(self, query: str, results: list[dict]) -> list[dict]:
        """Apply AI ranking to search results (non-blocking).

        Skips ranking if AI is unavailable or if it fails.
        """
        try:
            from apps.ai.services.ranking import rank_results

            ranked = await sync_to_async(rank_results)(query, results)
            return ranked
        except Exception as e:
            logger.warning(f"AI ranking failed: {e}")
            return results

    async def _search_source(
        self,
        session: AsyncSession,
        semaphore: asyncio.Semaphore,
        source,
        query: str,
    ) -> list[SearchResult]:
        """
        Search a single source for recipes.

        Uses randomized delays to avoid bot detection patterns.
        """
        async with semaphore:
            # Add randomized delay to avoid predictable request patterns
            await asyncio.sleep(get_random_delay())
            # Build search URL
            search_url = source.search_url_template.replace("{query}", quote_plus(query))
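            # e.g. a template like "https://example.com/search?q={query}" (hypothetical)
            # with query "garlic chicken" becomes ".../search?q=garlic+chicken"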

            try:
                response = await asyncio.wait_for(
                    session.get(
                        search_url,
                        timeout=self.timeout,
                        allow_redirects=True,
                    ),
                    timeout=self.timeout + 5,  # Extra buffer for asyncio
                )

                if response.status_code != 200:
                    raise Exception(f"HTTP {response.status_code}")

                return self._parse_search_results(
                    response.text,
                    source.host,
                    source.result_selector,
                    search_url,
                )

            except asyncio.TimeoutError:
                raise Exception("Request timed out")

    def _parse_search_results(
        self,
        html: str,
        host: str,
        selector: str,
        base_url: str,
    ) -> list[SearchResult]:
        """
        Parse search results from HTML.

        Uses the site-specific CSS selector if available,
        otherwise falls back to common patterns.
        """
        soup = BeautifulSoup(html, "html.parser")
        results = []

        # Try site-specific selector first
        if selector:
            elements = soup.select(selector)
            if elements:
                for el in elements[:20]:  # Limit per site
                    result = self._extract_result_from_element(el, host, base_url)
                    if result:
                        results.append(result)
                return results

        # Fallback: Look for common recipe link patterns
        results = self._fallback_parse(soup, host, base_url)
        return results[:20]  # Limit per site

    def _find_link(self, element) -> Optional[tuple]:
        """Find recipe link in an HTML element.

        Returns:
            Tuple of (link_element, url) if found, None otherwise.
        """
        link = element.find("a", href=True)
        if not link:
            link = element if element.name == "a" and element.get("href") else None
        if not link:
            return None

        url = link.get("href", "")
        if not url:
            return None

        return link, url

    def _extract_title(self, element, link) -> str:
        """Extract title from element with multiple fallback strategies.

        Tries: heading elements, link text, title/aria-label attributes.
        """
        # find() only matches tag names, so the CSS class selectors go
        # through select_one() instead
        title_el = element.find(["h2", "h3", "h4"]) or element.select_one('.title, [class*="title"]')
        if title_el:
            title = title_el.get_text(strip=True)
            if title:
                return title

        title = link.get_text(strip=True)
        if title:
            return title

        return link.get("title", "") or link.get("aria-label", "")

    def _extract_rating(self, title: str) -> tuple[str, Optional[int]]:
        """Extract and strip rating count from title.

        Handles patterns like "Recipe Name1,392Ratings".

        Returns:
            Tuple of (cleaned_title, rating_count).
        """

        rating_match = re.search(r"([\d,]+)\s*[Rr]atings?\s*$", title)
        if not rating_match:
            return title, None

        rating_str = rating_match.group(1).replace(",", "")
        try:
            rating_count = int(rating_str)
            cleaned_title = title[: rating_match.start()].strip()
            return cleaned_title, rating_count
        except ValueError:
            return title, None

    def _extract_image(self, element, base_url: str) -> str:
        """Extract image URL with multiple fallback strategies.

        Tries: src, data-src, data-lazy-src attributes.
        """
        img = element.find("img")
        if not img:
            return ""

        image_url = img.get("src") or img.get("data-src") or img.get("data-lazy-src", "")
        if image_url:
            return urljoin(base_url, image_url)
        return ""

    def _extract_description(self, element) -> str:
        """Extract description from element."""
        # CSS class selectors don't work inside find(), so try the tag name
        # first and fall back to select_one()
        desc_el = element.find("p") or element.select_one('.description, [class*="description"]')
        if desc_el:
            return desc_el.get_text(strip=True)[:200]
        return ""

    def _extract_result_from_element(
        self,
        element,
        host: str,
        base_url: str,
    ) -> Optional[SearchResult]:
        """Extract search result data from an HTML element."""
        # Find and validate link
        link_result = self._find_link(element)
        if not link_result:
            return None
        link, url = link_result

        # Make URL absolute and validate
        url = urljoin(base_url, url)
        if not self._looks_like_recipe_url(url, host):
            return None

        # Extract title
        title = self._extract_title(element, link)
        if not title:
            return None

        # Extract and strip rating from title
        title, rating_count = self._extract_rating(title)

        # Title may have become empty after stripping rating (QA-053)
        if not title:
            return None

        return SearchResult(
            url=url,
            title=title[:200],
            host=host,
            image_url=self._extract_image(element, base_url),
            description=self._extract_description(element),
            rating_count=rating_count,
        )

    def _fallback_parse(
        self,
        soup: BeautifulSoup,
        host: str,
        base_url: str,
    ) -> list[SearchResult]:
        """
        Fallback parser for sites without a specific selector.

        Looks for common patterns in recipe search results.
        """
        results = []

        # Strategy 1: Look for article elements with links
        for article in soup.find_all("article")[:30]:
            result = self._extract_result_from_element(article, host, base_url)
            if result:
                results.append(result)

        if results:
            return results

        # Strategy 2: Look for card-like divs
        card_selectors = [
            '[class*="recipe-card"]',
            '[class*="card"]',
            '[class*="result"]',
            '[class*="item"]',
        ]
        for selector in card_selectors:
            for card in soup.select(selector)[:30]:
                result = self._extract_result_from_element(card, host, base_url)
                if result:
                    results.append(result)
            if results:
                return results

        # Strategy 3: Look for links that look like recipe URLs
        for link in soup.find_all("a", href=True)[:100]:
            url = urljoin(base_url, link.get("href", ""))
            if self._looks_like_recipe_url(url, host):
                title = link.get_text(strip=True)
                if title and len(title) > 5:
                    results.append(
                        SearchResult(
                            url=url,
                            title=title[:200],
                            host=host,
                        )
                    )

        return results

    def _looks_like_recipe_url(self, url: str, host: str) -> bool:
        """
        Check if a URL looks like a recipe detail page.
        """

        parsed = urlparse(url)

        # Must be from the expected host
        if host not in parsed.netloc:
            return False

        path = parsed.path.lower()

        # Common recipe URL patterns
        recipe_patterns = [
            r"/recipes?/",
            r"/dish/",
            r"/food/",
            r"/cooking/",
            r"/\d+/",  # Numeric ID in path
            r"-recipe/?$",  # URL ending with -recipe
            r"/a\d+/",  # Alphanumeric IDs like /a69912280/
            r"/food-cooking/",  # Pioneer Woman style
        ]

        # Exclude non-recipe paths
        exclude_patterns = [
            r"/search",
            r"/tag/",
            r"/category/",
            r"/author/",
            r"/profile/",
            r"/user/",
            r"/about",
            r"/contact",
            r"/privacy",
            r"/terms",
            r"/newsletter",
            r"/subscribe",
            # Article/blog paths (QA-053)
            r"/article/",
            r"/articles/",
            r"/blog/",
            r"/post/",
            r"/posts/",
            r"/news/",
            r"/story/",
            r"/stories/",
            r"/feature/",
            r"/features/",
            r"/guide/",
            r"/guides/",
            r"/review/",
            r"/reviews/",
            r"/roundup/",
            r"/list/",
            r"/listicle/",
            # Video paths (QA-053)
            r"/video/",
            r"/videos/",
            r"/watch(?:/|$)",  # query strings never appear in parsed.path
            r"/embed/",
            r"/player/",
            r"/clip/",
            r"/clips/",
            r"/episode/",
            r"/episodes/",
            r"/series/",
            r"/show/",
            r"/shows/",
            r"/gallery/",
            r"/galleries/",
            r"/slideshow/",
            r"/photo-gallery/",
            # Index/listing pages (QA-053)
            r"/seasons?(?:/|$)",
            r"/cuisines?(?:/|$)",
            r"/ingredients?(?:/|$)",
            r"/collections?(?:/|$)",
            r"/occasions?(?:/|$)",
            r"/courses?(?:/|$)",
            r"/diets?(?:/|$)",
            r"/techniques?(?:/|$)",
            r"/chefs?(?:/|$)",
            r"/dishes(?:/|$)",
            r"/menus?(?:/|$)",
            r"/meal-plans?(?:/|$)",
        ]

        for pattern in exclude_patterns:
            if re.search(pattern, path):
                return False

        # Site-specific requirements (QA-058):
        # AllRecipes has article pages at the root that look like recipes but
        # aren't; real recipes always live under the /recipe/ path
        if "allrecipes.com" in host and "/recipe/" not in path:
            return False

        # Check for recipe patterns
        for pattern in recipe_patterns:
            if re.search(pattern, path):
                return True

        # Heuristic: URL path has enough segments and isn't too short
        segments = [s for s in path.split("/") if s]
        if len(segments) >= 2 and len(path) > 20:
            return True

        # Also accept single-segment slug-style URLs (common for food blogs),
        # e.g. /30-cloves-garlic-chicken/
        if len(segments) == 1 and len(path) > 15 and path.count("-") >= 2:
            return True

        return False

    async def _record_failure(self, source) -> None:
        """Record a search failure for maintenance tracking."""
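        # Django ORM writes are synchronous; sync_to_async runs the save in a
        # worker thread so the event loop is never blocked.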

        @sync_to_async
        def update():
            source.consecutive_failures += 1
            if source.consecutive_failures >= 3:
                source.needs_attention = True
            source.save(update_fields=["consecutive_failures", "needs_attention"])

        await update()

    async def _record_success(self, source) -> None:
        """Record a successful search."""

        @sync_to_async
        def update():
            source.consecutive_failures = 0
            source.needs_attention = False
            source.last_validated_at = timezone.now()
            source.save(
                update_fields=[
                    "consecutive_failures",
                    "needs_attention",
                    "last_validated_at",
                ]
            )

        await update()
