Coverage for apps / recipes / services / search.py: 86%

200 statements  

coverage.py v7.13.1, created at 2026-01-11 00:40 +0000

"""
Async multi-site recipe search service.
"""

import asyncio
import logging
import re
from dataclasses import dataclass
from datetime import datetime
from typing import Optional
from urllib.parse import quote_plus, urljoin, urlparse

from asgiref.sync import sync_to_async
from bs4 import BeautifulSoup
from curl_cffi.requests import AsyncSession
from django.utils import timezone

from apps.recipes.services.fingerprint import (
    BROWSER_PROFILES,
    get_random_delay,
)

logger = logging.getLogger(__name__)


@dataclass
class SearchResult:
    """A single search result from a recipe site."""
    url: str
    title: str
    host: str
    image_url: str = ''
    description: str = ''
    rating_count: Optional[int] = None


class RecipeSearch:
    """
    Async recipe search service that queries multiple sites concurrently.

    Uses curl_cffi with browser impersonation to fetch search pages,
    then parses results using BeautifulSoup with site-specific selectors.

    Browser profiles are centralized in fingerprint.py for maintainability.
    """

    MAX_CONCURRENT = 10
    DEFAULT_TIMEOUT = 30

    def __init__(self):
        self.timeout = self.DEFAULT_TIMEOUT

    async def search(
        self,
        query: str,
        sources: Optional[list[str]] = None,
        page: int = 1,
        per_page: int = 20,
    ) -> dict:
        """
        Search for recipes across multiple sites.

        Args:
            query: Search query string
            sources: Optional list of hosts to search (None = all enabled)
            page: Page number (1-indexed)
            per_page: Results per page

        Returns:
            dict with keys:
                - results: List of SearchResult dicts
                - total: Total result count
                - page: Current page
                - has_more: Whether more results exist
                - sites: Dict mapping host to result count
        """
        from apps.recipes.models import SearchSource

        # Get enabled sources
        get_sources = sync_to_async(lambda: list(
            SearchSource.objects.filter(is_enabled=True)
        ))
        enabled_sources = await get_sources()

        # Filter by requested sources if specified
        if sources:
            enabled_sources = [s for s in enabled_sources if s.host in sources]

        if not enabled_sources:
            return {
                'results': [],
                'total': 0,
                'page': page,
                'has_more': False,
                'sites': {},
            }

        # Create semaphore for concurrency control
        semaphore = asyncio.Semaphore(self.MAX_CONCURRENT)

        # Search all sources concurrently with the primary browser profile.
        # If all sources fail, we try fallback profiles.
        primary_profile = BROWSER_PROFILES[0]

        async with AsyncSession(impersonate=primary_profile) as session:
            tasks = [
                self._search_source(session, semaphore, source, query)
                for source in enabled_sources
            ]
            results_by_source = await asyncio.gather(*tasks, return_exceptions=True)

        # Aggregate results
        all_results: list[SearchResult] = []
        site_counts: dict[str, int] = {}

        for source, result in zip(enabled_sources, results_by_source):
            if isinstance(result, Exception):
                logger.warning(f"Search failed for {source.host}: {result}")
                await self._record_failure(source)
                continue

            site_counts[source.host] = len(result)
            all_results.extend(result)
            await self._record_success(source)

        # Deduplicate by URL
        seen_urls = set()
        unique_results = []
        for r in all_results:
            if r.url not in seen_urls:
                seen_urls.add(r.url)
                unique_results.append(r)

        # Convert to dict format for ranking
        result_dicts = [
            {
                'url': r.url,
                'title': r.title,
                'host': r.host,
                'image_url': r.image_url,
                'description': r.description,
                'rating_count': r.rating_count,
            }
            for r in unique_results
        ]

        # Apply AI ranking (optional, skips if unavailable)
        result_dicts = await self._apply_ai_ranking(query, result_dicts)

        # Paginate
        total = len(result_dicts)
        start = (page - 1) * per_page
        end = start + per_page
        paginated = result_dicts[start:end]

        return {
            'results': paginated,
            'total': total,
            'page': page,
            'has_more': end < total,
            'sites': site_counts,
        }
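
    # Illustrative shape of the value returned by search() (hypothetical data,
    # not real results):
    #     {
    #         'results': [{'url': 'https://example.com/recipe/1/',
    #                      'title': 'Garlic Butter Chicken',
    #                      'host': 'example.com', 'image_url': '',
    #                      'description': '', 'rating_count': 1392}],
    #         'total': 37,
    #         'page': 1,
    #         'has_more': True,
    #         'sites': {'example.com': 18, 'another-blog.com': 19},
    #     }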

    async def _apply_ai_ranking(self, query: str, results: list[dict]) -> list[dict]:
        """Apply AI ranking to search results (non-blocking).

        Skips ranking if AI is unavailable or if it fails.
        """
        try:
            from apps.ai.services.ranking import rank_results
            ranked = await sync_to_async(rank_results)(query, results)
            return ranked
        except Exception as e:
            logger.warning(f'AI ranking failed: {e}')
            return results

    async def _search_source(
        self,
        session: AsyncSession,
        semaphore: asyncio.Semaphore,
        source,
        query: str,
    ) -> list[SearchResult]:
        """
        Search a single source for recipes.

        Uses randomized delays to avoid bot detection patterns.
        """
        async with semaphore:
            # Add randomized delay to avoid predictable request patterns
            await asyncio.sleep(get_random_delay())
            # Build search URL
            search_url = source.search_url_template.replace(
                '{query}',
                quote_plus(query),
            )
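
            # Illustrative only (hypothetical template, not a real SearchSource
            # row): a template of
            #   'https://www.example.com/search?q={query}'
            # with the query 'thai basil chicken' becomes
            #   'https://www.example.com/search?q=thai+basil+chicken',
            # since quote_plus() encodes spaces as '+'.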

            try:
                response = await asyncio.wait_for(
                    session.get(
                        search_url,
                        timeout=self.timeout,
                        allow_redirects=True,
                    ),
                    timeout=self.timeout + 5,  # Extra buffer for asyncio
                )

                if response.status_code != 200:
                    raise Exception(f"HTTP {response.status_code}")

                return self._parse_search_results(
                    response.text,
                    source.host,
                    source.result_selector,
                    search_url,
                )

            except asyncio.TimeoutError:
                raise Exception("Request timed out")

    def _parse_search_results(
        self,
        html: str,
        host: str,
        selector: str,
        base_url: str,
    ) -> list[SearchResult]:
        """
        Parse search results from HTML.

        Uses the site-specific CSS selector if available,
        otherwise falls back to common patterns.
        """
        soup = BeautifulSoup(html, 'html.parser')
        results = []

        # Try site-specific selector first
        if selector:
            elements = soup.select(selector)
            if elements:
                for el in elements[:20]:  # Limit per site
                    result = self._extract_result_from_element(el, host, base_url)
                    if result:
                        results.append(result)
                return results

        # Fallback: look for common recipe link patterns
        results = self._fallback_parse(soup, host, base_url)
        return results[:20]  # Limit per site

    def _extract_result_from_element(
        self,
        element,
        host: str,
        base_url: str,
    ) -> Optional[SearchResult]:
        """
        Extract search result data from an HTML element.
        """
        # Find the link
        link = element.find('a', href=True)
        if not link:
            link = element if element.name == 'a' and element.get('href') else None
        if not link:
            return None

        url = link.get('href', '')
        if not url:
            return None

        # Make URL absolute
        url = urljoin(base_url, url)

        # Skip non-recipe URLs
        if not self._looks_like_recipe_url(url, host):
            return None

        # Extract title. Class-based patterns like '.title' are CSS selectors,
        # so select_one() is used; find() only matches tag names.
        title = ''
        title_el = element.select_one('h2, h3, h4, .title, [class*="title"]')
        if title_el:
            title = title_el.get_text(strip=True)
        if not title:
            title = link.get_text(strip=True)
        if not title:
            title = link.get('title', '') or link.get('aria-label', '')

        if not title:
            return None

        # Extract and strip rating count from title (e.g., "Recipe Name1,392Ratings")
        rating_count = None
        rating_match = re.search(r'([\d,]+)\s*[Rr]atings?\s*$', title)
        if rating_match:
            # Extract the number and remove commas
            rating_str = rating_match.group(1).replace(',', '')
            try:
                rating_count = int(rating_str)
                # Remove rating text from title
                title = title[:rating_match.start()].strip()
            except ValueError:
                pass
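
        # Illustrative only: with a hypothetical scraped title such as
        # "Garlic Butter Chicken1,392Ratings", the pattern above yields
        # rating_count=1392 and trims the title to "Garlic Butter Chicken";
        # a title that is nothing but "1,392Ratings" is trimmed to an empty
        # string and rejected by the check below.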

        # Title may have become empty after stripping rating (QA-053)
        if not title:
            return None

        # Extract image
        image_url = ''
        img = element.find('img')
        if img:
            image_url = img.get('src') or img.get('data-src') or img.get('data-lazy-src', '')
            if image_url:
                image_url = urljoin(base_url, image_url)

        # Extract description (select_one for the same reason as the title lookup)
        description = ''
        desc_el = element.select_one('p, .description, [class*="description"]')
        if desc_el:
            description = desc_el.get_text(strip=True)[:200]

        return SearchResult(
            url=url,
            title=title[:200],
            host=host,
            image_url=image_url,
            description=description,
            rating_count=rating_count,
        )

    def _fallback_parse(
        self,
        soup: BeautifulSoup,
        host: str,
        base_url: str,
    ) -> list[SearchResult]:
        """
        Fallback parser for sites without a specific selector.

        Looks for common patterns in recipe search results.
        """
        results = []

        # Strategy 1: Look for article elements with links
        for article in soup.find_all('article')[:30]:
            result = self._extract_result_from_element(article, host, base_url)
            if result:
                results.append(result)

        if results:
            return results

        # Strategy 2: Look for card-like divs
        card_selectors = [
            '[class*="recipe-card"]',
            '[class*="card"]',
            '[class*="result"]',
            '[class*="item"]',
        ]
        for selector in card_selectors:
            for card in soup.select(selector)[:30]:
                result = self._extract_result_from_element(card, host, base_url)
                if result:
                    results.append(result)
            if results:
                return results

        # Strategy 3: Look for links that look like recipe URLs
        for link in soup.find_all('a', href=True)[:100]:
            url = urljoin(base_url, link.get('href', ''))
            if self._looks_like_recipe_url(url, host):
                title = link.get_text(strip=True)
                if title and len(title) > 5:
                    results.append(SearchResult(
                        url=url,
                        title=title[:200],
                        host=host,
                    ))

        return results

    def _looks_like_recipe_url(self, url: str, host: str) -> bool:
        """
        Check if a URL looks like a recipe detail page.
        """
        parsed = urlparse(url)

        # Must be from the expected host
        if host not in parsed.netloc:
            return False

        path = parsed.path.lower()

        # Common recipe URL patterns
        recipe_patterns = [
            r'/recipe[s]?/',
            r'/dish/',
            r'/food/',
            r'/cooking/',
            r'/\d+/',           # Numeric ID in path
            r'-recipe/?$',      # URL ending with -recipe
            r'/a\d+/',          # Alphanumeric IDs like /a69912280/
            r'/food-cooking/',  # Pioneer Woman style
        ]

        # Exclude non-recipe paths
        exclude_patterns = [
            r'/search',
            r'/tag/',
            r'/category/',
            r'/author/',
            r'/profile/',
            r'/user/',
            r'/about',
            r'/contact',
            r'/privacy',
            r'/terms',
            r'/newsletter',
            r'/subscribe',
            # Article/blog paths (QA-053)
            r'/article/',
            r'/articles/',
            r'/blog/',
            r'/post/',
            r'/posts/',
            r'/news/',
            r'/story/',
            r'/stories/',
            r'/feature/',
            r'/features/',
            r'/guide/',
            r'/guides/',
            r'/review/',
            r'/reviews/',
            r'/roundup/',
            r'/list/',
            r'/listicle/',
            # Video paths (QA-053)
            r'/video/',
            r'/videos/',
            r'/watch/',
            r'/watch\?',
            r'/embed/',
            r'/player/',
            r'/clip/',
            r'/clips/',
            r'/episode/',
            r'/episodes/',
            r'/series/',
            r'/show/',
            r'/shows/',
            r'/gallery/',
            r'/galleries/',
            r'/slideshow/',
            r'/photo-gallery/',
            # Index/listing pages (QA-053)
            r'/seasons?(?:/|$)',
            r'/cuisines?(?:/|$)',
            r'/ingredients?(?:/|$)',
            r'/collections?(?:/|$)',
            r'/occasions?(?:/|$)',
            r'/courses?(?:/|$)',
            r'/diets?(?:/|$)',
            r'/techniques?(?:/|$)',
            r'/chefs?(?:/|$)',
            r'/dishes(?:/|$)',
            r'/menus?(?:/|$)',
            r'/meal-plans?(?:/|$)',
        ]

        for pattern in exclude_patterns:
            if re.search(pattern, path):
                return False

        # Site-specific requirements (QA-058):
        # AllRecipes has article pages at the root that look like recipes but
        # aren't; real recipes are always under the /recipe/ path.
        if 'allrecipes.com' in host and '/recipe/' not in path:
            return False

        # Check for recipe patterns
        for pattern in recipe_patterns:
            if re.search(pattern, path):
                return True

        # Heuristic: URL path has enough segments and isn't too short
        segments = [s for s in path.split('/') if s]
        if len(segments) >= 2 and len(path) > 20:
            return True

        # Also accept single-segment slug-style URLs (common for food blogs),
        # e.g. /30-cloves-garlic-chicken/
        if len(segments) == 1 and len(path) > 15 and path.count('-') >= 2:
            return True

        return False
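
    # Illustrative outcomes of _looks_like_recipe_url (hypothetical URLs,
    # assuming the host argument matches each site):
    #   https://www.allrecipes.com/recipe/12345/garlic-bread/       -> True  (/recipe/ pattern)
    #   https://www.allrecipes.com/gallery/best-weeknight-dinners/  -> False (excluded path)
    #   https://example-blog.com/30-cloves-garlic-chicken/          -> True  (single-segment slug heuristic)
    #   https://example-blog.com/about                              -> False (excluded path)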

    async def _record_failure(self, source) -> None:
        """Record a search failure for maintenance tracking."""
        from apps.recipes.models import SearchSource

        @sync_to_async
        def update():
            source.consecutive_failures += 1
            if source.consecutive_failures >= 3:
                source.needs_attention = True
            source.save(update_fields=['consecutive_failures', 'needs_attention'])

        await update()

    async def _record_success(self, source) -> None:
        """Record a successful search."""
        from apps.recipes.models import SearchSource

        @sync_to_async
        def update():
            source.consecutive_failures = 0
            source.needs_attention = False
            source.last_validated_at = timezone.now()
            source.save(update_fields=[
                'consecutive_failures',
                'needs_attention',
                'last_validated_at',
            ])

        await update()
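

# Minimal usage sketch (illustrative only, not part of the service). Assumes
# a configured Django project: DJANGO_SETTINGS_MODULE is set, django.setup()
# has loaded the apps, and at least one enabled SearchSource row exists. The
# query string and the _demo helper are hypothetical.
async def _demo() -> None:
    searcher = RecipeSearch()
    page = await searcher.search('lemon chicken', page=1, per_page=10)
    for item in page['results']:
        print(f"{item['host']}: {item['title']} -> {item['url']}")
    print(f"total={page['total']} has_more={page['has_more']}")


if __name__ == '__main__':
    import django

    django.setup()  # requires DJANGO_SETTINGS_MODULE in the environment
    asyncio.run(_demo())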
