Coverage for apps / recipes / services / search_parsers.py: 79%

183 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-12 10:49 +0000

1""" 

2HTML/URL parsing helpers for recipe search. 

3 

4Extracted from RecipeSearch to keep search.py focused on orchestration. 

5All functions are module-level (no class needed). 

6""" 

7 

8import logging 

9import re 

10from typing import Optional 

11from urllib.parse import urljoin, urlparse 

12 

13from bs4 import BeautifulSoup 

14 

15from apps.recipes.services.search import SearchResult 

16 

17logger = logging.getLogger(__name__) 

18 

19 

def find_link(element) -> Optional[tuple]:
    """Locate the recipe hyperlink inside an HTML element.

    Looks for a nested ``<a href>`` first; failing that, accepts the
    element itself when it is an anchor carrying an href.

    Returns:
        Tuple of (link_element, url) if found, None otherwise.
    """
    candidate = element.find("a", href=True)
    # NOTE: bs4 Tags with no children are falsy, so truthiness (not an
    # `is None` check) is deliberately used here, as in the original.
    if not candidate:
        candidate = element if element.name == "a" and element.get("href") else None
    if not candidate:
        return None

    href = candidate.get("href", "")
    return (candidate, href) if href else None

37 

38 

def extract_title(element, link) -> str:
    """Extract a result title with multiple fallback strategies.

    Tries, in order: heading / title-class elements inside the card,
    the link's own visible text, then the link's title/aria-label
    attributes.

    Args:
        element: Tag for the result card.
        link: Tag for the recipe anchor inside it.

    Returns:
        Title text, or "" when nothing usable is found.
    """
    # BUG FIX: Tag.find() with a list matches tag *names* only, so the
    # original ".title" / '[class*="title"]' entries could never match.
    # select_one() applies them as real CSS selectors, restoring the
    # intended class-based fallbacks.
    title_el = element.select_one('h2, h3, h4, .title, [class*="title"]')
    if title_el:
        title = title_el.get_text(strip=True)
        if title:
            return title

    # Fall back to the anchor's visible text.
    title = link.get_text(strip=True)
    if title:
        return title

    # Last resort: metadata/accessibility attributes on the anchor.
    return link.get("title", "") or link.get("aria-label", "")

55 

56 

def extract_rating(title: str) -> tuple[str, Optional[int]]:
    """Split a trailing ratings count off a search result title.

    Handles patterns like "Recipe Name1,392Ratings".

    Returns:
        Tuple of (cleaned_title, rating_count); rating_count is None
        when no trailing count is present.
    """
    match = re.search(r"([\d,]+)\s*[Rr]atings?\s*$", title)
    if match is None:
        return title, None

    digits = match.group(1).replace(",", "")
    try:
        count = int(digits)
    except ValueError:
        # Comma-only junk (e.g. ",,,") — leave the title untouched.
        return title, None
    return title[: match.start()].strip(), count

76 

77 

def extract_description(element) -> str:
    """Extract a short description (max 200 chars) from an element.

    Returns "" when no paragraph or description-class node is present.
    """
    # BUG FIX: Tag.find() with a list matches tag *names* only, so the
    # original ".description" / '[class*="description"]' entries never
    # matched. select_one() applies them as real CSS selectors.
    desc_el = element.select_one('p, .description, [class*="description"]')
    if desc_el:
        return desc_el.get_text(strip=True)[:200]
    return ""

84 

85 

def parse_srcset(srcset: str) -> list[tuple[str, int]]:
    """Parse a srcset attribute value into (url, width) pairs.

    srcset format is "url1 100w, url2 200w". URLs may themselves
    contain commas (e.g. resize=93,84), so entries are recognized by
    the width descriptor rather than by splitting on commas.
    """
    return [
        (m.group(1), int(m.group(2)))
        for m in re.finditer(r"(https?://\S+?)\s+(\d+)w", srcset)
    ]

99 

100 

def _collect_srcset_strings(element) -> list[str]:
    """Collect non-WebP srcset strings from <picture><source> and <img>."""
    collected: list[str] = []

    picture = element.find("picture")
    if picture:
        for source in picture.find_all("source"):
            mime = (source.get("type") or "").lower()
            candidate = source.get("srcset", "")
            # Skip WebP sources; callers want broadly decodable formats.
            if candidate and "webp" not in mime:
                collected.append(candidate)

    img = element.find("img")
    if img and img.get("srcset", ""):
        collected.append(img["srcset"])

    return collected

114 

115 

def best_url_from_srcset(element, base_url: str) -> str:
    """Return the largest non-WebP image URL found in srcset attributes.

    Returns "" when no srcset entry can be parsed.
    """
    candidates: list[tuple[str, int]] = []
    for raw in _collect_srcset_strings(element):
        candidates.extend(parse_srcset(raw))
    if not candidates:
        return ""
    # Widest image wins; ties resolve to the first candidate seen.
    widest, _width = max(candidates, key=lambda pair: pair[1])
    return urljoin(base_url, widest)

123 

124 

def extract_image(element, base_url: str) -> str:
    """Extract an image URL with multiple fallback strategies.

    Prefers the largest srcset entry (handles <picture><source srcset>
    patterns used by modern sites), then falls back to the <img> tag's
    src / data-src / data-lazy-src attributes.
    """
    # A srcset candidate is usually larger than the thumbnail in src.
    from_srcset = best_url_from_srcset(element, base_url)
    if from_srcset:
        return from_srcset

    img = element.find("img")
    # Truthiness check kept on purpose: childless bs4 Tags are falsy.
    if not img:
        return ""

    for attr in ("src", "data-src", "data-lazy-src"):
        candidate = img.get(attr)
        if candidate:
            return urljoin(base_url, candidate)
    return ""

145 

146 

def extract_result_from_element(
    element,
    host: str,
    base_url: str,
) -> Optional[SearchResult]:
    """Build a SearchResult from an HTML element, or None if filtered."""
    found = find_link(element)
    if found is None:
        return None
    link, raw_href = found

    # Resolve against the page URL, then classify the link strength.
    url = urljoin(base_url, raw_href)
    signal = get_url_signal(url, host)
    if signal in ("strong_exclude", "reject"):
        return None

    title = extract_title(element, link)
    if not title:
        return None

    # Strip any trailing "N Ratings" suffix from the title.
    title, rating_count = extract_rating(title)
    if not title:
        # Title consisted only of the rating text (QA-053).
        return None

    # Drop editorial/article headlines (012-filter-search-results).
    if not looks_like_recipe_title(title, signal):
        logger.debug("Filtered non-recipe title: %s (%s)", title, url)
        return None

    image_url = extract_image(element, base_url)
    description = extract_description(element)

    # Neutral-signal URLs must look like real recipe cards: search pages
    # almost always give those both an image AND a description, while
    # editorial/article results often lack one or both.
    if signal == "neutral" and not (image_url and description):
        logger.debug("Filtered neutral URL missing image or description: %s (%s)", title, url)
        return None

    return SearchResult(
        url=url,
        title=title[:200],
        host=host,
        image_url=image_url,
        description=description,
        rating_count=rating_count,
    )

200 

201 

def _parse_articles(
    soup: BeautifulSoup,
    host: str,
    base_url: str,
) -> list[SearchResult]:
    """Strategy: extract results from <article> elements (first 30)."""
    results: list[SearchResult] = []
    for el in soup.find_all("article")[:30]:
        parsed = extract_result_from_element(el, host, base_url)
        if parsed:
            results.append(parsed)
    return results

210 

211 

def _parse_cards(
    soup: BeautifulSoup,
    host: str,
    base_url: str,
) -> list[SearchResult]:
    """Strategy: extract results from card-like div elements.

    Selectors are tried most-specific first; the first one that yields
    any results wins.
    """
    selectors = (
        '[class*="recipe-card"]',
        '[class*="card"]',
        '[class*="result"]',
        '[class*="item"]',
    )
    for selector in selectors:
        hits: list[SearchResult] = []
        for el in soup.select(selector)[:30]:
            parsed = extract_result_from_element(el, host, base_url)
            if parsed:
                hits.append(parsed)
        if hits:
            return hits
    return []

224 

225 

def _parse_links(
    soup: BeautifulSoup,
    host: str,
    base_url: str,
) -> list[SearchResult]:
    """Strategy: extract results from links that look like recipe URLs."""
    collected: list[SearchResult] = []
    for anchor in soup.find_all("a", href=True)[:100]:
        url = urljoin(base_url, anchor.get("href", ""))
        signal = get_url_signal(url, host)
        if signal not in ("strong_include", "neutral"):
            continue
        text = anchor.get_text(strip=True)
        # Very short link text is navigation chrome, not a recipe title.
        if len(text) > 5 and looks_like_recipe_title(text, signal):
            collected.append(SearchResult(url=url, title=text[:200], host=host))
    return collected

242 

243 

def fallback_parse(
    soup: BeautifulSoup,
    host: str,
    base_url: str,
) -> list[SearchResult]:
    """Fallback parser for sites without a specific selector.

    Tries article elements, then card-like divs, then bare recipe
    links, returning the first strategy that yields any results.
    """
    for parse in (_parse_articles, _parse_cards, _parse_links):
        found = parse(soup, host, base_url)
        if found:
            return found
    return []

258 

259 

# Compiled patterns for looks_like_recipe_url (avoid recompiling per call).
# A path matching any of these yields "strong_include" in get_url_signal.
_RECIPE_PATTERNS = [
    re.compile(p)
    for p in [
        r"/recipe[s]?/",
        r"/dish/",
        r"/food/",
        r"/cooking/",
        r"/\d+/",  # bare numeric id segment
        r"-recipe/?$",  # slug ending in "-recipe"
        r"/a\d+/",  # "a"-prefixed numeric id segment
        r"/food-cooking/",
    ]
]

274 

# Paths indicating non-recipe content; any match yields "strong_exclude"
# in get_url_signal, overriding every other signal.
_EXCLUDE_PATTERNS = [
    re.compile(p)
    for p in [
        # Site navigation / meta pages
        r"/search",
        r"/tag/",
        r"/category/",
        r"/author/",
        r"/profile/",
        r"/user/",
        r"/about",
        r"/contact",
        r"/privacy",
        r"/terms",
        r"/newsletter",
        r"/subscribe",
        # Article/blog paths (QA-053)
        r"/article/",
        r"/articles/",
        r"/blog/",
        r"/post/",
        r"/posts/",
        r"/news/",
        r"/story/",
        r"/stories/",
        r"/feature/",
        r"/features/",
        r"/guide/",
        r"/guides/",
        r"/review/",
        r"/reviews/",
        r"/roundup/",
        r"/list/",
        r"/listicle/",
        # Video paths (QA-053)
        r"/video/",
        r"/videos/",
        r"/watch/",
        r"/watch\?",
        r"/embed/",
        r"/player/",
        r"/clip/",
        r"/clips/",
        r"/episode/",
        r"/episodes/",
        r"/series/",
        r"/show/",
        r"/shows/",
        r"/gallery/",
        r"/galleries/",
        r"/slideshow/",
        r"/photo-gallery/",
        # Index/listing pages (QA-053); (?:/|$) matches the segment
        # whether or not a trailing slash follows.
        r"/seasons?(?:/|$)",
        r"/cuisines?(?:/|$)",
        r"/ingredients?(?:/|$)",
        r"/collections?(?:/|$)",
        r"/occasions?(?:/|$)",
        r"/courses?(?:/|$)",
        r"/diets?(?:/|$)",
        r"/techniques?(?:/|$)",
        r"/chefs?(?:/|$)",
        r"/dishes(?:/|$)",
        r"/menus?(?:/|$)",
        r"/meal-plans?(?:/|$)",
    ]
]

341 

342 

def _check_exclusion_patterns(path: str) -> bool:
    """Return True if path matches any exclusion pattern."""
    for pattern in _EXCLUDE_PATTERNS:
        if pattern.search(path):
            return True
    return False

346 

347 

def _check_recipe_patterns(path: str) -> bool:
    """Return True if path matches any recipe pattern."""
    for pattern in _RECIPE_PATTERNS:
        if pattern.search(path):
            return True
    return False

351 

352 

# Site-specific rules: host → callable taking the lowercased URL path and
# returning a signal string, or None to fall through to generic checks.
# Hosts are matched exactly or as parent of a subdomain (see
# _check_site_rules).
_SITE_RULES: dict[str, callable] = {
    # allrecipes.com serves many non-recipe pages; only /recipe/ detail
    # paths are allowed through.
    "allrecipes.com": lambda path: "reject" if "/recipe/" not in path else None,
}

357 

358 

def _check_site_rules(host: str, path: str) -> Optional[str]:
    """Apply site-specific rules. Returns a signal string or None.

    A rule applies when host equals the rule's domain or is one of its
    subdomains; only the first matching rule is consulted.
    """
    for domain, rule in _SITE_RULES.items():
        is_match = host == domain or host.endswith("." + domain)
        if is_match:
            return rule(path)
    return None

365 

366 

367def _check_path_heuristics(path: str) -> str: 

368 """Apply heuristic fallbacks for paths with no strong signal. 

369 

370 Returns "neutral" or "reject". 

371 """ 

372 segments = [s for s in path.split("/") if s] 

373 if len(segments) >= 2 and len(path) > 20: 

374 return "neutral" 

375 if len(segments) == 1 and len(path) > 15 and path.count("-") >= 2: 

376 return "neutral" 

377 return "reject" 

378 

379 

def get_url_signal(url: str, host: str) -> str:
    """Determine URL signal strength for recipe filtering.

    Args:
        url: Absolute URL of the candidate link.
        host: Expected site host (e.g. "allrecipes.com"); assumed
            lowercase — TODO confirm against callers.

    Returns:
        "strong_exclude" - URL matches exclusion patterns (articles, blogs, etc.)
        "strong_include" - URL matches recipe patterns (/recipe/, /recipes/, etc.)
        "neutral" - URL passes heuristics but has no strong signal
        "reject" - URL fails all checks (wrong host, too short, etc.)
    """
    parsed = urlparse(url)

    # BUG FIX: the previous substring test (`host not in parsed.netloc`)
    # accepted unrelated domains such as "notallrecipes.com" or
    # "allrecipes.com.evil.example". Require an exact host match or a
    # true subdomain, comparing case-insensitively with any port removed.
    netloc = parsed.netloc.lower().split(":", 1)[0]
    if netloc != host and not netloc.endswith(f".{host}"):
        return "reject"

    path = parsed.path.lower()

    # Exclusions win over everything else.
    if _check_exclusion_patterns(path):
        return "strong_exclude"

    # Site-specific overrides (e.g. allrecipes.com requires /recipe/).
    site_signal = _check_site_rules(host, path)
    if site_signal is not None:
        return site_signal

    if _check_recipe_patterns(path):
        return "strong_include"

    return _check_path_heuristics(path)

407 

408 

def looks_like_recipe_url(url: str, host: str) -> bool:
    """Check if a URL looks like a recipe detail page."""
    return get_url_signal(url, host) in ("strong_include", "neutral")

413 

414 

# Strong editorial patterns — always reject even if recipe words present.
# These are clearly article/editorial headlines, not recipe titles.
_STRONG_EDITORIAL_PATTERNS = [
    re.compile(p, re.IGNORECASE)
    for p in [
        r"\bdeserves?\s+a\s+(?:gold|silver|bronze)\s+medal\b",
        r"\bis\s+a\s+weeknight\s+winner\b",
        r"\btop\s+trending\s+recipe\s+of\s+\d{4}\b",
        r"\binsanely\s+awesome\b",
        r"\bmost\s+beautiful\s+destination\b",
        r"\bbest\s+time\s+to\s+book\b",
    ]
]

# Mild editorial patterns — rejected unless recipe-context words present
# (see _RECIPE_CONTEXT_PATTERN below).
_EDITORIAL_TITLE_PATTERNS = [
    re.compile(p, re.IGNORECASE)
    for p in [
        # Listicles: "Top 10...", "5 Best...", "7 Reasons..."
        r"^(?:the\s+)?(?:top\s+)?\d+\s+(?:best|worst|things|reasons|ways|places|tips|tricks|destinations|restaurants|spots|cities|towns)\b",
        # Travel/destination content
        r"\btravel\s+guide\b",
        r"\bbest\s+destinations?\b",
        r"\bplaces?\s+to\s+visit\b",
        r"\bwhere\s+to\s+(?:eat|go|stay|travel)\b",
        r"\bbook\s+(?:your\s+)?(?:thanksgiving|christmas|holiday)\s+travel\b",
        # Review/editorial
        r"^review\s*:",
        r"\b(?:product|book|restaurant|movie|hotel|app)\s+review\b",
        # News/trending headers
        r"^(?:news|breaking|update|trending)\s*:",
        # Meta/navigation pages
        r"^(?:about\s+us|contact\s+us|privacy\s+policy|terms\s+of|cookie\s+policy|subscribe|newsletter|sign\s+up|log\s+in)\b",
    ]
]

# Recipe-context words that override mild editorial title patterns.
# NOTE: "step.by.step" uses "." (any char) so it matches "step-by-step"
# and "step by step" alike.
_RECIPE_CONTEXT_PATTERN = re.compile(
    r"\b(?:recipe[s]?|cook(?:ing|ed)?|bake[ds]?|baking|roast(?:ed|ing)?|"
    r"grill(?:ed|ing)?|how\s+to\s+(?:make|cook|bake|prepare)|homemade|"
    r"ingredient[s]?|from\s+scratch|step.by.step|easy\s+(?:to\s+)?make)\b",
    re.IGNORECASE,
)

458 

459 

def looks_like_recipe_title(title: str, url_signal: str) -> bool:
    """Check if a search result title looks like recipe content.

    Tiered resolution using the URL signal strength:
      - "strong_include" URLs always pass (URL evidence beats title).
      - Strong editorial patterns always reject, even with recipe words.
      - Mild editorial patterns reject unless recipe-context words appear.
      - Everything else passes.
    """
    if url_signal == "strong_include":
        return True

    text = title.strip()
    if not text:
        return False

    # Strong editorial patterns always reject.
    if any(pattern.search(text) for pattern in _STRONG_EDITORIAL_PATTERNS):
        return False

    # Mild editorial patterns reject unless recipe-context words present.
    if any(pattern.search(text) for pattern in _EDITORIAL_TITLE_PATTERNS):
        return bool(_RECIPE_CONTEXT_PATTERN.search(text))

    return True

← Back to Dashboard