Coverage for apps/recipes/services/search

1"""

2HTML/URL parsing helpers for recipe search.

4Extracted from RecipeSearch to keep search.py focused on orchestration.

5All functions are module-level (no class needed).

6"""

8import logging

9import re

10from typing import Optional

11from urllib.parse import urljoin

13from bs4 import BeautifulSoup

15from apps.recipes.services.search import SearchResult

16from apps.recipes.services.search_classifiers import (

17 get_url_signal,

18 looks_like_recipe_title,

19)

21logger = logging.getLogger(__name__)

24def find_link(element) -> Optional[tuple]:

25 """Find recipe link in an HTML element.

27 Returns:

28 Tuple of (link_element, url) if found, None otherwise.

29 """

30 link = element.find("a", href=True)

31 if not link:

32 link = element if element.name == "a" and element.get("href") else None

33 if not link:

34 return None

36 url = link.get("href", "")

37 if not url:

38 return None

40 return link, url

43def extract_title(element, link) -> str:

44 """Extract title from element with multiple fallback strategies.

46 Uses separator=" " so nested metadata spans (e.g. prep-time) don't

47 bleed into the title without whitespace.

48 """

49 title_el = element.find(["h2", "h3", "h4", ".title", '[class*="title"]'])

50 if title_el:

51 title = title_el.get_text(separator=" ", strip=True)

52 if title:

53 return _strip_title_metadata(title)

55 title = link.get_text(separator=" ", strip=True)

56 if title:

57 return _strip_title_metadata(title)

59 return link.get("title", "") or link.get("aria-label", "")

62# Strips trailing time metadata, e.g. "30 mins", "1 hr 30 mins".

63_TRAILING_TIME_RE = re.compile(

64 r"\s+\d+\s*(?:hr?s?|hour[s]?|min(?:ute)?[s]?)(?:\s+\d+\s*(?:min(?:ute)?[s]?))?$",

65 re.IGNORECASE,

66)

69def _strip_title_metadata(title: str) -> str:

70 """Strip trailing time/duration metadata from a recipe card title."""

71 return _TRAILING_TIME_RE.sub("", title).strip()

74def extract_rating(title: str) -> tuple[str, Optional[int]]:

75 """Extract and strip rating count from title.

77 Handles patterns like "Recipe Name1,392Ratings".

79 Returns:

80 Tuple of (cleaned_title, rating_count).

81 """

82 rating_match = re.search(r"([\d,]+)\s*[Rr]atings?\s*$", title)

83 if not rating_match:

84 return title, None

86 rating_str = rating_match.group(1).replace(",", "")

87 try:

88 rating_count = int(rating_str)

89 cleaned_title = title[: rating_match.start()].strip()

90 return cleaned_title, rating_count

91 except ValueError:

92 return title, None

95def extract_description(element) -> str:

96 """Extract description from element."""

97 desc_el = element.find(["p", ".description", '[class*="description"]'])

98 if desc_el:

99 return desc_el.get_text(strip=True)[:200]

100 return ""

101

102

103def parse_srcset(srcset: str) -> list[tuple[str, int]]:

104 """Parse srcset string into (url, width) pairs.

105

106 Handles srcset format: "url1 100w, url2 200w"

107 URLs may contain commas (e.g. resize=93,84) so we split on

108 the width descriptor pattern rather than plain commas.

109 """

110 results = []

111 for match in re.finditer(r"(https?://\S+?)\s+(\d+)w", srcset):

112 url = match.group(1)

113 width = int(match.group(2))

114 results.append((url, width))

115 return results

116

117

118def _collect_srcset_strings(element) -> list[str]:

119 """Collect non-WebP srcset strings from <picture><source> and <img>."""

120 srcsets = []

121 picture = element.find("picture")

122 if picture:

123 for src in picture.find_all("source"):

124 srcset = src.get("srcset", "")

125 if srcset and "webp" not in (src.get("type") or "").lower():

126 srcsets.append(srcset)

127 img = element.find("img")

128 if img and img.get("srcset", ""):

129 srcsets.append(img["srcset"])

130 return srcsets

131

132

133def best_url_from_srcset(element, base_url: str) -> str:

134 """Extract the largest non-WebP image URL from srcset attributes."""

135 entries = [e for s in _collect_srcset_strings(element) for e in parse_srcset(s)]

136 if not entries:

137 return ""

138 best_url, _ = max(entries, key=lambda e: e[1])

139 return urljoin(base_url, best_url)

140

141

142def extract_image(element, base_url: str) -> str:

143 """Extract image URL with multiple fallback strategies.

144

145 Tries srcset (for largest image), then src, data-src, data-lazy-src.

146 Handles <picture><source srcset> patterns used by modern sites.

147 """

148 # Try to get the best image from srcset first (larger than thumbnail)

149 srcset_url = best_url_from_srcset(element, base_url)

150 if srcset_url:

151 return srcset_url

152

153 # Fallback to img src attributes

154 img = element.find("img")

155 if not img:

156 return ""

157

158 image_url = img.get("src") or img.get("data-src") or img.get("data-lazy-src", "")

159 if image_url:

160 return urljoin(base_url, image_url)

161 return ""

162

163

164def extract_result_from_element(

165 element,

166 host: str,

167 base_url: str,

168) -> Optional[SearchResult]:

169 """Extract search result data from an HTML element."""

170 # Find and validate link

171 link_result = find_link(element)

172 if not link_result:

173 return None

174 link, url = link_result

175

176 # Make URL absolute and get signal strength

177 url = urljoin(base_url, url)

178 url_signal = get_url_signal(url, host)

179 if url_signal in ("strong_exclude", "reject"):

180 return None

181

182 # Extract title

183 title = extract_title(element, link)

184 if not title:

185 return None

186

187 # Extract and strip rating from title

188 title, rating_count = extract_rating(title)

189

190 # Title may have become empty after stripping rating (QA-053)

191 if not title:

192 return None

193

194 # Filter non-recipe content by title (012-filter-search-results)

195 if not looks_like_recipe_title(title, url_signal):

196 logger.debug("Filtered non-recipe title: %s (%s)", title, url)

197 return None

198

199 image_url = extract_image(element, base_url)

200 description = extract_description(element)

201

202 # Neutral URLs must have both image AND description — recipe cards almost

203 # always do; editorial/article results often lack one or both.

204 if url_signal == "neutral" and (not image_url or not description):

205 logger.debug("Filtered neutral URL missing image or description: %s (%s)", title, url)

206 return None

207

208 return SearchResult(

209 url=url,

210 title=title[:200],

211 host=host,

212 image_url=image_url,

213 description=description,

214 rating_count=rating_count,

215 )

216

217

218def _parse_articles(

219 soup: BeautifulSoup,

220 host: str,

221 base_url: str,

222) -> list[SearchResult]:

223 """Strategy: extract results from <article> elements."""

224 _ext = extract_result_from_element

225 return [r for el in soup.find_all("article")[:30] if (r := _ext(el, host, base_url))]

226

227

228def _parse_cards(

229 soup: BeautifulSoup,

230 host: str,

231 base_url: str,

232) -> list[SearchResult]:

233 """Strategy: extract results from card-like div elements."""

234 _ext = extract_result_from_element

235 for sel in ('[class*="recipe-card"]', '[class*="card"]', '[class*="result"]', '[class*="item"]'):

236 results = [r for el in soup.select(sel)[:30] if (r := _ext(el, host, base_url))]

237 if results:

238 return results

239 return []

240

241

242def _parse_links(

243 soup: BeautifulSoup,

244 host: str,

245 base_url: str,

246) -> list[SearchResult]:

247 """Strategy: extract results from links that look like recipe URLs."""

248 results = []

249 for link in soup.find_all("a", href=True)[:100]:

250 url = urljoin(base_url, link.get("href", ""))

251 url_signal = get_url_signal(url, host)

252 if url_signal not in ("strong_include", "neutral"):

253 continue

254 title = link.get_text(strip=True)

255 if title and len(title) > 5 and looks_like_recipe_title(title, url_signal):

256 results.append(SearchResult(url=url, title=title[:200], host=host))

257 return results

258

259

260def fallback_parse(

261 soup: BeautifulSoup,

262 host: str,

263 base_url: str,

264) -> list[SearchResult]:

265 """Fallback parser for sites without a specific selector.

266

267 Tries article elements, card-like divs, then bare recipe links.

268 """

269 for strategy in (_parse_articles, _parse_cards, _parse_links):

270 results = strategy(soup, host, base_url)

271 if results:

272 return results

273 return []

Coverage for apps / recipes / services / search_parsers.py: 72%

134 statements