Coverage for apps/recipes/services/search

1"""URL- and title-level classifiers used by the recipe search parser.

3Extracted from `search_parsers.py` to keep that file under the per-file size

4budget. The classifiers decide whether a candidate result represents a recipe

5detail page rather than an article, gallery, or category index.

6"""

import re
from collections.abc import Callable
from typing import Optional
from urllib.parse import urlparse

13# Compiled patterns for looks_like_recipe_url (avoid recompiling per call)

14_RECIPE_PATTERNS = [

15 re.compile(p)

16 for p in [

17 r"/recipe[s]?/",

18 r"/dish/",

19 r"/food/",

20 r"/cooking/",

21 r"/\d+/",

22 r"-recipe/?$",

23 r"/a\d+/",

24 r"/food-cooking/",

25 ]

26]

28_EXCLUDE_PATTERNS = [

29 re.compile(p)

30 for p in [

31 r"/search",

32 r"/tag/",

33 r"/category/",

34 r"/author/",

35 r"/profile/",

36 r"/user/",

37 r"/about",

38 r"/contact",

39 r"/privacy",

40 r"/terms",

41 r"/newsletter",

42 r"/subscribe",

43 # Article/blog paths (QA-053)

44 r"/article/",

45 r"/articles/",

46 r"/blog/",

47 r"/post/",

48 r"/posts/",

49 r"/news/",

50 r"/story/",

51 r"/stories/",

52 r"/feature/",

53 r"/features/",

54 r"/guide/",

55 r"/guides/",

56 r"/review/",

57 r"/reviews/",

58 r"/roundup/",

59 r"/list/",

60 r"/listicle/",

61 # Video paths (QA-053)

62 r"/video/",

63 r"/videos/",

64 r"/watch/",

65 r"/watch\?",

66 r"/embed/",

67 r"/player/",

68 r"/clip/",

69 r"/clips/",

70 r"/episode/",

71 r"/episodes/",

72 r"/series/",

73 r"/show/",

74 r"/shows/",

75 r"/gallery/",

76 r"/galleries/",

77 r"/slideshow/",

78 r"/photo-gallery/",

79 # Index/listing pages (QA-053)

80 r"/seasons?(?:/|$)",

81 r"/cuisines?(?:/|$)",

82 r"/ingredients?(?:/|$)",

83 r"/collections?(?:/|$)",

84 r"/occasions?(?:/|$)",

85 r"/courses?(?:/|$)",

86 r"/diets?(?:/|$)",

87 r"/techniques?(?:/|$)",

88 r"/chefs?(?:/|$)",

89 r"/dishes(?:/|$)",

90 r"/menus?(?:/|$)",

91 r"/meal-plans?(?:/|$)",

92 ]

93]

def _check_exclusion_patterns(path: str) -> bool:
    """Return True if ``path`` matches any exclusion pattern.

    Args:
        path: URL path to test. The patterns are lowercase and are compiled
            without IGNORECASE, so the path should already be lowercased.

    Returns:
        True when at least one entry of ``_EXCLUDE_PATTERNS`` is found
        anywhere in ``path``; False otherwise.
    """
    return any(pattern.search(path) for pattern in _EXCLUDE_PATTERNS)

100

def _check_recipe_patterns(path: str) -> bool:
    """Return True if ``path`` matches any recipe pattern.

    Args:
        path: URL path to test. The patterns are lowercase and are compiled
            without IGNORECASE, so the path should already be lowercased.

    Returns:
        True when at least one entry of ``_RECIPE_PATTERNS`` is found
        anywhere in ``path``; False otherwise.
    """
    return any(pattern.search(path) for pattern in _RECIPE_PATTERNS)

104

105

106def _skinnytaste_signal(path: str) -> Optional[str]:

107 """Skinnytaste publishes recipes at /<slug>/ with no description on the

108 search page, which the generic neutral-URL filter would reject. Treat any

109 top-level hyphenated slug as a strong recipe signal; exclusion patterns

110 above already strip /about/, /category/, etc.

111 """

112 segments = [s for s in path.split("/") if s]

113 if len(segments) == 1 and len(segments[0]) > 15 and segments[0].count("-") >= 2:

114 return "strong_include"

115 return None

116

117

# Site-specific rules: host → callable that takes the URL path and returns a
# signal string, or None when the rule has no opinion about the path.
_SITE_RULES: dict[str, Callable[[str], Optional[str]]] = {
    # allrecipes.com: any path without a "/recipe/" segment is rejected.
    "allrecipes.com": lambda path: "reject" if "/recipe/" not in path else None,
    "skinnytaste.com": _skinnytaste_signal,
}

123

124

def _check_site_rules(host: str, path: str) -> Optional[str]:
    """Apply the site-specific rule registered for ``host``, if any.

    Args:
        host: Host name to look up. It matches a rule when it equals the
            rule's domain or is a subdomain of it. Matching is
            case-insensitive.
        path: URL path handed to the matching rule.

    Returns:
        The signal string produced by the first matching rule, or None when
        no rule matches or the matching rule has no opinion.
    """
    # Rule keys are lowercase and host names are case-insensitive.
    normalized = host.lower()
    for domain, rule in _SITE_RULES.items():
        if normalized == domain or normalized.endswith(f".{domain}"):
            # Only the first matching domain is consulted.
            return rule(path)
    return None

131

132

133def _check_path_heuristics(path: str) -> str:

134 """Apply heuristic fallbacks for paths with no strong signal.

135

136 Returns "neutral" or "reject".

137 """

138 segments = [s for s in path.split("/") if s]

139 if len(segments) >= 2 and len(path) > 20:

140 return "neutral"

141 if len(segments) == 1 and len(path) > 15 and path.count("-") >= 2:

142 return "neutral"

143 return "reject"

144

145

def get_url_signal(url: str, host: str) -> str:
    """Determine URL signal strength for recipe filtering.

    Checks run in this order, and the first one that decides wins: host
    match, exclusion patterns, site-specific rules, recipe patterns, generic
    path heuristics.

    Args:
        url: Candidate result URL.
        host: Host the URL is expected to live on. The URL's host must equal
            it or be a subdomain of it (case-insensitive). An empty ``host``
            disables the host check.

    Returns:
        "strong_exclude" - URL matches exclusion patterns (articles, blogs, etc.)
        "strong_include" - URL matches recipe patterns (/recipe/, /recipes/, etc.)
            or a site-specific rule says so
        "neutral" - URL passes heuristics but has no strong signal
        "reject" - URL fails all checks (wrong host, too short, malformed,
            rejected by a site-specific rule, etc.)
    """
    try:
        parsed = urlparse(url)
    except ValueError:
        # urlparse raises on malformed netlocs (e.g. unbalanced IPv6
        # brackets); a URL that cannot be parsed cannot be a recipe page.
        return "reject"

    expected = host.lower()
    if expected:
        # A substring test would accept look-alike hosts such as
        # "notexample.com" or "example.com.evil.test" for "example.com", so
        # require an exact or subdomain match. ``hostname`` drops userinfo
        # and port; the raw netloc is also tried so a caller that passes
        # "host:port" keeps matching.
        candidates = (parsed.hostname or "", parsed.netloc.lower())
        on_expected_host = any(
            candidate == expected or candidate.endswith(f".{expected}")
            for candidate in candidates
        )
        if not on_expected_host:
            return "reject"

    # The pattern tables are lowercase, so normalize the path once here.
    path = parsed.path.lower()

    if _check_exclusion_patterns(path):
        return "strong_exclude"

    # Site rules may force a verdict; None means "no opinion".
    site_signal = _check_site_rules(host, path)
    if site_signal is not None:
        return site_signal

    if _check_recipe_patterns(path):
        return "strong_include"

    return _check_path_heuristics(path)

173

174

def looks_like_recipe_url(url: str, host: str) -> bool:
    """Check if a URL looks like a recipe detail page.

    Args:
        url: Candidate result URL.
        host: Host the URL is expected to live on.

    Returns:
        True when ``get_url_signal`` yields "strong_include" or "neutral";
        False for "strong_exclude" and "reject".
    """
    return get_url_signal(url, host) in ("strong_include", "neutral")

179

180

181# Strong editorial patterns — always reject even if recipe words present

182# These are clearly article/editorial headlines, not recipe titles

183_STRONG_EDITORIAL_PATTERNS = [

184 re.compile(p, re.IGNORECASE)

185 for p in [

186 r"\bdeserves?\s+a\s+(?:gold|silver|bronze)\s+medal\b",

187 r"\bis\s+a\s+weeknight\s+winner\b",

188 r"\btop\s+trending\s+recipe\s+of\s+\d{4}\b",

189 r"\binsanely\s+awesome\b",

190 r"\bmost\s+beautiful\s+destination\b",

191 r"\bbest\s+time\s+to\s+book\b",

192 ]

193]

194

195# Mild editorial patterns — rejected unless recipe-context words present

196_EDITORIAL_TITLE_PATTERNS = [

197 re.compile(p, re.IGNORECASE)

198 for p in [

199 # Listicles: "Top 10...", "5 Best...", "7 Reasons..."

201 # Travel/destination content

202 r"\btravel\s+guide\b",

203 r"\bbest\s+destinations?\b",

204 r"\bplaces?\s+to\s+visit\b",

205 r"\bwhere\s+to\s+(?:eat|go|stay|travel)\b",

206 r"\bbook\s+(?:your\s+)?(?:thanksgiving|christmas|holiday)\s+travel\b",

207 # Review/editorial

208 r"^review\s*:",

210 # News/trending headers

211 r"^(?:news|breaking|update|trending)\s*:",

212 # Meta/navigation pages

214 ]

215]

216

217# Recipe-context words that override mild editorial title patterns

218_RECIPE_CONTEXT_PATTERN = re.compile(

221 r"ingredient[s]?|from\s+scratch|step.by.step|easy\s+(?:to\s+)?make)\b",

222 re.IGNORECASE,

223)

224

225

def looks_like_recipe_title(title: str, url_signal: str) -> bool:
    """Check if a search result title looks like recipe content.

    Uses tiered resolution with URL signal strength:
    - "strong_include" URLs always pass (recipe URL overrides title concerns).
    - Every other signal is judged by the title:
        - an empty or whitespace-only title is rejected;
        - strong editorial patterns always reject (even with recipe words);
        - mild editorial patterns reject unless recipe-context words are
          present;
        - anything else passes.

    Args:
        title: Title text of the search result. ``None`` is treated like an
            empty title.
        url_signal: Signal string for the result's URL, as produced by
            ``get_url_signal``.

    Returns:
        True if the result should be kept as recipe content, else False.
    """
    if url_signal == "strong_include":
        return True

    # ``or ""`` keeps a missing title from raising on .strip().
    cleaned = (title or "").strip()
    if not cleaned:
        return False

    # Strong editorial patterns always reject.
    if any(pattern.search(cleaned) for pattern in _STRONG_EDITORIAL_PATTERNS):
        return False

    # Mild editorial patterns reject unless recipe-context words rescue them.
    if any(pattern.search(cleaned) for pattern in _EDITORIAL_TITLE_PATTERNS):
        return _RECIPE_CONTEXT_PATTERN.search(cleaned) is not None

    return True

Coverage for apps / recipes / services / search_classifiers.py: 100%

61 statements