To count the number of web pages scraped, we can use a simple middleware with a predefined list of known bots, incrementing a measurement through IMetricsService
every time a request comes from a bot.
using System;
using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;
using Microsoft.AspNetCore.Http;
using Toggly.FeatureManagement;
namespace Web.Helpers
{
public class BotTrackerMiddleware
{
    private readonly RequestDelegate _next;
    private readonly IMetricsService _metricsService;

    // Lower-case substrings that identify known bot/crawler user agents.
    // Static and readonly: the list is never mutated and is shared by all
    // instances of the middleware.
    private static readonly List<string> _crawlers = new List<string>()
    {
        "bot","crawler","spider","80legs","baidu","yahoo! slurp","ia_archiver","mediapartners-google",
        "lwp-trivial","nederland.zoek","ahoy","anthill","appie","arale","araneo","ariadne",
        "atn_worldwide","atomz","bjaaland","ukonline","calif","combine","cosmos","cusco",
        "cyberspyder","digger","grabber","downloadexpress","ecollector","ebiness","esculapio",
        "esther","felix ide","hamahakki","kit-fireball","fouineur","freecrawl","desertrealm",
        "gcreep","golem","griffon","gromit","gulliver","gulper","whowhere","havindex","hotwired",
        "htdig","ingrid","informant","inspectorwww","iron33","teoma","ask jeeves","jeeves",
        "image.kapsi.net","kdd-explorer","label-grabber","larbin","linkidator","linkwalker",
        "lockon","marvin","mattie","mediafox","merzscope","nec-meshexplorer","udmsearch","moget",
        "motor","muncher","muninn","muscatferret","mwdsearch","sharp-info-agent","webmechanic",
        "netscoop","newscan-online","objectssearch","orbsearch","packrat","pageboy","parasite",
        "patric","pegasus","phpdig","piltdownman","pimptrain","plumtreewebaccessor","getterrobo-plus",
        "raven","roadrunner","robbie","robocrawl","robofox","webbandit","scooter","search-au",
        "searchprocess","senrigan","shagseeker","site valet","skymob","slurp","snooper","speedy",
        "curl_image_client","suke","www.sygol.com","tach_bw","templeton","titin","topiclink","udmsearch",
        "urlck","valkyrie libwww-perl","verticrawl","victoria","webscout","voyager","crawlpaper",
        "webcatcher","t-h-u-n-d-e-r-s-t-o-n-e","webmoose","pagesinventory","webquest","webreaper",
        "webwalker","winona","occam","robi","fdse","jobo","rhcs","gazz","dwcp","yeti","fido","wlm",
        "wolp","wwwc","xget","legs","curl","webs","wget","sift","cmc"
    };

    public BotTrackerMiddleware(RequestDelegate next, IMetricsService metricsService)
    {
        _next = next;
        _metricsService = metricsService;
    }

    /// <summary>
    /// Increases the measurement for the BotScrape metric each time the
    /// request's User-Agent header matches a known bot substring, then
    /// passes the request on to the next middleware in the pipeline.
    /// </summary>
    /// <param name="context">The HTTP context of the current request.</param>
    /// <returns>A task that completes when the rest of the pipeline has run.</returns>
    public async Task InvokeAsync(HttpContext context)
    {
        // BUG FIX: FirstOrDefault() returns null when the User-Agent header is
        // absent; the original code called ToLower() on that result *before*
        // the ?? check, throwing NullReferenceException for header-less
        // requests. Coalesce to string.Empty first.
        string ua = context.Request.Headers.UserAgent.FirstOrDefault() ?? string.Empty;

        // OrdinalIgnoreCase avoids culture-sensitive lowering (e.g. the
        // Turkish-I problem) and skips allocating a lowered copy of the UA.
        if (_crawlers.Exists(x => ua.Contains(x, StringComparison.OrdinalIgnoreCase)))
            await _metricsService.MeasureAsync("BotScrape", 1);

        await _next(context);
    }
}
}
Then in Startup.cs
we can include our middleware conditionally, based on a feature flag, before we call app.UseEndpoints
app.UseMiddlewareForFeature<BotTrackerMiddleware>(FeatureFlags.BotTracker);
Next, in Toggly we’ll go to Features under our Application
And we’ll add a definition for our BotTracker feature flag (note: BotScrape is the metric name used in the middleware, not the flag name)
Finally, we’ll define the metric on our Metrics tab