using System;
using System.Linq;
using BingScrapeReadLib;
using BingScrapeStore;
using Newtonsoft.Json;

namespace RetroSampleApp
{
    internal class Program
    {
        static void Main(string[] args)
        {
            //open data stores between 2/15/2023 and 2/21/2023 (start and end dates are inclusive)
            var startDate = new DateTime(2023, 2, 15);
            var endDate = new DateTime(2023, 2, 21);
            var reader = new Reader("//tinybee/retroindexprocessed2023", startDate, endDate);
            //count the number of available stores
            var stubs = reader.GetStubs();
            Console.WriteLine($"{stubs.Length} stores are available between {startDate} and {endDate}:");
            if (stubs.Length > 0)
            {
                Console.WriteLine($"first available time stub: {stubs.First()}");
                Console.WriteLine($" last available time stub: {stubs.Last()}");
            }
            //now we open the stores for processing (this just reads the publisher indices across stores, not yet the entire store)
            reader.Open();
            //get the list of publishers and check whether nytimes.com and washingtonpost.com are among them
            var pubs = reader.GetPublishers();
            Console.WriteLine($"publishers: {pubs.Length}");
            var pubSelection = new string[] { "nytimes.com", "washingtonpost.com" };
            foreach (var pub in pubSelection)
            {
                if (pubs.Contains(pub))
                {
                    Console.WriteLine($"{pub}: exists in data");
                }
                else
                {
                    Console.WriteLine($"{pub}: does not exist in data");
                }
            }
//now we "seal" the stores and lock in the selection (you can include non-existing publishers in the selection - they will be ignored
reader.SealStores(pubSelection);
//now we retrieve selected publishers until no more left
//memory requirement for client is about 1GB per publisher (for large ones like NYT) and day
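            //e.g. for the 7-day range above, holding one large publisher in memory
            //can therefore require on the order of 7GB while it is being processed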
Console.WriteLine("Reading filtered publishers ..");
while (true)
{
Console.WriteLine($"-----------------------");
var next = reader.ReadNextPublisher();
if (next == null)
{
//nothing left
break;
}
//now write some summary stats for this publisher
Console.WriteLine($"Publisher: {next.pub}");
Console.WriteLine($"{next.PublisherByStub.Count}/{stubs.Length} days: {JsonConvert.SerializeObject(next.PublisherByStub.Keys.ToArray())}");
                int pages = 0;
                int freshPages = 0;
                foreach (var kvp in next.PublisherByStub)
                {
                    var pubdata = kvp.Value;
                    pages += pubdata.scrapes.Count;
                    foreach (var scrape in pubdata.scrapes)
                    {
                        //scrape has all the data on the article
                        //for example you get the text using the readability extractor as follows:
                        if (scrape.readability != null)
                        {
                            var text = scrape.readability.text.getCleanedText();
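                            //(illustrative sketch, not part of the original sample) the cleaned
                            //text can feed further processing, e.g. a simple word count:
                            var words = text.Split(new[] { ' ', '\n', '\t' }, StringSplitOptions.RemoveEmptyEntries).Length;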
                        }
                        //we count the number of "fresh" pages whose publish date falls inside the range we pulled
                        if (scrape.PublishDate != null)
                        {
                            var dt = scrape.PublishDate.Date;
                            //the range is inclusive on both ends, matching the comment at the top
                            if (dt >= startDate && dt <= endDate)
                            {
                                freshPages++;
                            }
                        }
                    }
                }
Console.WriteLine($"total pages: {pages}");
Console.WriteLine($"pages with publish data between {startDate} and {endDate}: {freshPages}");
            }
        }
    }
}