using System;
using System.Linq;
using BingScrapeReadLib;
using BingScrapeStore;
using Newtonsoft.Json;

namespace RetroSampleApp
{
    internal class Program
    {
        static void Main(string[] args)
        {
            //open data stores between 2/15/2023 and 2/21/2023 (start and end dates are inclusive)
            var startDate = new DateTime(2023, 2, 15);
            var endDate = new DateTime(2023, 2, 21);
            var reader = new Reader("//tinybee/retroindexprocessed2023", startDate, endDate);
            //count the number of available stores
            var stubs = reader.GetStubs();
            Console.WriteLine($"{stubs.Length} stores are available between {startDate} and {endDate}:");
            if (stubs.Length > 0)
            {
                Console.WriteLine($"first available time stub: {stubs.First()}");
                Console.WriteLine($" last available time stub: {stubs.Last()}");
            }
            //now we open the stores for processing (this just reads the publisher indices across stores, not yet the entire store)
            reader.Open();
            //get the list of publishers and check whether nytimes.com and washingtonpost.com are among them
            var pubs = reader.GetPublishers();
            Console.WriteLine($"publishers: {pubs.Length}");
            var pubSelection = new string[] { "nytimes.com", "washingtonpost.com" };
            foreach (var pub in pubSelection)
            {
                if (pubs.Contains(pub))
                {
                    Console.WriteLine($"{pub}: exists in data");
                }
                else
                {
                    Console.WriteLine($"{pub}: does not exist in data");
                }
            }
//now we "seal" the stores and lock in the selection (you can include non-existing publishers in the selection - they will be ignored
reader.SealStores(pubSelection);
//now we retrieve selected publishers until no more left
//memory requirement for client is about 1GB per publisher (for large ones like NYT) and day
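            //e.g. for the 7-day range above, holding one large publisher in memory
            //can therefore require on the order of 7GB while it is being processed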
Console.WriteLine("Reading filtered publishers ..");
while (true)
{
Console.WriteLine($"-----------------------");
var next = reader.ReadNextPublisher();
if (next == null)
{
//nothing left
break;
}
//now write some summary stats for this publisher
Console.WriteLine($"Publisher: {next.pub}");
Console.WriteLine($"{next.PublisherByStub.Count}/{stubs.Length} days: {JsonConvert.SerializeObject(next.PublisherByStub.Keys.ToArray())}");
                int pages = 0;
                int freshPages = 0;
                foreach (var kvp in next.PublisherByStub)
                {
                    var pubdata = kvp.Value;
                    pages += pubdata.scrapes.Count;
                    foreach (var scrape in pubdata.scrapes)
                    {
                        //scrape has all the data on the article
                        //for example you get the text using the readability extractor as follows:
                        if (scrape.readability != null)
                        {
                            var text = scrape.readability.text.getCleanedText();
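                            //(illustrative sketch, not part of the original sample) the cleaned
                            //text can feed further processing, e.g. a simple word count:
                            var words = text.Split(new[] { ' ', '\n', '\t' }, StringSplitOptions.RemoveEmptyEntries).Length;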
                        }
                        //we count the number of "fresh" pages whose publish date falls inside the range we pulled
                        if (scrape.PublishDate != null)
                        {
                            var dt = scrape.PublishDate.Date;
                            //the range is inclusive on both ends, matching the comment at the top
                            if (dt >= startDate && dt <= endDate)
                            {
                                freshPages++;
                            }
                        }
                    }
                }
Console.WriteLine($"total pages: {pages}");
Console.WriteLine($"pages with publish data between {startDate} and {endDate}: {freshPages}");
            }
        }
    }
}