using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using BingScrapeData;
using BingScrapeReadLib;
using Newtonsoft.Json;

namespace RetroExtractor
{
    internal class Program
    {
        static void Main(string[] args)
        {
            // read the config; the path can be overridden via the first command-line argument
            string configFile = "config.json";
            if (args.Length > 0)
            {
                configFile = args[0];
                if (!File.Exists(configFile))
                {
                    Console.WriteLine($"Cannot find config file {configFile}");
                    return;
                }
            }
            config.populate(configFile);
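
            // For reference, a config.json consistent with the fields used below might look
            // roughly like this (field names are taken from this file; the exact date format
            // and nesting are assumptions, since config and GetDate() live in the referenced
            // libraries):
            // {
            //   "BaseFolder":   "<path to the scrape stores>",
            //   "OutputFolder": "<path for the extracted JSON files>",
            //   "ScrapeDate":   { "Start": "...", "End": "..." },
            //   "ArticleDate":  { "ApplyFilter": true, "Start": "...", "End": "..." },
            //   "PubSelection": [ "nytimes.com", "washingtonpost.com" ]
            // }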

            // refuse to run if the output folder already exists, so earlier extractions are never overwritten
            if (Directory.Exists(config.param.OutputFolder))
            {
                Console.WriteLine($"output folder {config.param.OutputFolder} already exists");
                return;
            }
            Directory.CreateDirectory(config.param.OutputFolder);

            // open data stores between start and end (start and end dates are inclusive)
            var startDate = config.param.ScrapeDate.Start.GetDate();
            var endDate = config.param.ScrapeDate.End.GetDate();
            var reader = new Reader(config.param.BaseFolder, startDate, endDate);

            // count the number of available stores
            var stubs = reader.GetStubs();
            Console.WriteLine($"{stubs.Length} stores are available between {startDate} and {endDate}:");
            if (stubs.Length > 0)
            {
                Console.WriteLine($"first available time stub: {stubs.First()}");
                Console.WriteLine($" last available time stub: {stubs.Last()}");
            }

            // open the stores for processing (this only reads the publisher indices across stores, not yet the entire store)
            reader.Open();
            // "seal" the stores and lock in the selection (the selection may include non-existing publishers - they are ignored)
            var pubSelection = config.param.PubSelection;
            reader.SealStores(pubSelection);

            // get the list of publishers and report which of the selected ones (e.g. nytimes.com, washingtonpost.com) are present in the data
            var pubs = reader.GetPublishers();
            Console.WriteLine($"publishers: {pubs.Length}");
            foreach (var pub in pubSelection)
            {
                Console.WriteLine(pubs.Contains(pub)
                    ? $"{pub}: exists in data"
                    : $"{pub}: does not exist in data");
            }

            // retrieve the selected publishers until none are left;
            // memory requirement for the client is about 1 GB per publisher-day (for large publishers like NYT)
            Console.WriteLine("Reading filtered publishers ..");
            var publishers = new List<string>();
            while (true)
            {
                Console.WriteLine("-----------------------");
                var next = reader.ReadNextPublisher();
                if (next == null)
                {
                    // nothing left
                    break;
                }
                // write some summary stats for this publisher
                Console.WriteLine($"Publisher: {next.pub}");
                int pubID = publishers.Count;
                publishers.Add(next.pub);
                Console.WriteLine($"{next.PublisherByStub.Count}/{stubs.Length} days: {JsonConvert.SerializeObject(next.PublisherByStub.Keys.ToArray())}");

                int pages = 0;
                var freshScrapes = new List<ProcessedBingData>();
                DateTime? start = null;
                DateTime? end = null;
                if (config.param.ArticleDate.ApplyFilter)
                {
                    start = config.param.ArticleDate.Start.GetDate();
                    end = config.param.ArticleDate.End.GetDate();
                }
                foreach (var kvp in next.PublisherByStub)
                {
                    var pubdata = kvp.Value;
                    pages += pubdata.scrapes.Count;
                    foreach (var scrape in pubdata.scrapes)
                    {
                        // scrape carries all the data on the article; for example, the text produced
                        // by the readability extractor is obtained as follows
                        // (text is unused here - the line only demonstrates the accessor):
                        if (scrape.readability != null)
                        {
                            var text = scrape.readability.text.getCleanedText();
                        }
                        // count the number of "fresh" pages whose publish date lies in the range that we pulled
                        bool isValidPage = false;
                        if (!config.param.ArticleDate.ApplyFilter)
                        {
                            isValidPage = true;
                        }
                        else if (scrape.PublishDate != null)
                        {
                            var dt = scrape.PublishDate.Date;
                            // treat the article date range as inclusive, matching the scrape date range above
                            if (dt >= start.Value && dt <= end.Value)
                            {
                                isValidPage = true;
                            }
                        }
                        if (isValidPage)
                        {
                            freshScrapes.Add(scrape);
                        }
                    }
                }
                if (config.param.ArticleDate.ApplyFilter)
                {
                    Console.WriteLine($"filtered pages between {start} and {end}: {freshScrapes.Count}/{pages}");
                }
                else
                {
                    Console.WriteLine($"filtered pages (all): {freshScrapes.Count}/{pages}");
                }
                serializeLargeJSON2File(freshScrapes, Path.Combine(config.param.OutputFolder, $"{pubID}.json"));
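                // note: each publisher is flushed to disk before the next one is read, so peak
                // memory stays at roughly one publisher's worth of data (see the 1 GB per
                // publisher-day note above)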
            }

            // write the publisher list so the numbered JSON files can be mapped back to publisher names
            using (var sw = new StreamWriter(Path.Combine(config.param.OutputFolder, "publishers.txt")))
            {
                foreach (var publisher in publishers)
                {
                    sw.WriteLine(publisher);
                }
            }
        }

        // stream-serializes a (potentially large) object graph directly to a JSON file,
        // avoiding materializing the entire JSON string in memory first
        static void serializeLargeJSON2File(object value, string fileName)
        {
            using (StreamWriter writer = new StreamWriter(fileName))
            using (JsonTextWriter jsonWriter = new JsonTextWriter(writer))
            {
                JsonSerializer ser = new JsonSerializer();
                ser.Serialize(jsonWriter, value);
                jsonWriter.Flush();
            }
        }
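
        // Reading one of the per-publisher files back could look like this
        // (a sketch, not part of this tool; it assumes the default Newtonsoft.Json
        // settings used above):
        //
        //   using var reader = new StreamReader(fileName);
        //   using var jsonReader = new JsonTextReader(reader);
        //   var scrapes = new JsonSerializer().Deserialize<List<ProcessedBingData>>(jsonReader);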
    }
}