using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using BingScrapeData;
using BingScrapeReadLib;
using Newtonsoft.Json;

namespace RetroExtractor
{
    internal class Program
    {
        static void Main(string[] args)
        {
            //read config
            string config_file = "config.json";
            if (args.Length > 0)
            {
                config_file = args[0];
                if (!File.Exists(config_file))
                {
                    Console.WriteLine("Cannot find config file {0}", config_file);
                    return;
                }
            }
            config.populate(config_file);

            if (Directory.Exists(config.param.OutputFolder))
            {
                Console.WriteLine($"output folder {config.param.OutputFolder} already exists");
                return;
            }
            Directory.CreateDirectory(config.param.OutputFolder);

            //open data stores between start and end (start and end dates are inclusive)
            var startDate = config.param.ScrapeDate.Start.GetDate();
            var endDate = config.param.ScrapeDate.End.GetDate();
            var reader = new Reader(config.param.BaseFolder, startDate, endDate);

            //count the number of available stores
            var stubs = reader.GetStubs();
            Console.WriteLine($"{stubs.Length} stores are available between {startDate} and {endDate}:");
            if (stubs.Length > 0)
            {
                Console.WriteLine($"first available time stub: {stubs.First()}");
                Console.WriteLine($" last available time stub: {stubs.Last()}");
            }

            //now we open the stores for processing (this just reads the publisher indices across stores, not yet the entire store)
            reader.Open();

            //now we "seal" the stores and lock in the selection (non-existing publishers in the selection are simply ignored)
            var pubSelection = config.param.PubSelection;
            reader.SealStores(pubSelection);

            //get the list of publishers and check whether the selected ones (e.g. nytimes.com and washingtonpost.com) are among them
            var pubs = reader.GetPublishers();
            Console.WriteLine($"publishers: {pubs.Length}");
            foreach (var pub in pubSelection)
            {
                if (pubs.Contains(pub))
                {
                    Console.WriteLine($"{pub}: exists in data");
                }
                else
                {
                    Console.WriteLine($"{pub}: does not exist in data");
                }
            }

            //now we retrieve the selected publishers until none are left
            //memory requirement on the client is roughly 1 GB per publisher and day (for large publishers like NYT)
            Console.WriteLine("Reading filtered publishers ..");
            var publishers = new List<string>();
            while (true)
            {
                Console.WriteLine("-----------------------");
                var next = reader.ReadNextPublisher();
                if (next == null)
                {
                    //nothing left
                    break;
                }

                //now write some summary stats for this publisher
                Console.WriteLine($"Publisher: {next.pub}");
                int pubID = publishers.Count;
                publishers.Add(next.pub);
                Console.WriteLine($"{next.PublisherByStub.Count}/{stubs.Length} days: {JsonConvert.SerializeObject(next.PublisherByStub.Keys.ToArray())}");

                int pages = 0;
                var freshScrapes = new List<object>(); //scrape records from the read library
                DateTime? start = null;
                DateTime? end = null;
                if (config.param.ArticleDate.ApplyFilter)
                {
                    start = config.param.ArticleDate.Start.GetDate();
                    end = config.param.ArticleDate.End.GetDate();
                }
                foreach (var kvp in next.PublisherByStub)
                {
                    var pubdata = kvp.Value;
                    pages += pubdata.scrapes.Count;
                    foreach (var scrape in pubdata.scrapes)
                    {
                        //scrape has all the data on the article
                        //for example, you get the text from the readability extractor as follows:
                        if (scrape.readability != null)
                        {
                            var text = scrape.readability.text.getCleanedText();
                        }

                        //we count the number of "fresh" pages whose publish date falls in the range that we pulled
                        bool isValidPage = false;
                        if (!config.param.ArticleDate.ApplyFilter)
                        {
                            isValidPage = true;
                        }
                        else
                        {
                            if (scrape.PublishDate != null)
                            {
                                var dt = scrape.PublishDate.Date;
                                if (dt > start.Value && dt < end.Value)
                                {
                                    isValidPage = true;
                                }
                            }
                        }
                        if (isValidPage)
                        {
                            freshScrapes.Add(scrape);
                        }
                    }
                }
                if (config.param.ArticleDate.ApplyFilter)
                {
                    Console.WriteLine($"filtered pages between {start} and {end}: {freshScrapes.Count}/{pages}");
                }
                else
                {
                    Console.WriteLine($"filtered pages (all): {freshScrapes.Count}/{pages}");
                }
                serializeLargeJSON2File(freshScrapes, $"{config.param.OutputFolder}/{pubID}.json");
            }

            //write the pubID -> publisher mapping so the numbered JSON files can be matched back to their publisher
            var sw = new StreamWriter($"{config.param.OutputFolder}/publishers.txt");
            for (int i = 0; i < publishers.Count; i++)
            {
                sw.WriteLine($"{i}\t{publishers[i]}");
            }
            sw.Close();
        }
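
        //minimal sketch of the JSON writer called above (the original helper body is not shown here,
        //so the details are assumptions): it streams the list to disk through Newtonsoft's JsonTextWriter
        //rather than building one large string in memory, which matters given the memory footprint noted in Main
        static void serializeLargeJSON2File<T>(List<T> items, string path)
        {
            using (var file = new StreamWriter(path))
            using (var writer = new JsonTextWriter(file))
            {
                var serializer = JsonSerializer.Create();
                serializer.Serialize(writer, items);
            }
        }
    }
}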