using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using BingScrapeData;
using BingScrapeReadLib;
using Newtonsoft.Json;

namespace RetroExtractor
{
    internal class Program
    {
        static void Main(string[] args)
        {
            // read the config; the path can be overridden via the first command-line argument
            string configFile = "config.json";
            if (args.Length > 0)
            {
                configFile = args[0];
                if (!File.Exists(configFile))
                {
                    Console.WriteLine($"Cannot find config file {configFile}");
                    return;
                }
            }
            config.populate(configFile);
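
            // For reference, a config.json consistent with the fields used below might look
            // roughly like this (field names are taken from this file; the exact date format
            // and nesting are assumptions, since config and GetDate() live in the referenced
            // libraries):
            // {
            //   "BaseFolder":   "<path to the scrape stores>",
            //   "OutputFolder": "<path for the extracted JSON files>",
            //   "ScrapeDate":   { "Start": "...", "End": "..." },
            //   "ArticleDate":  { "ApplyFilter": true, "Start": "...", "End": "..." },
            //   "PubSelection": [ "nytimes.com", "washingtonpost.com" ]
            // }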

            // refuse to run if the output folder already exists, so earlier extractions are never overwritten
            if (Directory.Exists(config.param.OutputFolder))
            {
                Console.WriteLine($"output folder {config.param.OutputFolder} already exists");
                return;
            }
            Directory.CreateDirectory(config.param.OutputFolder);

            // open data stores between start and end (start and end dates are inclusive)
            var startDate = config.param.ScrapeDate.Start.GetDate();
            var endDate = config.param.ScrapeDate.End.GetDate();
            var reader = new Reader(config.param.BaseFolder, startDate, endDate);

            // count the number of available stores
            var stubs = reader.GetStubs();
            Console.WriteLine($"{stubs.Length} stores are available between {startDate} and {endDate}:");
            if (stubs.Length > 0)
            {
                Console.WriteLine($"first available time stub: {stubs.First()}");
                Console.WriteLine($" last available time stub: {stubs.Last()}");
            }

            // open the stores for processing (this only reads the publisher indices across stores, not yet the entire store)
            reader.Open();
            // "seal" the stores and lock in the selection (the selection may include non-existing publishers - they are ignored)
            var pubSelection = config.param.PubSelection;
            reader.SealStores(pubSelection);

            // get the list of publishers and report which of the selected ones (e.g. nytimes.com, washingtonpost.com) are present in the data
            var pubs = reader.GetPublishers();
            Console.WriteLine($"publishers: {pubs.Length}");
            foreach (var pub in pubSelection)
            {
                Console.WriteLine(pubs.Contains(pub)
                    ? $"{pub}: exists in data"
                    : $"{pub}: does not exist in data");
            }

            // retrieve the selected publishers until none are left;
            // memory requirement for the client is about 1 GB per publisher-day (for large publishers like NYT)
            Console.WriteLine("Reading filtered publishers ..");
            var publishers = new List<string>();
            while (true)
            {
                Console.WriteLine("-----------------------");
                var next = reader.ReadNextPublisher();
                if (next == null)
                {
                    // nothing left
                    break;
                }
                // write some summary stats for this publisher
                Console.WriteLine($"Publisher: {next.pub}");
                int pubID = publishers.Count;
                publishers.Add(next.pub);
                Console.WriteLine($"{next.PublisherByStub.Count}/{stubs.Length} days: {JsonConvert.SerializeObject(next.PublisherByStub.Keys.ToArray())}");

                int pages = 0;
                var freshScrapes = new List<ProcessedBingData>();
                DateTime? start = null;
                DateTime? end = null;
                if (config.param.ArticleDate.ApplyFilter)
                {
                    start = config.param.ArticleDate.Start.GetDate();
                    end = config.param.ArticleDate.End.GetDate();
                }
                foreach (var kvp in next.PublisherByStub)
                {
                    var pubdata = kvp.Value;
                    pages += pubdata.scrapes.Count;
                    foreach (var scrape in pubdata.scrapes)
                    {
                        // scrape carries all the data on the article; for example, the text produced
                        // by the readability extractor is obtained as follows
                        // (text is unused here - the line only demonstrates the accessor):
                        if (scrape.readability != null)
                        {
                            var text = scrape.readability.text.getCleanedText();
                        }
                        // count the number of "fresh" pages whose publish date lies in the range that we pulled
                        bool isValidPage = false;
                        if (!config.param.ArticleDate.ApplyFilter)
                        {
                            isValidPage = true;
                        }
                        else if (scrape.PublishDate != null)
                        {
                            var dt = scrape.PublishDate.Date;
                            // treat the article date range as inclusive, matching the scrape date range above
                            if (dt >= start.Value && dt <= end.Value)
                            {
                                isValidPage = true;
                            }
                        }
                        if (isValidPage)
                        {
                            freshScrapes.Add(scrape);
                        }
                    }
                }
                if (config.param.ArticleDate.ApplyFilter)
                {
                    Console.WriteLine($"filtered pages between {start} and {end}: {freshScrapes.Count}/{pages}");
                }
                else
                {
                    Console.WriteLine($"filtered pages (all): {freshScrapes.Count}/{pages}");
                }
                serializeLargeJSON2File(freshScrapes, Path.Combine(config.param.OutputFolder, $"{pubID}.json"));
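                // note: each publisher is flushed to disk before the next one is read, so peak
                // memory stays at roughly one publisher's worth of data (see the 1 GB per
                // publisher-day note above)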
            }

            // write the publisher list so the numbered JSON files can be mapped back to publisher names
            using (var sw = new StreamWriter(Path.Combine(config.param.OutputFolder, "publishers.txt")))
            {
                foreach (var publisher in publishers)
                {
                    sw.WriteLine(publisher);
                }
            }
        }

        // stream-serializes a (potentially large) object graph directly to a JSON file,
        // avoiding materializing the entire JSON string in memory first
        static void serializeLargeJSON2File(object value, string fileName)
        {
            using (StreamWriter writer = new StreamWriter(fileName))
            using (JsonTextWriter jsonWriter = new JsonTextWriter(writer))
            {
                JsonSerializer ser = new JsonSerializer();
                ser.Serialize(jsonWriter, value);
                jsonWriter.Flush();
            }
        }
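
        // Reading one of the per-publisher files back could look like this
        // (a sketch, not part of this tool; it assumes the default Newtonsoft.Json
        // settings used above):
        //
        //   using var reader = new StreamReader(fileName);
        //   using var jsonReader = new JsonTextReader(reader);
        //   var scrapes = new JsonSerializer().Deserialize<List<ProcessedBingData>>(jsonReader);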
    }
}