added scraper
parent a9e1752428
commit be776884a3
README.txt (40 lines changed)
@@ -10,41 +10,41 @@ git clone https://www.gitea.econlabs.org/markusmobiuspublic/RetroIndex
 Then navigate to the RetroIndex folder:
 cd RetroSampleApp
 dotnet publish
-cd bin\Debug\net7.0\publish
+cd bin\Release\net9.0\publish
 dotnet RetroSampleApp.dll


 Expected Output
 -------------

-7 stores are available between 2/15/2023 12:00:00 AM and 2/21/2023 12:00:00 AM:
+7 stores are available between 6/15/2025 12:00:00 AM and 6/22/2025 12:00:00 AM:
-first available time stub: 2023_02_15_00_00_00
+first available time stub: 2025_06_16_00_00_00
-last available time stub: 2023_02_21_00_00_00
+last available time stub: 2025_06_22_00_00_00
-Opening datastore 2023_02_15_00_00_00 from disk ...
+Opening datastore 2025_06_16_00_00_00 from disk ...
-Opening datastore 2023_02_16_00_00_00 from disk ...
+Opening datastore 2025_06_17_00_00_00 from disk ...
-Opening datastore 2023_02_17_00_00_00 from disk ...
+Opening datastore 2025_06_18_00_00_00 from disk ...
-Opening datastore 2023_02_18_00_00_00 from disk ...
+Opening datastore 2025_06_19_00_00_00 from disk ...
-Opening datastore 2023_02_19_00_00_00 from disk ...
+Opening datastore 2025_06_20_00_00_00 from disk ...
-Opening datastore 2023_02_20_00_00_00 from disk ...
+Opening datastore 2025_06_21_00_00_00 from disk ...
-Opening datastore 2023_02_21_00_00_00 from disk ...
+Opening datastore 2025_06_22_00_00_00 from disk ...
-publishers: 9020
+publishers: 8594
 nytimes.com: exists in data
 washingtonpost.com: exists in data
 Reading filtered publishers ..
 -----------------------
 Publisher: nytimes.com
-7/7 days: ["2023_02_15_00_00_00","2023_02_16_00_00_00","2023_02_17_00_00_00","2023_02_18_00_00_00","2023_02_19_00_00_00","2023_02_20_00_00_00","2023_02_21_00_00_00"]
+7/7 days: ["2025_06_16_00_00_00","2025_06_17_00_00_00","2025_06_18_00_00_00","2025_06_19_00_00_00","2025_06_20_00_00_00","2025_06_21_00_00_00","2025_06_22_00_00_00"]
-total pages: 105223
+total pages: 36046
-pages with publish data between 2/15/2023 12:00:00 AM and 2/21/2023 12:00:00 AM: 17030
+pages with publish data between 6/15/2025 12:00:00 AM and 6/22/2025 12:00:00 AM: 604
 -----------------------
 Publisher: washingtonpost.com
-7/7 days: ["2023_02_15_00_00_00","2023_02_16_00_00_00","2023_02_17_00_00_00","2023_02_18_00_00_00","2023_02_19_00_00_00","2023_02_20_00_00_00","2023_02_21_00_00_00"]
+7/7 days: ["2025_06_16_00_00_00","2025_06_17_00_00_00","2025_06_18_00_00_00","2025_06_19_00_00_00","2025_06_20_00_00_00","2025_06_21_00_00_00","2025_06_22_00_00_00"]
-total pages: 28779
+total pages: 19560
-pages with publish data between 2/15/2023 12:00:00 AM and 2/21/2023 12:00:00 AM: 4143
+pages with publish data between 6/15/2025 12:00:00 AM and 6/22/2025 12:00:00 AM: 828
 -----------------------


-In order to incorporate in your own project make sure you reference the BingScrapeReadLib (1.5.0+) nuget package:
+To incorporate this into your own project, reference the BingScrapeReadLib (2.0.0+) NuGet package:
-https://www.gitea.econlabs.org/markusmobiuspublic/-/packages/nuget/bingscrapereadlib/1.5.0
+https://www.gitea.econlabs.org/markusmobiuspublic/-/packages/nuget/bingscrapereadlib/2.0.0

 The RetroSampleApp demonstrates how.
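For quick reference, here is a minimal consumption sketch, assuming the Reader API exactly as it is used in RetroExtractor/Program.cs further down in this commit; the share path, date range, and publisher selection are placeholders:

using BingScrapeReadLib;

// Minimal sketch; the share path, dates, and publisher list below are placeholders.
var reader = new Reader("//nerds21/retro2025", new DateTime(2025, 6, 15), new DateTime(2025, 6, 22));
Console.WriteLine($"{reader.GetStubs().Length} stores are available");
reader.Open();                               // reads the publisher indices across stores
reader.SealStores(new[] { "nytimes.com" });  // lock in the publisher selection
while (true)
{
    var next = reader.ReadNextPublisher();   // one publisher at a time; null when nothing is left
    if (next == null) break;
    foreach (var kvp in next.PublisherByStub)
    {
        foreach (var scrape in kvp.Value.scrapes)
        {
            // scrape holds all the data on the article, e.g. the extracted text:
            if (scrape.readability != null)
            {
                var text = scrape.readability.text.getCleanedText();
            }
        }
    }
}

ReadNextPublisher() hands back one publisher at a time, so memory stays bounded to roughly one publisher's worth of scrapes (about 1GB per publisher and day for large outlets, per the note in Program.cs).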
RetroExtractor/Program.cs (new file, 157 lines)
@@ -0,0 +1,157 @@
using BingScrapeData;
using BingScrapeReadLib;
using Newtonsoft.Json;

namespace RetroExtractor
{
    internal class Program
    {
        static void Main(string[] args)
        {
            //read config
            string config_file = "config.json";
            if (args.Length > 0)
            {
                config_file = args[0];
                if (!File.Exists(config_file))
                {
                    Console.WriteLine("Cannot find config file {0}", config_file);
                    return;
                }
            }
            config.populate(config_file);

            if (Directory.Exists(config.param.OutputFolder))
            {
                Console.WriteLine($"output folder {config.param.OutputFolder} already exists");
                return;
            }
            Directory.CreateDirectory(config.param.OutputFolder);

            //open data stores between start and end (start and end dates are inclusive)
            var startDate = config.param.ScrapeDate.Start.GetDate();
            var endDate = config.param.ScrapeDate.End.GetDate();
            var reader = new Reader(config.param.BaseFolder, startDate, endDate);

            //count the number of available stores
            var stubs = reader.GetStubs();
            Console.WriteLine($"{stubs.Length} stores are available between {startDate} and {endDate}:");
            if (stubs.Length > 0)
            {
                Console.WriteLine($"first available time stub: {stubs.First()}");
                Console.WriteLine($" last available time stub: {stubs.Last()}");
            }

            //now we open the stores for processing (this just reads the publisher indices across stores, not yet the entire store)
            reader.Open();
            //now we "seal" the stores and lock in the selection (you can include non-existing publishers in the selection - they will be ignored)
            var pubSelection = config.param.PubSelection;
            reader.SealStores(pubSelection);

            //get the list of publishers and check whether nytimes.com and washingtonpost.com are among them
            var pubs = reader.GetPublishers();
            Console.WriteLine($"publishers: {pubs.Length}");
            foreach (var pub in pubSelection)
            {
                if (pubs.Contains(pub))
                {
                    Console.WriteLine($"{pub}: exists in data");
                }
                else
                {
                    Console.WriteLine($"{pub}: does not exist in data");
                }
            }

            //now we retrieve the selected publishers until none are left
            //memory requirement for the client is about 1GB per publisher (for large ones like NYT) and day
            Console.WriteLine("Reading filtered publishers ..");
            var publishers = new List<string>();
            while (true)
            {
                Console.WriteLine($"-----------------------");
                var next = reader.ReadNextPublisher();
                if (next == null)
                {
                    //nothing left
                    break;
                }
                //now write some summary stats for this publisher
                Console.WriteLine($"Publisher: {next.pub}");
                int pubID = publishers.Count;
                publishers.Add(next.pub);
                Console.WriteLine($"{next.PublisherByStub.Count}/{stubs.Length} days: {JsonConvert.SerializeObject(next.PublisherByStub.Keys.ToArray())}");
                int pages = 0;
                var freshScrapes = new List<ProcessedBingData>();
                DateTime? start = null;
                DateTime? end = null;
                if (config.param.ArticleDate.ApplyFilter)
                {
                    start = config.param.ArticleDate.Start.GetDate();
                    end = config.param.ArticleDate.End.GetDate();
                }
                foreach (var kvp in next.PublisherByStub)
                {
                    var pubdata = kvp.Value;
                    pages += pubdata.scrapes.Count;
                    foreach (var scrape in pubdata.scrapes)
                    {
                        //scrape has all the data on the article
                        //for example, you get the text from the readability extractor as follows:
                        if (scrape.readability != null)
                        {
                            var text = scrape.readability.text.getCleanedText();
                        }
                        //we count the number of "fresh" pages with a publish date in the range that we pulled
                        bool isValidPage = false;
                        if (!config.param.ArticleDate.ApplyFilter)
                        {
                            isValidPage = true;
                        }
                        else
                        {
                            if (scrape.PublishDate != null)
                            {
                                var dt = scrape.PublishDate.Date;
                                if (dt > (DateTime)start && dt < (DateTime)end)
                                {
                                    isValidPage = true;
                                }
                            }
                        }
                        if (isValidPage)
                        {
                            freshScrapes.Add(scrape);
                        }
                    }
                }
                if (config.param.ArticleDate.ApplyFilter)
                {
                    Console.WriteLine($"filtered pages between {start} and {end}: {freshScrapes.Count}/{pages}");
                }
                else
                {
                    Console.WriteLine($"filtered pages (all): {freshScrapes.Count}/{pages}");
                }
                serializeLargeJSON2File(freshScrapes, $"{config.param.OutputFolder}/{pubID}.json");
            }
            var sw = new StreamWriter($"{config.param.OutputFolder}/publishers.txt");
            for (int i = 0; i < publishers.Count; i++)
            {
                sw.WriteLine(publishers[i]);
            }
            sw.Close();
        }

        static void serializeLargeJSON2File(object value, string fileName)
        {
            using (StreamWriter writer = new StreamWriter(fileName))
            using (JsonTextWriter jsonWriter = new JsonTextWriter(writer))
            {
                JsonSerializer ser = new JsonSerializer();
                ser.Serialize(jsonWriter, value);
                jsonWriter.Flush();
            }
        }
    }
}
RetroExtractor/RetroExtractor.csproj (new file, 23 lines)
@@ -0,0 +1,23 @@
<Project Sdk="Microsoft.NET.Sdk">

  <PropertyGroup>
    <OutputType>Exe</OutputType>
    <TargetFramework>net9.0</TargetFramework>
    <ImplicitUsings>enable</ImplicitUsings>
    <Nullable>enable</Nullable>
  </PropertyGroup>

  <ItemGroup>
    <PackageReference Include="BingScrapeReadLib" Version="2.0.0" />
  </ItemGroup>

  <ItemGroup>
    <None Update="config_blira_2025_unfiltered.json">
      <CopyToOutputDirectory>Always</CopyToOutputDirectory>
    </None>
    <None Update="config_blira_2025_filtered.json">
      <CopyToOutputDirectory>Always</CopyToOutputDirectory>
    </None>
  </ItemGroup>

</Project>
RetroExtractor/config_blira_2025_filtered.json (new file, 30 lines)
@@ -0,0 +1,30 @@
{
  "BaseFolder": "//nerds21/retro2025",
  "ScrapeDate": {
    "Start": {
      "Year": 2025,
      "Month": 6,
      "Day": 15
    },
    "End": {
      "Year": 2025,
      "Month": 6,
      "Day": 22
    }
  },
  "ArticleDate": {
    "ApplyFilter": true,
    "Start": {
      "Year": 2025,
      "Month": 6,
      "Day": 15
    },
    "End": {
      "Year": 2025,
      "Month": 6,
      "Day": 22
    }
  },
  "PubSelection": [ "bbc.com", "cnn.com", "nytimes.com", "washingtonpost.com" ],
  "OutputFolder": "export"
}
RetroExtractor/config_blira_2025_unfiltered.json (new file, 20 lines)
@@ -0,0 +1,20 @@
{
  "BaseFolder": "//nerds21/retro2025",
  "ScrapeDate": {
    "Start": {
      "Year": 2025,
      "Month": 6,
      "Day": 15
    },
    "End": {
      "Year": 2025,
      "Month": 6,
      "Day": 22
    }
  },
  "ArticleDate": {
    "ApplyFilter": false
  },
  "PubSelection": [ "bbc.com", "cnn.com", "nytimes.com", "washingtonpost.com" ],
  "OutputFolder": "export"
}
RetroExtractor/configuration.cs (new file, 54 lines)
@@ -0,0 +1,54 @@
using System;
using System.IO;
using Newtonsoft.Json;
using System.Threading;

namespace RetroExtractor
{
    public class Range
    {
        public int Year;
        public int Month;
        public int Day;

        public DateTime GetDate()
        {
            return new DateTime(Year, Month, Day);
        }
    }

    public class ScrapeRange
    {
        public Range Start;
        public Range End;
    }

    public class ArticleRange
    {
        public bool ApplyFilter;
        public Range Start;
        public Range End;
    }

    public class configuration
    {
        public string BaseFolder;
        public ScrapeRange ScrapeDate;
        public ArticleRange ArticleDate;
        public string[] PubSelection;
        public string OutputFolder;
    }

    static class config
    {
        public static configuration param;

        public static void populate(string configpath)
        {
            StreamReader sr = new StreamReader(configpath);
            param = JsonConvert.DeserializeObject<configuration>(sr.ReadToEnd());
        }
    }
}
Solution file:
@@ -7,9 +7,12 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "RetroSampleApp", "RetroSamp
 EndProject
 Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution Items", "{A68C05D6-1843-4166-A34A-528AE637AC73}"
 	ProjectSection(SolutionItems) = preProject
+		publishers_usa.txt = publishers_usa.txt
 		README.txt = README.txt
 	EndProjectSection
 EndProject
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "RetroExtractor", "RetroExtractor\RetroExtractor.csproj", "{95A8EC70-30EA-4452-B0EE-CF8192C9013B}"
+EndProject
 Global
 	GlobalSection(SolutionConfigurationPlatforms) = preSolution
 		Debug|Any CPU = Debug|Any CPU
@@ -20,6 +23,10 @@ Global
 		{4B4FD66B-68EF-4B63-9DED-6E4D22222C9E}.Debug|Any CPU.Build.0 = Debug|Any CPU
 		{4B4FD66B-68EF-4B63-9DED-6E4D22222C9E}.Release|Any CPU.ActiveCfg = Release|Any CPU
 		{4B4FD66B-68EF-4B63-9DED-6E4D22222C9E}.Release|Any CPU.Build.0 = Release|Any CPU
+		{95A8EC70-30EA-4452-B0EE-CF8192C9013B}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+		{95A8EC70-30EA-4452-B0EE-CF8192C9013B}.Debug|Any CPU.Build.0 = Debug|Any CPU
+		{95A8EC70-30EA-4452-B0EE-CF8192C9013B}.Release|Any CPU.ActiveCfg = Release|Any CPU
+		{95A8EC70-30EA-4452-B0EE-CF8192C9013B}.Release|Any CPU.Build.0 = Release|Any CPU
 	EndGlobalSection
 	GlobalSection(SolutionProperties) = preSolution
 		HideSolutionNode = FALSE
RetroSampleApp/Program.cs:
@@ -9,9 +9,9 @@ namespace RetroSampleApp
 static void Main(string[] args)
 {
-//open data stores between 2/15/2023 and 2/21/2023 (start and end dates are inclusive)
-var startDate = new DateTime(2023, 2, 15);
-var endDate = new DateTime(2023, 2, 21);
-var reader = new Reader("//tinybee/retroindexprocessed2023", startDate, endDate);
+//open data stores between 6/15/2025 and 6/22/2025 (start and end dates are inclusive)
+var startDate = new DateTime(2025, 6, 15);
+var endDate = new DateTime(2025, 6, 22);
+var reader = new Reader("//nerds21/retro2025", startDate, endDate);

 //count the number of available stores
 var stubs = reader.GetStubs();
RetroSampleApp/RetroSampleApp.csproj:
@@ -2,7 +2,7 @@

 <PropertyGroup>
 <OutputType>Exe</OutputType>
-<TargetFramework>net7.0</TargetFramework>
+<TargetFramework>net9.0</TargetFramework>
 <ImplicitUsings>enable</ImplicitUsings>
 <Nullable>enable</Nullable>
 </PropertyGroup>
RetroSampleApp/config_blira_2025.json (new file, 15 lines)
@@ -0,0 +1,15 @@
{
  "BaseFolder": "//nerds21/retro2025",
  "StartDate": {
    "Year": 2025,
    "Month": 6,
    "Day": 15
  },
  "EndDate": {
    "Year": 2025,
    "Month": 6,
    "Day": 21
  },
  "PubSelection": [ "bbc.com", "cnn.com", "nytimes.com", "washingtonpost.com" ]
}
publishers_usa.txt (new file, 6199 lines)
File diff suppressed because it is too large.