From 96f2823f47a759c9e607740c0ee451fd1d47bf84 Mon Sep 17 00:00:00 2001 From: Markus Mobius Date: Tue, 25 Jul 2023 01:57:00 -0400 Subject: [PATCH] Initial commit --- .gitignore | 398 +++++++++++++++++++++++++++ README.md | 1 + README.txt | 34 +++ RetroIndex.sln | 30 ++ RetroSampleApp/Program.cs | 92 +++++++ RetroSampleApp/RetroSampleApp.csproj | 15 + nuget.config | 6 + 7 files changed, 576 insertions(+) create mode 100644 .gitignore create mode 100644 README.md create mode 100644 README.txt create mode 100644 RetroIndex.sln create mode 100644 RetroSampleApp/Program.cs create mode 100644 RetroSampleApp/RetroSampleApp.csproj create mode 100644 nuget.config diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..8a30d25 --- /dev/null +++ b/.gitignore @@ -0,0 +1,398 @@ +## Ignore Visual Studio temporary files, build results, and +## files generated by popular Visual Studio add-ons. +## +## Get latest from https://github.com/github/gitignore/blob/main/VisualStudio.gitignore + +# User-specific files +*.rsuser +*.suo +*.user +*.userosscache +*.sln.docstates + +# User-specific files (MonoDevelop/Xamarin Studio) +*.userprefs + +# Mono auto generated files +mono_crash.* + +# Build results +[Dd]ebug/ +[Dd]ebugPublic/ +[Rr]elease/ +[Rr]eleases/ +x64/ +x86/ +[Ww][Ii][Nn]32/ +[Aa][Rr][Mm]/ +[Aa][Rr][Mm]64/ +bld/ +[Bb]in/ +[Oo]bj/ +[Ll]og/ +[Ll]ogs/ + +# Visual Studio 2015/2017 cache/options directory +.vs/ +# Uncomment if you have tasks that create the project's static files in wwwroot +#wwwroot/ + +# Visual Studio 2017 auto generated files +Generated\ Files/ + +# MSTest test Results +[Tt]est[Rr]esult*/ +[Bb]uild[Ll]og.* + +# NUnit +*.VisualState.xml +TestResult.xml +nunit-*.xml + +# Build Results of an ATL Project +[Dd]ebugPS/ +[Rr]eleasePS/ +dlldata.c + +# Benchmark Results +BenchmarkDotNet.Artifacts/ + +# .NET Core +project.lock.json +project.fragment.lock.json +artifacts/ + +# ASP.NET Scaffolding +ScaffoldingReadMe.txt + +# StyleCop +StyleCopReport.xml + +# 
Files built by Visual Studio +*_i.c +*_p.c +*_h.h +*.ilk +*.meta +*.obj +*.iobj +*.pch +*.pdb +*.ipdb +*.pgc +*.pgd +*.rsp +*.sbr +*.tlb +*.tli +*.tlh +*.tmp +*.tmp_proj +*_wpftmp.csproj +*.log +*.tlog +*.vspscc +*.vssscc +.builds +*.pidb +*.svclog +*.scc + +# Chutzpah Test files +_Chutzpah* + +# Visual C++ cache files +ipch/ +*.aps +*.ncb +*.opendb +*.opensdf +*.sdf +*.cachefile +*.VC.db +*.VC.VC.opendb + +# Visual Studio profiler +*.psess +*.vsp +*.vspx +*.sap + +# Visual Studio Trace Files +*.e2e + +# TFS 2012 Local Workspace +$tf/ + +# Guidance Automation Toolkit +*.gpState + +# ReSharper is a .NET coding add-in +_ReSharper*/ +*.[Rr]e[Ss]harper +*.DotSettings.user + +# TeamCity is a build add-in +_TeamCity* + +# DotCover is a Code Coverage Tool +*.dotCover + +# AxoCover is a Code Coverage Tool +.axoCover/* +!.axoCover/settings.json + +# Coverlet is a free, cross platform Code Coverage Tool +coverage*.json +coverage*.xml +coverage*.info + +# Visual Studio code coverage results +*.coverage +*.coveragexml + +# NCrunch +_NCrunch_* +.*crunch*.local.xml +nCrunchTemp_* + +# MightyMoose +*.mm.* +AutoTest.Net/ + +# Web workbench (sass) +.sass-cache/ + +# Installshield output folder +[Ee]xpress/ + +# DocProject is a documentation generator add-in +DocProject/buildhelp/ +DocProject/Help/*.HxT +DocProject/Help/*.HxC +DocProject/Help/*.hhc +DocProject/Help/*.hhk +DocProject/Help/*.hhp +DocProject/Help/Html2 +DocProject/Help/html + +# Click-Once directory +publish/ + +# Publish Web Output +*.[Pp]ublish.xml +*.azurePubxml +# Note: Comment the next line if you want to checkin your web deploy settings, +# but database connection strings (with potential passwords) will be unencrypted +*.pubxml +*.publishproj + +# Microsoft Azure Web App publish settings. 
Comment the next line if you want to +# checkin your Azure Web App publish settings, but sensitive information contained +# in these scripts will be unencrypted +PublishScripts/ + +# NuGet Packages +*.nupkg +# NuGet Symbol Packages +*.snupkg +# The packages folder can be ignored because of Package Restore +**/[Pp]ackages/* +# except build/, which is used as an MSBuild target. +!**/[Pp]ackages/build/ +# Uncomment if necessary however generally it will be regenerated when needed +#!**/[Pp]ackages/repositories.config +# NuGet v3's project.json files produces more ignorable files +*.nuget.props +*.nuget.targets + +# Microsoft Azure Build Output +csx/ +*.build.csdef + +# Microsoft Azure Emulator +ecf/ +rcf/ + +# Windows Store app package directories and files +AppPackages/ +BundleArtifacts/ +Package.StoreAssociation.xml +_pkginfo.txt +*.appx +*.appxbundle +*.appxupload + +# Visual Studio cache files +# files ending in .cache can be ignored +*.[Cc]ache +# but keep track of directories ending in .cache +!?*.[Cc]ache/ + +# Others +ClientBin/ +~$* +*~ +*.dbmdl +*.dbproj.schemaview +*.jfm +*.pfx +*.publishsettings +orleans.codegen.cs + +# Including strong name files can present a security risk +# (https://github.com/github/gitignore/pull/2483#issue-259490424) +#*.snk + +# Since there are multiple workflows, uncomment next line to ignore bower_components +# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) +#bower_components/ + +# RIA/Silverlight projects +Generated_Code/ + +# Backup & report files from converting an old project file +# to a newer Visual Studio version. 
Backup files are not needed, +# because we have git ;-) +_UpgradeReport_Files/ +Backup*/ +UpgradeLog*.XML +UpgradeLog*.htm +ServiceFabricBackup/ +*.rptproj.bak + +# SQL Server files +*.mdf +*.ldf +*.ndf + +# Business Intelligence projects +*.rdl.data +*.bim.layout +*.bim_*.settings +*.rptproj.rsuser +*- [Bb]ackup.rdl +*- [Bb]ackup ([0-9]).rdl +*- [Bb]ackup ([0-9][0-9]).rdl + +# Microsoft Fakes +FakesAssemblies/ + +# GhostDoc plugin setting file +*.GhostDoc.xml + +# Node.js Tools for Visual Studio +.ntvs_analysis.dat +node_modules/ + +# Visual Studio 6 build log +*.plg + +# Visual Studio 6 workspace options file +*.opt + +# Visual Studio 6 auto-generated workspace file (contains which files were open etc.) +*.vbw + +# Visual Studio 6 auto-generated project file (contains which files were open etc.) +*.vbp + +# Visual Studio 6 workspace and project file (working project files containing files to include in project) +*.dsw +*.dsp + +# Visual Studio 6 technical files +*.ncb +*.aps + +# Visual Studio LightSwitch build output +**/*.HTMLClient/GeneratedArtifacts +**/*.DesktopClient/GeneratedArtifacts +**/*.DesktopClient/ModelManifest.xml +**/*.Server/GeneratedArtifacts +**/*.Server/ModelManifest.xml +_Pvt_Extensions + +# Paket dependency manager +.paket/paket.exe +paket-files/ + +# FAKE - F# Make +.fake/ + +# CodeRush personal settings +.cr/personal + +# Python Tools for Visual Studio (PTVS) +__pycache__/ +*.pyc + +# Cake - Uncomment if you are using it +# tools/** +# !tools/packages.config + +# Tabs Studio +*.tss + +# Telerik's JustMock configuration file +*.jmconfig + +# BizTalk build output +*.btp.cs +*.btm.cs +*.odx.cs +*.xsd.cs + +# OpenCover UI analysis results +OpenCover/ + +# Azure Stream Analytics local run output +ASALocalRun/ + +# MSBuild Binary and Structured Log +*.binlog + +# NVidia Nsight GPU debugger configuration file +*.nvuser + +# MFractors (Xamarin productivity tool) working folder +.mfractor/ + +# Local History for Visual Studio +.localhistory/ + +# 
Visual Studio History (VSHistory) files +.vshistory/ + +# BeatPulse healthcheck temp database +healthchecksdb + +# Backup folder for Package Reference Convert tool in Visual Studio 2017 +MigrationBackup/ + +# Ionide (cross platform F# VS Code tools) working folder +.ionide/ + +# Fody - auto-generated XML schema +FodyWeavers.xsd + +# VS Code files for those working on multiple tools +.vscode/* +!.vscode/settings.json +!.vscode/tasks.json +!.vscode/launch.json +!.vscode/extensions.json +*.code-workspace + +# Local History for Visual Studio Code +.history/ + +# Windows Installer files from build outputs +*.cab +*.msi +*.msix +*.msm +*.msp + +# JetBrains Rider +*.sln.iml diff --git a/README.md b/README.md new file mode 100644 index 0000000..cafe975 --- /dev/null +++ b/README.md @@ -0,0 +1 @@ +# newsredmond \ No newline at end of file diff --git a/README.txt b/README.txt new file mode 100644 index 0000000..8e95ff2 --- /dev/null +++ b/README.txt @@ -0,0 +1,34 @@ +This program opens a range of datastores between two specified dates and extracts all the data +for a set of user-defined publishers. + + +RetroSampleApp Output +------------- + + + +7 stores are available between 2/15/2023 12:00:00 AM and 2/21/2023 12:00:00 AM: +first available time stub: 2023_02_15_00_00_00 + last available time stub: 2023_02_21_00_00_00 +Opening datastore 2023_02_15_00_00_00 from disk ... +Opening datastore 2023_02_16_00_00_00 from disk ... +Opening datastore 2023_02_17_00_00_00 from disk ... +Opening datastore 2023_02_18_00_00_00 from disk ... +Opening datastore 2023_02_19_00_00_00 from disk ... +Opening datastore 2023_02_20_00_00_00 from disk ... +Opening datastore 2023_02_21_00_00_00 from disk ... +publishers: 9020 +nytimes.com: exists in data +washingtonpost.com: exists in data +Reading filtered publishers .. 
+----------------------- +Publisher: nytimes.com +7/7 days: ["2023_02_15_00_00_00","2023_02_16_00_00_00","2023_02_17_00_00_00","2023_02_18_00_00_00","2023_02_19_00_00_00","2023_02_20_00_00_00","2023_02_21_00_00_00"] +total pages: 105223 +pages with publish data between 2/15/2023 12:00:00 AM and 2/21/2023 12:00:00 AM: 17030 +----------------------- +Publisher: washingtonpost.com +7/7 days: ["2023_02_15_00_00_00","2023_02_16_00_00_00","2023_02_17_00_00_00","2023_02_18_00_00_00","2023_02_19_00_00_00","2023_02_20_00_00_00","2023_02_21_00_00_00"] +total pages: 28779 +pages with publish data between 2/15/2023 12:00:00 AM and 2/21/2023 12:00:00 AM: 4143 +----------------------- diff --git a/RetroIndex.sln b/RetroIndex.sln new file mode 100644 index 0000000..6db5427 --- /dev/null +++ b/RetroIndex.sln @@ -0,0 +1,30 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio Version 17 +VisualStudioVersion = 17.6.33829.357 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "RetroSampleApp", "RetroSampleApp\RetroSampleApp.csproj", "{4B4FD66B-68EF-4B63-9DED-6E4D22222C9E}" +EndProject +Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution Items", "{A68C05D6-1843-4166-A34A-528AE637AC73}" + ProjectSection(SolutionItems) = preProject + README.txt = README.txt + EndProjectSection +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|Any CPU = Debug|Any CPU + Release|Any CPU = Release|Any CPU + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {4B4FD66B-68EF-4B63-9DED-6E4D22222C9E}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {4B4FD66B-68EF-4B63-9DED-6E4D22222C9E}.Debug|Any CPU.Build.0 = Debug|Any CPU + {4B4FD66B-68EF-4B63-9DED-6E4D22222C9E}.Release|Any CPU.ActiveCfg = Release|Any CPU + {4B4FD66B-68EF-4B63-9DED-6E4D22222C9E}.Release|Any CPU.Build.0 = Release|Any CPU + EndGlobalSection + GlobalSection(SolutionProperties) = 
preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {4A35D5D2-DB90-422B-AAD6-B4CE1C8036E7} + EndGlobalSection +EndGlobal diff --git a/RetroSampleApp/Program.cs b/RetroSampleApp/Program.cs new file mode 100644 index 0000000..27d1d11 --- /dev/null +++ b/RetroSampleApp/Program.cs @@ -0,0 +1,92 @@ +using BingScrapeReadLib; +using BingScrapeStore; +using Newtonsoft.Json; + +namespace RetroSampleApp +{ + internal class Program + { + static void Main(string[] args) + { + //open data stores between 2/15/2023 and 2/21/2023 (start and end dates are inclusive) + var startDate = new DateTime(2023, 2, 15); + var endDate = new DateTime(2023, 2, 21); + var reader = new Reader("//tinybee/retroindexprocessed2023", startDate, endDate); + + //count the number of available stores + var stubs = reader.GetStubs(); + Console.WriteLine($"{stubs.Length} stores are available between {startDate} and {endDate}:"); + if (stubs.Length > 0) + { + Console.WriteLine($"first available time stub: {stubs.First()}"); + Console.WriteLine($" last available time stub: {stubs.Last()}"); + } + + //now we open the stores for processing (this just reads the publisher indices across stores, not yet the entire store) + reader.Open(); + + //get list of publishers and check if nytimes.com and washingtonpost.com are among them + var pubs = reader.GetPublishers(); + Console.WriteLine($"publishers: {pubs.Length}"); + var pubSelection = new string[] { "nytimes.com", "washingtonpost.com" }; + foreach (var pub in pubSelection) + { + if (pubs.Contains(pub)) + { + Console.WriteLine($"{pub}: exists in data"); + } + else + { + Console.WriteLine($"{pub}: does not exist in data"); + } + } + //now we "seal" the stores and lock in the selection (you can include non-existing publishers in the selection - they will be ignored) + reader.SealStores(pubSelection); + + //now we retrieve the selected publishers one at a time until none are left + //memory requirement for the client 
is about 1GB per publisher (for large ones like NYT) per day + Console.WriteLine("Reading filtered publishers .."); + while (true) + { + Console.WriteLine("-----------------------"); + var next = reader.ReadNextPublisher(); + if (next == null) + { + //nothing left + break; + } + //now write some summary stats for this publisher + Console.WriteLine($"Publisher: {next.pub}"); + Console.WriteLine($"{next.PublisherByStub.Count}/{stubs.Length} days: {JsonConvert.SerializeObject(next.PublisherByStub.Keys.ToArray())}"); + int pages = 0; + int freshPages = 0; + foreach (var kvp in next.PublisherByStub) + { + var pubdata = kvp.Value; + pages += pubdata.scrapes.Count; + foreach (var scrape in pubdata.scrapes) + { + //scrape has all the data on the article + //for example you get the text using the readability extractor as follows: + if (scrape.readability != null) + { + var text = scrape.readability.text.getCleanedText(); + } + //we count the "fresh" pages whose publish date falls in the range that we pulled (start and end days inclusive, hence the half-open upper bound at endDate + 1 day) + if (scrape.PublishDate != null) + { + var dt = scrape.PublishDate.Date; + if (dt >= startDate && dt < endDate.AddDays(1)) + { + freshPages++; + } + } + } + } + Console.WriteLine($"total pages: {pages}"); + Console.WriteLine($"pages with publish data between {startDate} and {endDate}: {freshPages}"); + } + + } + } +} \ No newline at end of file diff --git a/RetroSampleApp/RetroSampleApp.csproj b/RetroSampleApp/RetroSampleApp.csproj new file mode 100644 index 0000000..c279721 --- /dev/null +++ b/RetroSampleApp/RetroSampleApp.csproj @@ -0,0 +1,15 @@ + + + + Exe + net7.0 + enable + enable + + + + + + + + diff --git a/nuget.config b/nuget.config new file mode 100644 index 0000000..af3c34c --- /dev/null +++ b/nuget.config @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file