Extract title & emoji from Omnivore

When an annotation starts with a Markdown heading ("# …"), that first line is used as the post title, and any emoji before the title's first space is extracted into its own field.
This commit is contained in:
JP Hastings-Spital 2024-05-10 09:38:21 +01:00
parent 796692cd5d
commit 716e206938
6 changed files with 1437 additions and 4 deletions

View file

@ -1,6 +1,7 @@
---
title: Slop is the new name for unwanted AI-generated content
title: AI "Slop"
date: "2024-05-09T16:03:02Z"
emoji: "\U0001F4A9"
bookmarkOf: https://simonwillison.net/2024/May/8/slop/
references:
bookmark:

View file

@ -16,6 +16,7 @@ require (
github.com/kelseyhightower/envconfig v1.4.0
github.com/mattn/go-mastodon v0.0.6
github.com/mmcdole/gofeed v1.3.0
github.com/stretchr/testify v1.8.2
golang.org/x/image v0.15.0
golang.org/x/net v0.21.0
golang.org/x/text v0.14.0
@ -82,7 +83,6 @@ require (
github.com/smartystreets/assertions v1.2.0 // indirect
github.com/smartystreets/goconvey v1.7.2 // indirect
github.com/spaolacci/murmur3 v1.1.0 // indirect
github.com/stretchr/testify v1.8.2 // indirect
github.com/tomnomnom/linkheader v0.0.0-20180905144013-02ca5825eb80 // indirect
github.com/warpfork/go-wish v0.0.0-20220906213052-39a1cc7a02d0 // indirect
github.com/whyrusleeping/cbor-gen v0.0.0-20230331140348-1f892b517e70 // indirect

View file

@ -15,6 +15,7 @@ import (
"strings"
"time"
"github.com/by-jp/www.byjp.me/tools/shared"
"github.com/joho/godotenv"
"gopkg.in/yaml.v2"
)
@ -98,8 +99,16 @@ func outputArticle(article Article, outputDir string) error {
fm.Date = article.BookmarkDate.Format(time.RFC3339)
}
article.Annotation = strings.TrimSpace(article.Annotation)
if len(fm.Title) == 0 {
fm.Title = article.Title
if strings.HasPrefix(article.Annotation, "# ") {
parts := strings.SplitAfterN(article.Annotation, "\n", 2)
article.Annotation = strings.TrimSpace(parts[1])
fm.Emoji, fm.Title = shared.ExtractLeadingEmoji(parts[0][2:])
} else {
fm.Title = article.Title
}
}
fm.BookmarkOf = article.OriginalURL
fm.Tags = removeDupes(append(fm.Tags, article.Tags...))
@ -126,7 +135,7 @@ func outputArticle(article Article, outputDir string) error {
}
fmt.Fprint(hugoPost, "---\n")
fmt.Fprintln(hugoPost, linkHashtags(strings.TrimSpace(article.Annotation), fm.Tags))
fmt.Fprintln(hugoPost, linkHashtags(article.Annotation, fm.Tags))
if len(article.Highlights) > 0 {
fmt.Fprint(hugoPost, "\n### Highlights\n")

1320
tools/shared/emoji-data.txt Normal file

File diff suppressed because it is too large Load diff

78
tools/shared/text.go Normal file
View file

@ -0,0 +1,78 @@
package shared
import (
"bufio"
"embed"
"fmt"
"math/big"
"strings"
)
//go:embed emoji-data.txt
var edf embed.FS
var emojiCodePoints map[rune]struct{}
// init loads the embedded emoji-data.txt and fills emojiCodePoints with every
// code point it names.
//
// For each line that is neither empty nor a '#' comment, the text before the
// first ';' is read as either a single hexadecimal code point or an inclusive
// "first..last" range of them. Lines without a ';' are skipped. Any failure to
// open, read or parse the file panics, because the package cannot work without
// the table.
func init() {
	// Largest valid Unicode code point; anything above it cannot be a rune.
	const maxCodePoint = 0x10FFFF

	f, err := edf.Open("emoji-data.txt")
	if err != nil {
		panic(err)
	}
	defer f.Close()

	// codeToRune parses one hexadecimal code point. A malformed or
	// out-of-range value panics rather than being silently stored as a
	// bogus rune.
	codeToRune := func(str string) rune {
		cp, ok := new(big.Int).SetString(str, 16)
		if !ok || !cp.IsInt64() || cp.Int64() < 0 || cp.Int64() > maxCodePoint {
			panic(fmt.Sprintf("emoji-data.txt: invalid code point %q", str))
		}
		return rune(cp.Int64())
	}

	emojiCodePoints = make(map[rune]struct{})

	scanner := bufio.NewScanner(f)
	scanner.Split(bufio.ScanLines)
	for scanner.Scan() {
		line := scanner.Text()
		if len(line) == 0 || strings.HasPrefix(line, "#") {
			continue
		}

		parts := strings.Split(line, ";")
		if len(parts) < 2 {
			continue
		}

		codepoints := strings.Split(strings.TrimSpace(parts[0]), "..")
		switch len(codepoints) {
		case 1:
			emojiCodePoints[codeToRune(codepoints[0])] = struct{}{}
		case 2:
			first := codeToRune(codepoints[0])
			last := codeToRune(codepoints[1])
			for r := first; r <= last; r++ {
				emojiCodePoints[r] = struct{}{}
			}
		default:
			panic("Unknown emoji-data.txt database")
		}
	}

	if err := scanner.Err(); err != nil {
		panic(err)
	}
}
// ExtractLeadingEmoji splits a leading emoji off the front of str.
//
// Surrounding whitespace is removed from str first. The text before the first
// space is then treated as the emoji candidate, and is accepted only when:
//   - every rune in it is present in emojiCodePoints, and
//   - at least one rune is outside ASCII. The Unicode emoji-data.txt also
//     lists '#', '*' and the digits 0-9 (the bases of keycap sequences), so
//     without this check a title such as "5 ideas" would lose its "5".
//
// It returns the emoji and the remaining text, trimmed of whitespace. When no
// leading emoji is found the emoji is "" and the text is the trimmed str.
//
// NB. emoji-data.txt needs to be up to date from https://www.unicode.org/Public/UCD/latest/ucd/emoji/emoji-data.txt
func ExtractLeadingEmoji(str string) (string, string) {
	// Trim up front so every return path yields clean text; callers may pass
	// a line that still carries its trailing newline.
	str = strings.TrimSpace(str)

	parts := strings.SplitN(str, " ", 2)
	if len(parts) == 1 {
		return "", str
	}

	candidate := parts[0]
	hasNonASCII := false
	for _, c := range candidate {
		if _, ok := emojiCodePoints[c]; !ok {
			return "", str
		}
		if c > 0x7F {
			hasNonASCII = true
		}
	}
	if !hasNonASCII {
		return "", str
	}

	return candidate, strings.TrimSpace(parts[1])
}

25
tools/shared/text_test.go Normal file
View file

@ -0,0 +1,25 @@
package shared_test
import (
"testing"
"github.com/by-jp/www.byjp.me/tools/shared"
"github.com/stretchr/testify/assert"
)
// TestExtractLeadingEmoji runs shared.ExtractLeadingEmoji over a table of
// inputs and checks both the extracted emoji and the remaining text.
func TestExtractLeadingEmoji(t *testing.T) {
	type example struct {
		input     string // text handed to ExtractLeadingEmoji
		wantEmoji string // expected first return value
		wantText  string // expected second return value
	}

	examples := []example{
		// Plain text: nothing is extracted and the input comes back as-is.
		{input: "No Emoji here", wantEmoji: "", wantText: "No Emoji here"},
		{input: "A single character start", wantEmoji: "", wantText: "A single character start"},
		// A leading emoji is split off at the first space.
		{input: "😊 The first emoji", wantEmoji: "😊", wantText: "The first emoji"},
		// Extra whitespace after the emoji is dropped from the text.
		{input: "😊 \t Extra space", wantEmoji: "😊", wantText: "Extra space"},
		// An emoji made of several code points is kept whole.
		{input: "🍋‍🟩 15.1 emoji", wantEmoji: "🍋‍🟩", wantText: "15.1 emoji"},
		// Input without any space is returned untouched.
		{input: "絵文字", wantEmoji: "", wantText: "絵文字"},
	}

	for i := range examples {
		ex := examples[i]
		gotEmoji, gotText := shared.ExtractLeadingEmoji(ex.input)
		assert.Equal(t, ex.wantEmoji, gotEmoji)
		assert.Equal(t, ex.wantText, gotText)
	}
}