Add cmd/datdump/main.go (#3213)

* Feat: add a new datdump tool

* Refactor: address code review comments

* Refactor: remove export all from main program

use datdump instead

* Refactor: allow spaces in exportlists

e.g. `--exportlists="lista, listb"`

* all: cleanup

* apply review suggestion

---------

Co-authored-by: database64128 <free122448@hotmail.com>
This commit is contained in:
MkQtS
2026-01-24 23:11:35 +08:00
committed by GitHub
parent 8e62b9b541
commit 5c38f34456
5 changed files with 303 additions and 116 deletions

View File

@@ -36,7 +36,8 @@ jobs:
- name: Build dlc.dat and plain lists
run: |
cd code || exit 1
go run ./ --outputdir=../ --exportlists=_all_,category-ads-all,tld-cn,cn,tld-\!cn,geolocation-\!cn,apple,icloud
go run ./ --outputdir=../ --exportlists=category-ads-all,tld-cn,cn,tld-\!cn,geolocation-\!cn,apple,icloud
go run ./cmd/datdump/main.go --inputdata=../dlc.dat --outputdir=../ --exportlists=_all_
cd ../ && rm -rf code
- name: Generate dlc.dat sha256 hash

2
.gitignore vendored
View File

@@ -8,5 +8,5 @@
dlc.dat
# Exported plaintext lists.
dlc.dat_plain.yml
/*.yml
/*.txt

164
cmd/datdump/main.go Normal file
View File

@@ -0,0 +1,164 @@
package main
import (
"bufio"
"flag"
"fmt"
"os"
"path/filepath"
"strings"
"github.com/v2fly/domain-list-community/internal/dlc"
router "github.com/v2fly/v2ray-core/v5/app/router/routercommon"
"google.golang.org/protobuf/proto"
)
var (
inputData = flag.String("inputdata", "dlc.dat", "Name of the geosite dat file")
outputDir = flag.String("outputdir", "./", "Directory to place all generated files")
exportLists = flag.String("exportlists", "", "Lists to be exported, separated by ',' (empty for _all_)")
)
type DomainRule struct {
Type string
Value string
Attrs []string
}
type DomainList struct {
Name string
Rules []DomainRule
}
func (d *DomainRule) domain2String() string {
dstring := d.Type + ":" + d.Value
if len(d.Attrs) != 0 {
dstring += ":@" + strings.Join(d.Attrs, ",@")
}
return dstring
}
func loadGeosite(path string) ([]DomainList, map[string]*DomainList, error) {
data, err := os.ReadFile(path)
if err != nil {
return nil, nil, fmt.Errorf("failed to read geosite file: %w", err)
}
vgeositeList := new(router.GeoSiteList)
if err := proto.Unmarshal(data, vgeositeList); err != nil {
return nil, nil, fmt.Errorf("failed to unmarshal: %w", err)
}
domainLists := make([]DomainList, len(vgeositeList.Entry))
domainListByName := make(map[string]*DomainList, len(vgeositeList.Entry))
for i, vsite := range vgeositeList.Entry {
rules := make([]DomainRule, 0, len(vsite.Domain))
for _, vdomain := range vsite.Domain {
rule := DomainRule{Value: vdomain.Value}
switch vdomain.Type {
case router.Domain_RootDomain:
rule.Type = dlc.RuleTypeDomain
case router.Domain_Regex:
rule.Type = dlc.RuleTypeRegexp
case router.Domain_Plain:
rule.Type = dlc.RuleTypeKeyword
case router.Domain_Full:
rule.Type = dlc.RuleTypeFullDomain
default:
return nil, nil, fmt.Errorf("invalid rule type: %+v", vdomain.Type)
}
for _, vattr := range vdomain.Attribute {
rule.Attrs = append(rule.Attrs, vattr.Key)
}
rules = append(rules, rule)
}
domainLists[i] = DomainList{
Name: strings.ToUpper(vsite.CountryCode),
Rules: rules,
}
domainListByName[domainLists[i].Name] = &domainLists[i]
}
return domainLists, domainListByName, nil
}
func exportSite(name string, domainListByName map[string]*DomainList) error {
domainList, ok := domainListByName[strings.ToUpper(name)]
if !ok {
return fmt.Errorf("list '%s' does not exist", name)
}
if len(domainList.Rules) == 0 {
return fmt.Errorf("list '%s' is empty", name)
}
file, err := os.Create(filepath.Join(*outputDir, name+".yml"))
if err != nil {
return err
}
defer file.Close()
w := bufio.NewWriter(file)
fmt.Fprintf(w, "%s:\n", name)
for _, domain := range domainList.Rules {
fmt.Fprintf(w, " - %q\n", domain.domain2String())
}
return w.Flush()
}
func exportAll(filename string, domainLists []DomainList) error {
file, err := os.Create(filepath.Join(*outputDir, filename))
if err != nil {
return err
}
defer file.Close()
w := bufio.NewWriter(file)
w.WriteString("lists:\n")
for _, domainList := range domainLists {
fmt.Fprintf(w, " - name: %s\n", strings.ToLower(domainList.Name))
fmt.Fprintf(w, " length: %d\n", len(domainList.Rules))
w.WriteString(" rules:\n")
for _, domain := range domainList.Rules {
fmt.Fprintf(w, " - %q\n", domain.domain2String())
}
}
return w.Flush()
}
func main() {
flag.Parse()
// Create output directory if not exist
if _, err := os.Stat(*outputDir); os.IsNotExist(err) {
if mkErr := os.MkdirAll(*outputDir, 0755); mkErr != nil {
fmt.Println("Failed to create output directory:", mkErr)
os.Exit(1)
}
}
fmt.Printf("Loading %s...\n", *inputData)
domainLists, domainListByName, err := loadGeosite(*inputData)
if err != nil {
fmt.Println("Failed to loadGeosite:", err)
os.Exit(1)
}
var exportListSlice []string
for raw := range strings.SplitSeq(*exportLists, ",") {
if trimmed := strings.TrimSpace(raw); trimmed != "" {
exportListSlice = append(exportListSlice, trimmed)
}
}
if len(exportListSlice) == 0 {
exportListSlice = []string{"_all_"}
}
for _, eplistname := range exportListSlice {
if strings.EqualFold(eplistname, "_all_") {
if err := exportAll(filepath.Base(*inputData)+"_plain.yml", domainLists); err != nil {
fmt.Println("Failed to exportAll:", err)
continue
}
} else {
if err := exportSite(eplistname, domainListByName); err != nil {
fmt.Println("Failed to exportSite:", err)
continue
}
}
fmt.Printf("list: '%s' has been exported successfully.\n", eplistname)
}
}

9
internal/dlc/dlc.go Normal file
View File

@@ -0,0 +1,9 @@
package dlc
const (
RuleTypeDomain string = "domain"
RuleTypeFullDomain string = "full"
RuleTypeKeyword string = "keyword"
RuleTypeRegexp string = "regexp"
RuleTypeInclude string = "include"
)

241
main.go
View File

@@ -10,6 +10,7 @@ import (
"slices"
"strings"
"github.com/v2fly/domain-list-community/internal/dlc"
router "github.com/v2fly/v2ray-core/v5/app/router/routercommon"
"google.golang.org/protobuf/proto"
)
@@ -21,21 +22,6 @@ var (
exportLists = flag.String("exportlists", "", "Lists to be flattened and exported in plaintext format, separated by ',' comma")
)
const (
RuleTypeDomain string = "domain"
RuleTypeFullDomain string = "full"
RuleTypeKeyword string = "keyword"
RuleTypeRegexp string = "regexp"
RuleTypeInclude string = "include"
)
var (
TypeChecker = regexp.MustCompile(`^(domain|full|keyword|regexp|include)$`)
DomainChecker = regexp.MustCompile(`^[a-z0-9\.-]+$`)
AttrChecker = regexp.MustCompile(`^[a-z0-9!-]+$`)
SiteChecker = regexp.MustCompile(`^[A-Z0-9!-]+$`)
)
var (
refMap = make(map[string][]*Entry)
plMap = make(map[string]*ParsedList)
@@ -66,7 +52,7 @@ type ParsedList struct {
func makeProtoList(listName string, entries []*Entry) (*router.GeoSite, error) {
site := &router.GeoSite{
CountryCode: listName,
Domain: make([]*router.Domain, 0, len(entries)),
Domain: make([]*router.Domain, 0, len(entries)),
}
for _, entry := range entries {
pdomain := &router.Domain{Value: entry.Value}
@@ -78,13 +64,13 @@ func makeProtoList(listName string, entries []*Entry) (*router.GeoSite, error) {
}
switch entry.Type {
case RuleTypeDomain:
case dlc.RuleTypeDomain:
pdomain.Type = router.Domain_RootDomain
case RuleTypeRegexp:
case dlc.RuleTypeRegexp:
pdomain.Type = router.Domain_Regex
case RuleTypeKeyword:
case dlc.RuleTypeKeyword:
pdomain.Type = router.Domain_Plain
case RuleTypeFullDomain:
case dlc.RuleTypeFullDomain:
pdomain.Type = router.Domain_Full
}
site.Domain = append(site.Domain, pdomain)
@@ -92,31 +78,12 @@ func makeProtoList(listName string, entries []*Entry) (*router.GeoSite, error) {
return site, nil
}
func writePlainAll(siteList *[]string) error {
file, err := os.Create(filepath.Join(*outputDir, *outputName + "_plain.yml"))
if err != nil {
return err
}
defer file.Close()
w := bufio.NewWriter(file)
w.WriteString("lists:\n")
for _, site := range *siteList {
fmt.Fprintf(w, " - name: %s\n", strings.ToLower(site))
fmt.Fprintf(w, " length: %d\n", len(finalMap[site]))
w.WriteString(" rules:\n")
for _, entry := range finalMap[site] {
fmt.Fprintf(w, " - %s\n", entry.Plain)
}
}
return w.Flush()
}
func writePlainList(exportedName string) error {
targetList, exist := finalMap[strings.ToUpper(exportedName)]
if !exist || len(targetList) == 0 {
return fmt.Errorf("'%s' list does not exist or is empty.", exportedName)
return fmt.Errorf("list %q does not exist or is empty.", exportedName)
}
file, err := os.Create(filepath.Join(*outputDir, strings.ToLower(exportedName) + ".txt"))
file, err := os.Create(filepath.Join(*outputDir, strings.ToLower(exportedName)+".txt"))
if err != nil {
return err
}
@@ -131,41 +98,43 @@ func writePlainList(exportedName string) error {
func parseEntry(line string) (Entry, error) {
var entry Entry
parts := strings.Fields(line)
if len(parts) == 0 {
return entry, fmt.Errorf("empty line: %q", line)
}
// Parse type and value
rawTypeVal := parts[0]
kv := strings.Split(rawTypeVal, ":")
if len(kv) == 1 {
entry.Type = RuleTypeDomain // Default type
entry.Value = strings.ToLower(rawTypeVal)
} else if len(kv) == 2 {
entry.Type = strings.ToLower(kv[0])
if entry.Type == RuleTypeRegexp {
entry.Value = kv[1]
} else if entry.Type == RuleTypeInclude {
entry.Value = strings.ToUpper(kv[1])
} else {
entry.Value = strings.ToLower(kv[1])
v := parts[0]
colonIndex := strings.Index(v, ":")
if colonIndex == -1 {
entry.Type = dlc.RuleTypeDomain // Default type
entry.Value = strings.ToLower(v)
if !validateDomainChars(entry.Value) {
return entry, fmt.Errorf("invalid domain: %q", entry.Value)
}
} else {
return entry, fmt.Errorf("invalid format: %s", line)
}
// Check type and value
if !TypeChecker.MatchString(entry.Type) {
return entry, fmt.Errorf("invalid type: %s", entry.Type)
}
switch entry.Type {
case RuleTypeRegexp:
if _, err := regexp.Compile(entry.Value); err != nil {
return entry, fmt.Errorf("invalid regexp: %s", entry.Value)
}
case RuleTypeInclude:
if !SiteChecker.MatchString(entry.Value) {
return entry, fmt.Errorf("invalid included list name: %s", entry.Value)
}
default: // `full`, `domain` and `keyword` are all (parts of) domains
if !DomainChecker.MatchString(entry.Value) {
return entry, fmt.Errorf("invalid domain: %s", entry.Value)
typ := strings.ToLower(v[:colonIndex])
val := v[colonIndex+1:]
switch typ {
case dlc.RuleTypeRegexp:
if _, err := regexp.Compile(val); err != nil {
return entry, fmt.Errorf("invalid regexp %q: %w", val, err)
}
entry.Type = dlc.RuleTypeRegexp
entry.Value = val
case dlc.RuleTypeInclude:
entry.Type = dlc.RuleTypeInclude
entry.Value = strings.ToUpper(val)
if !validateSiteName(entry.Value) {
return entry, fmt.Errorf("invalid include list name: %q", entry.Value)
}
case dlc.RuleTypeDomain, dlc.RuleTypeFullDomain, dlc.RuleTypeKeyword:
entry.Type = typ
entry.Value = strings.ToLower(val)
if !validateDomainChars(entry.Value) {
return entry, fmt.Errorf("invalid domain: %q", entry.Value)
}
default:
return entry, fmt.Errorf("invalid type: %q", typ)
}
}
@@ -173,18 +142,18 @@ func parseEntry(line string) (Entry, error) {
for _, part := range parts[1:] {
if strings.HasPrefix(part, "@") {
attr := strings.ToLower(part[1:]) // Trim attribute prefix `@` character
if !AttrChecker.MatchString(attr) {
return entry, fmt.Errorf("invalid attribute key: %s", attr)
if !validateAttrChars(attr) {
return entry, fmt.Errorf("invalid attribute: %q", attr)
}
entry.Attrs = append(entry.Attrs, attr)
} else if strings.HasPrefix(part, "&") {
aff := strings.ToUpper(part[1:]) // Trim affiliation prefix `&` character
if !SiteChecker.MatchString(aff) {
return entry, fmt.Errorf("invalid affiliation key: %s", aff)
if !validateSiteName(aff) {
return entry, fmt.Errorf("invalid affiliation: %q", aff)
}
entry.Affs = append(entry.Affs, aff)
} else {
return entry, fmt.Errorf("invalid attribute/affiliation: %s", part)
return entry, fmt.Errorf("invalid attribute/affiliation: %q", part)
}
}
// Sort attributes
@@ -198,6 +167,39 @@ func parseEntry(line string) (Entry, error) {
return entry, nil
}
func validateDomainChars(domain string) bool {
for i := range domain {
c := domain[i]
if (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') || c == '.' || c == '-' {
continue
}
return false
}
return true
}
func validateAttrChars(attr string) bool {
for i := range attr {
c := attr[i]
if (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') || c == '!' || c == '-' {
continue
}
return false
}
return true
}
func validateSiteName(name string) bool {
for i := range name {
c := name[i]
if (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '!' || c == '-' {
continue
}
return false
}
return true
}
func loadData(path string) error {
file, err := os.Open(path)
if err != nil {
@@ -206,7 +208,7 @@ func loadData(path string) error {
defer file.Close()
listName := strings.ToUpper(filepath.Base(path))
if !SiteChecker.MatchString(listName) {
if !validateSiteName(listName) {
return fmt.Errorf("invalid list name: %s", listName)
}
scanner := bufio.NewScanner(file)
@@ -238,7 +240,7 @@ func parseList(refName string, refList []*Entry) error {
plMap[refName] = pl
}
for _, entry := range refList {
if entry.Type == RuleTypeInclude {
if entry.Type == dlc.RuleTypeInclude {
if len(entry.Affs) != 0 {
return fmt.Errorf("affiliation is not allowed for include:%s", entry.Value)
}
@@ -272,18 +274,18 @@ func polishList(roughMap *map[string]*Entry) []*Entry {
domainsMap := make(map[string]bool)
for _, entry := range *roughMap {
switch entry.Type { // Bypass regexp, keyword and "full/domain with attr"
case RuleTypeRegexp:
case dlc.RuleTypeRegexp:
finalList = append(finalList, entry)
case RuleTypeKeyword:
case dlc.RuleTypeKeyword:
finalList = append(finalList, entry)
case RuleTypeDomain:
case dlc.RuleTypeDomain:
domainsMap[entry.Value] = true
if len(entry.Attrs) != 0 {
finalList = append(finalList, entry)
} else {
queuingList = append(queuingList, entry)
}
case RuleTypeFullDomain:
case dlc.RuleTypeFullDomain:
if len(entry.Attrs) != 0 {
finalList = append(finalList, entry)
} else {
@@ -295,14 +297,18 @@ func polishList(roughMap *map[string]*Entry) []*Entry {
for _, qentry := range queuingList {
isRedundant := false
pd := qentry.Value // To be parent domain
if qentry.Type == RuleTypeFullDomain {
if qentry.Type == dlc.RuleTypeFullDomain {
pd = "." + pd // So that `domain:example.org` overrides `full:example.org`
}
for {
idx := strings.Index(pd, ".")
if idx == -1 { break }
if idx == -1 {
break
}
pd = pd[idx+1:] // Go for next parent
if !strings.Contains(pd, ".") { break } // Not allow tld to be a parent
if !strings.Contains(pd, ".") {
break
} // Not allow tld to be a parent
if domainsMap[pd] {
isRedundant = true
break
@@ -320,7 +326,9 @@ func polishList(roughMap *map[string]*Entry) []*Entry {
}
func resolveList(pl *ParsedList) error {
if _, pldone := finalMap[pl.Name]; pldone { return nil }
if _, pldone := finalMap[pl.Name]; pldone {
return nil
}
if cirIncMap[pl.Name] {
return fmt.Errorf("circular inclusion in: %s", pl.Name)
@@ -329,14 +337,22 @@ func resolveList(pl *ParsedList) error {
defer delete(cirIncMap, pl.Name)
isMatchAttrFilters := func(entry *Entry, incFilter *Inclusion) bool {
if len(incFilter.MustAttrs) == 0 && len(incFilter.BanAttrs) == 0 { return true }
if len(entry.Attrs) == 0 { return len(incFilter.MustAttrs) == 0 }
if len(incFilter.MustAttrs) == 0 && len(incFilter.BanAttrs) == 0 {
return true
}
if len(entry.Attrs) == 0 {
return len(incFilter.MustAttrs) == 0
}
for _, m := range incFilter.MustAttrs {
if !slices.Contains(entry.Attrs, m) { return false }
if !slices.Contains(entry.Attrs, m) {
return false
}
}
for _, b := range incFilter.BanAttrs {
if slices.Contains(entry.Attrs, b) { return false }
if slices.Contains(entry.Attrs, b) {
return false
}
}
return true
}
@@ -348,7 +364,7 @@ func resolveList(pl *ParsedList) error {
for _, inc := range pl.Inclusions {
incPl, exist := plMap[inc.Source]
if !exist {
return fmt.Errorf("list '%s' includes a non-existent list: '%s'", pl.Name, inc.Source)
return fmt.Errorf("list %q includes a non-existent list: %q", pl.Name, inc.Source)
}
if err := resolveList(incPl); err != nil {
return err
@@ -395,54 +411,51 @@ func main() {
}
}
// Generate finalMap and sorted list of site names
siteList := make([]string, 0 ,len(plMap))
// Generate finalMap
for _, pl := range plMap {
siteList = append(siteList, pl.Name)
if err := resolveList(pl); err != nil {
fmt.Println("Failed to resolveList:", err)
os.Exit(1)
}
}
slices.Sort(siteList)
// Create output directory if not exist
if _, err := os.Stat(*outputDir); os.IsNotExist(err) {
if mkErr := os.MkdirAll(*outputDir, 0755); mkErr != nil {
fmt.Println("Failed:", mkErr)
fmt.Println("Failed to create output directory:", mkErr)
os.Exit(1)
}
}
// Export plaintext list
if *exportLists != "" {
exportedListSlice := strings.Split(*exportLists, ",")
for _, exportedList := range exportedListSlice {
if exportedList == "_all_" {
if err := writePlainAll(&siteList); err != nil {
fmt.Println("Failed to writePlainAll:", err)
continue
}
} else {
if err := writePlainList(exportedList); err != nil {
fmt.Println("Failed to write list:", err)
continue
}
}
fmt.Printf("list: '%s' has been generated successfully.\n", exportedList)
var exportListSlice []string
for raw := range strings.SplitSeq(*exportLists, ",") {
if trimmed := strings.TrimSpace(raw); trimmed != "" {
exportListSlice = append(exportListSlice, trimmed)
}
}
for _, exportList := range exportListSlice {
if err := writePlainList(exportList); err != nil {
fmt.Println("Failed to write list:", err)
continue
}
fmt.Printf("list %q has been generated successfully.\n", exportList)
}
// Generate dat file
protoList := new(router.GeoSiteList)
for _, siteName := range siteList { // So that protoList.Entry is sorted
site, err := makeProtoList(siteName, finalMap[siteName])
for siteName, siteEntries := range finalMap {
site, err := makeProtoList(siteName, siteEntries)
if err != nil {
fmt.Println("Failed to makeProtoList:", err)
os.Exit(1)
}
protoList.Entry = append(protoList.Entry, site)
}
// Sort protoList so the marshaled list is reproducible
slices.SortFunc(protoList.Entry, func(a, b *router.GeoSite) int {
return strings.Compare(a.CountryCode, b.CountryCode)
})
protoBytes, err := proto.Marshal(protoList)
if err != nil {