处理AI胡乱生成的乱摊子

This commit is contained in:
2025-09-07 20:36:02 +08:00
parent ba513e0827
commit c4522b974b
403 changed files with 22915 additions and 44424 deletions

View File

@@ -9,4 +9,5 @@
*.txt
vendor/
/removecomments
/removecomments
/snake2camel

View File

@@ -2,21 +2,34 @@
**A parser for URNs**.
> As seen on [RFC 2141](https://tools.ietf.org/html/rfc2141#ref-1).
> As seen on [RFC 2141](https://datatracker.ietf.org/doc/html/rfc2141), [RFC 7643](https://datatracker.ietf.org/doc/html/rfc7643#section-10), and on [RFC 8141](https://datatracker.ietf.org/doc/html/rfc8141).
[API documentation](https://godoc.org/github.com/leodido/go-urn).
Starting with version 1.3 this library also supports [RFC 7643 SCIM URNs](https://datatracker.ietf.org/doc/html/rfc7643#section-10).
Starting with version 1.4 this library also supports [RFC 8141 URNs (2017)](https://datatracker.ietf.org/doc/html/rfc8141).
## Installation
```
go get github.com/leodido/go-urn
```
## Features
1. RFC 2141 URNs parsing (default)
2. RFC 8141 URNs parsing (supersedes RFC 2141)
3. RFC 7643 SCIM URNs parsing
4. Normalization as per RFCs
5. Lexical equivalence as per RFCs
6. Precise, fine-grained errors
## Performances
This implementation is really fast.
Usually below ½ microsecond on my machine<sup>[1](#mymachine)</sup>.
Usually below 400 ns on my machine<sup>[1](#mymachine)</sup>.
Notice it also performs, while parsing:
@@ -24,35 +37,36 @@ Notice it also performs, while parsing:
2. specific-string normalization
```
ok/00/urn:a:b______________________________________/-4 20000000 265 ns/op 182 B/op 6 allocs/op
ok/01/URN:foo:a123,456_____________________________/-4 30000000 296 ns/op 200 B/op 6 allocs/op
ok/02/urn:foo:a123%2c456___________________________/-4 20000000 331 ns/op 208 B/op 6 allocs/op
ok/03/urn:ietf:params:scim:schemas:core:2.0:User___/-4 20000000 430 ns/op 280 B/op 6 allocs/op
ok/04/urn:ietf:params:scim:schemas:extension:enterp/-4 20000000 411 ns/op 312 B/op 6 allocs/op
ok/05/urn:ietf:params:scim:schemas:extension:enterp/-4 20000000 472 ns/op 344 B/op 6 allocs/op
ok/06/urn:burnout:nss______________________________/-4 30000000 257 ns/op 192 B/op 6 allocs/op
ok/07/urn:abcdefghilmnopqrstuvzabcdefghilm:x_______/-4 20000000 375 ns/op 213 B/op 6 allocs/op
ok/08/urn:urnurnurn:urn____________________________/-4 30000000 265 ns/op 197 B/op 6 allocs/op
ok/09/urn:ciao:@!=%2c(xyz)+a,b.*@g=$_'_____________/-4 20000000 307 ns/op 248 B/op 6 allocs/op
ok/10/URN:x:abc%1dz%2f%3az_________________________/-4 30000000 259 ns/op 212 B/op 6 allocs/op
no/11/URN:-xxx:x___________________________________/-4 20000000 445 ns/op 320 B/op 6 allocs/op
no/12/urn::colon:nss_______________________________/-4 20000000 461 ns/op 320 B/op 6 allocs/op
no/13/urn:abcdefghilmnopqrstuvzabcdefghilmn:specifi/-4 10000000 660 ns/op 320 B/op 6 allocs/op
no/14/URN:a!?:x____________________________________/-4 20000000 507 ns/op 320 B/op 6 allocs/op
no/15/urn:urn:NSS__________________________________/-4 20000000 429 ns/op 288 B/op 6 allocs/op
no/16/urn:white_space:NSS__________________________/-4 20000000 482 ns/op 320 B/op 6 allocs/op
no/17/urn:concat:no_spaces_________________________/-4 20000000 539 ns/op 328 B/op 7 allocs/op
no/18/urn:a:/______________________________________/-4 20000000 470 ns/op 320 B/op 7 allocs/op
no/19/urn:UrN:NSS__________________________________/-4 20000000 399 ns/op 288 B/op 6 allocs/op
ok/00/urn:a:b______________________________________/-10 51372006 109.0 ns/op 275 B/op 3 allocs/op
ok/01/URN:foo:a123,456_____________________________/-10 36024072 160.8 ns/op 296 B/op 6 allocs/op
ok/02/urn:foo:a123%2C456___________________________/-10 31901007 188.4 ns/op 320 B/op 7 allocs/op
ok/03/urn:ietf:params:scim:schemas:core:2.0:User___/-10 22736756 266.6 ns/op 376 B/op 6 allocs/op
ok/04/urn:ietf:params:scim:schemas:extension:enterp/-10 18291859 335.2 ns/op 408 B/op 6 allocs/op
ok/05/urn:ietf:params:scim:schemas:extension:enterp/-10 15283087 379.4 ns/op 440 B/op 6 allocs/op
ok/06/urn:burnout:nss______________________________/-10 39407593 155.1 ns/op 288 B/op 6 allocs/op
ok/07/urn:abcdefghilmnopqrstuvzabcdefghilm:x_______/-10 27832718 211.4 ns/op 307 B/op 4 allocs/op
ok/08/urn:urnurnurn:urn____________________________/-10 33269596 168.1 ns/op 293 B/op 6 allocs/op
ok/09/urn:ciao:!!*_________________________________/-10 41100675 148.8 ns/op 288 B/op 6 allocs/op
ok/10/urn:ciao:=@__________________________________/-10 37214253 149.7 ns/op 284 B/op 6 allocs/op
ok/11/urn:ciao:@!=%2C(xyz)+a,b.*@g=$_'_____________/-10 26534240 229.8 ns/op 336 B/op 7 allocs/op
ok/12/URN:x:abc%1Dz%2F%3az_________________________/-10 28166396 211.8 ns/op 336 B/op 7 allocs/op
no/13/URN:---xxx:x_________________________________/-10 23635159 255.6 ns/op 419 B/op 5 allocs/op
no/14/urn::colon:nss_______________________________/-10 23594779 258.4 ns/op 419 B/op 5 allocs/op
no/15/URN:@,:x_____________________________________/-10 23742535 261.5 ns/op 419 B/op 5 allocs/op
no/16/URN:URN:NSS__________________________________/-10 27432714 223.3 ns/op 371 B/op 5 allocs/op
no/17/urn:UrN:NSS__________________________________/-10 26922117 224.9 ns/op 371 B/op 5 allocs/op
no/18/urn:a:%______________________________________/-10 24926733 224.6 ns/op 371 B/op 5 allocs/op
no/19/urn:urn:NSS__________________________________/-10 27652641 220.7 ns/op 371 B/op 5 allocs/op
```
---
* <a name="mymachine">[1]</a>: Apple M1 Pro
* <a name="mymachine">[1]</a>: Intel Core i7-7600U CPU @ 2.80GHz
---
## Example
For more examples take a look at the [examples file](examples_test.go).
```go
package main
@@ -64,6 +78,35 @@ import (
func main() {
	var uid = "URN:foo:a123,456"

	// Parse the input as a RFC 2141 URN only.
	// Machine.Parse takes a byte slice, so the string is converted first.
	u, err := urn.NewMachine().Parse([]byte(uid))
	if err != nil {
		// Print the parsing error; building a new error value with
		// fmt.Errorf and discarding it would report nothing.
		fmt.Println(err)
		return
	}

	fmt.Println(u.ID)
	fmt.Println(u.SS)

	// Output:
	// foo
	// a123,456
}
```
```go
package main
import (
"fmt"
"github.com/leodido/go-urn"
)
func main() {
var uid = "URN:foo:a123,456"
// Parse the input string as a RFC 2141 URN only
u, ok := urn.Parse([]byte(uid))
if !ok {
panic("error parsing urn")
@@ -78,4 +121,33 @@ func main() {
}
```
[![Analytics](https://ga-beacon.appspot.com/UA-49657176-1/go-urn?flat)](https://github.com/igrigorik/ga-beacon)
```go
package main
import (
"fmt"
"github.com/leodido/go-urn"
)
func main() {
	raw := []byte("urn:ietf:params:scim:api:messages:2.0:ListResponse")

	// Restrict the parser to RFC 7643 SCIM URNs.
	parsed, ok := urn.Parse(raw, urn.WithParsingMode(urn.RFC7643Only))
	if !ok {
		panic("error parsing urn")
	}

	fmt.Println(parsed.IsSCIM())

	// The SCIM view exposes the type, name, and trailing "other" part.
	details := parsed.SCIM()
	fmt.Println(details.Type.String())
	fmt.Println(details.Name)
	fmt.Println(details.Other)

	// Output:
	// true
	// api
	// messages
	// 2.0:ListResponse
}
```

File diff suppressed because it is too large Load Diff

View File

@@ -2,15 +2,28 @@ package urn
import (
"fmt"
scimschema "github.com/leodido/go-urn/scim/schema"
)
// Error message templates used by the parsing actions.
// Every template takes a single integer argument: the column ([col %d]) at
// which the problem was detected.
var (
	errPrefix              = "expecting the prefix to be the \"urn\" string (whatever case) [col %d]"
	errIdentifier          = "expecting the identifier to be string (1..31 alnum chars, also containing dashes but not at its beginning) [col %d]"
	errSpecificString      = "expecting the specific string to be a string containing alnum, hex, or others ([()+,-.:=@;$_!*']) chars [col %d]"
	errNoUrnWithinID       = "expecting the identifier to not contain the \"urn\" reserved string [col %d]"
	errHex                 = "expecting the percent encoded chars to be well-formed (%%alnum{2}) [col %d]"
	errParse               = "parsing error [col %d]"
	errSCIMNamespace       = "expecting the SCIM namespace identifier (ietf:params:scim) [col %d]"
	errSCIMType            = "expecting a correct SCIM type (schemas, api, param) [col %d]"
	errSCIMName            = "expecting one or more alnum char in the SCIM name part [col %d]"
	errSCIMOther           = "expecting a well-formed other SCIM part [col %d]"
	errSCIMOtherIncomplete = "expecting a not empty SCIM other part after colon [col %d]"
	err8141InformalID      = "informal URN namespace must be in the form urn-[1-9][0-9] [col %d]"
	err8141SpecificString  = "expecting the specific string to contain alnum, hex, or others ([~&()+,-.:=@;$_!*'] or [/?] not in first position) chars [col %d]"
	err8141Identifier      = "expecting the identifier to be a string with (length 2 to 32 chars) containing alnum (or dashes) not starting or ending with a dash [col %d]"
	err8141RComponentStart = "expecting only one r-component (starting with the ?+ sequence) [col %d]"
	err8141QComponentStart = "expecting only one q-component (starting with the ?= sequence) [col %d]"
	err8141MalformedRComp  = "expecting a non-empty r-component containing alnum, hex, or others ([~&()+,-.:=@;$_!*'] or [/?] but not at its beginning) [col %d]"
	err8141MalformedQComp  = "expecting a non-empty q-component containing alnum, hex, or others ([~&()+,-.:=@;$_!*'] or [/?] but not at its beginning) [col %d]"
)
%%{
@@ -24,25 +37,42 @@ action mark {
}
action tolower {
	// Remember the offset (relative to the current mark, m.pb) of an
	// uppercase letter inside a percent-encoded triplet, so that it can be
	// lowercased later when the normalized form is built.
	output.tolower = append(output.tolower, m.p - m.pb)
}
# Stores the marked text (m.data[m.pb:m.p]) as the URN prefix.
action set_pre {
output.prefix = string(m.text())
}
# Rejects a namespace identifier that is itself the reserved "urn" string.
# Skipped entirely when parsing in RFC8141Only mode.
action throw_pre_urn_err {
if m.parsingMode != RFC8141Only {
// Throw an error when:
// - we are entering here matching the prefix in the namespace identifier part
//   (output.prefix is already set, so the real prefix has been consumed)
// - looking ahead (3 chars) we find a colon (58 is the ASCII code of ':')
if pos := m.p + 3; pos < m.pe && m.data[pos] == 58 && output.prefix != "" {
m.err = fmt.Errorf(errNoUrnWithinID, pos)
fhold;
fgoto fail;
}
}
}
# Stores the marked text as the namespace identifier (NID).
action set_nid {
output.ID = string(m.text())
}
action set_nss {
	// The namespace specific string is kept exactly as it was written.
	output.SS = string(m.text())
	// Lowercase, in place, the uppercase letters recorded by the tolower
	// action (adding 32 maps an ASCII uppercase letter to its lowercase).
	for _, i := range output.tolower {
		m.data[m.pb+i] = m.data[m.pb+i] + 32
	}
	// Snapshot the normalized form while the buffer is lowercased.
	output.norm = string(m.text())
	// Revert the buffer to the original so the caller's input is unchanged.
	for _, i := range output.tolower {
		m.data[m.pb+i] = m.data[m.pb+i] - 32
	}
}
action err_pre {
@@ -70,20 +100,20 @@ action err_urn {
}
action err_hex {
	// A malformed percent-encoded triplet is reported with errHex only in
	// the RFC 2141 and RFC 8141 modes; in any other mode this action does
	// nothing.
	if m.parsingMode == RFC2141Only || m.parsingMode == RFC8141Only {
		m.err = fmt.Errorf(errHex, m.p)
		fhold;
		fgoto fail;
	}
}
# Generic parsing failure: record errParse at the current position and jump
# to the fail machine.
action err_parse {
	m.err = fmt.Errorf(errParse, m.p)
	fhold;
	fgoto fail;
}

# Tags the resulting URN as a RFC 2141 one.
action base_type {
	output.kind = RFC2141;
}
# The "urn" prefix, in whatever case. err_pre is attached to each letter, and
# throw_pre_urn_err runs on entering so that a NID equal to the reserved
# "urn" string can be rejected.
pre = ([uU] @err(err_pre) [rR] @err(err_pre) [nN] @err(err_pre)) >mark >throw_pre_urn_err %set_pre;

# Namespace identifier: one alnum followed by up to 31 alnum or dash chars.
nid = (alnum >mark (alnum | '-'){0,31}) $err(err_nid) %set_nid;

# Percent-encoded triplet; uppercase letters are recorded via tolower.
hex = '%' (digit | lower | upper >tolower){2} $err(err_hex);
@@ -91,9 +121,179 @@ sss = (alnum | [()+,\-.:=@;$_!*']);
# Namespace specific string: one or more sss chars or percent-encoded triplets.
nss = (sss | hex)+ $err(err_nss);
# A namespace identifier that is not the "urn" prefix itself.
nid_not_urn = (nid - pre %err(err_urn));
# RFC 2141 URN: prefix, NID, NSS. base_type runs at end of input to tag the kind.
urn = pre ':' @err(err_pre) (nid_not_urn ':' nss >mark %set_nss) %eof(base_type);
### SCIM BEG
# Error actions: each records its message with the current position, then
# holds the current character (fhold) and jumps to the fail machine.
action err_scim_nid {
m.err = fmt.Errorf(errSCIMNamespace, m.p)
fhold;
fgoto fail;
}
action err_scim_type {
m.err = fmt.Errorf(errSCIMType, m.p)
fhold;
fgoto fail;
}
action err_scim_name {
m.err = fmt.Errorf(errSCIMName, m.p)
fhold;
fgoto fail;
}
# Distinguishes an input that ends where the "other" part should continue
# (reported at the previous column) from a malformed "other" part.
action err_scim_other {
if m.p == m.pe {
m.err = fmt.Errorf(errSCIMOtherIncomplete, m.p-1)
} else {
m.err = fmt.Errorf(errSCIMOther, m.p)
}
fhold;
fgoto fail;
}
# Tags the resulting URN as a RFC 7643 (SCIM) one.
action scim_type {
output.kind = RFC7643;
}
# Allocates the SCIM details that the following actions fill in.
action create_scim {
output.scim = &SCIM{};
}
# Converts the marked text into a SCIM type.
action set_scim_type {
output.scim.Type = scimschema.TypeFromString(string(m.text()))
}
# The name and "other" parts share output.scim.pos as their start position.
action mark_scim_name {
output.scim.pos = m.p
}
action set_scim_name {
output.scim.Name = string(m.data[output.scim.pos:m.p])
}
action mark_scim_other {
output.scim.pos = m.p
}
action set_scim_other {
output.scim.Other = string(m.data[output.scim.pos:m.p])
}
# The fixed SCIM namespace identifier.
scim_nid = 'ietf:params:scim' >mark %set_nid %create_scim $err(err_scim_nid);
# Optional trailing part: a colon followed by one or more sss/hex chars.
scim_other = ':' (sss | hex)+ >mark_scim_other %set_scim_other $err(err_scim_other);
# One or more alnum chars.
scim_name = (alnum)+ >mark_scim_name %set_scim_name $err(err_scim_name);
# One of the three accepted SCIM types.
scim_type = ('schemas' | 'api' | 'param') >mark %set_scim_type $err(err_scim_type);
# Entry point for SCIM-only parsing; scim_type runs at end of input.
scim_only := pre ':' @err(err_pre) (scim_nid ':' scim_type ':' scim_name scim_other? %set_nss) %eof(scim_type);
### SCIM END
### 8141 BEG

# Error actions: each records its message with the current position, holds
# the current character (fhold) and jumps to the fail machine.
action err_nss_8141 {
	m.err = fmt.Errorf(err8141SpecificString, m.p)
	fhold;
	fgoto fail;
}

action err_nid_8141 {
	m.err = fmt.Errorf(err8141Identifier, m.p)
	fhold;
	fgoto fail;
}

# Tags the resulting URN as a RFC 8141 one.
action rfc8141_type {
	output.kind = RFC8141;
}

# The r-, q- and f-components are stored from the marked text.
action set_r_component {
	output.rComponent = string(m.text())
}

action set_q_component {
	output.qComponent = string(m.text())
}

action set_f_component {
	output.fComponent = string(m.text())
}

# Fails with err8141InformalID for a "urn-" identifier that is not followed
# by a number without leading zero.
action informal_nid_match {
	fhold;
	m.err = fmt.Errorf(err8141InformalID, m.p);
	fgoto fail;
}

# Only one r-component start sequence is accepted: a second one is an error.
action mark_r_start {
	if output.rStart {
		m.err = fmt.Errorf(err8141RComponentStart, m.p)
		fhold;
		fgoto fail;
	}
	output.rStart = true
}

# Only one q-component start sequence is accepted: a second one is an error.
action mark_q_start {
	if output.qStart {
		m.err = fmt.Errorf(err8141QComponentStart, m.p)
		fhold;
		fgoto fail;
	}
	output.qStart = true
}

action err_malformed_r_component {
	m.err = fmt.Errorf(err8141MalformedRComp, m.p)
	fhold;
	fgoto fail;
}

action err_malformed_q_component {
	m.err = fmt.Errorf(err8141MalformedQComp, m.p)
	fhold;
	fgoto fail;
}

pchar = (sss | '~' | '&' | hex);

# A component cannot start with '/' or '?'.
component = pchar (pchar | '/' | '?')*;

r_start = ('?+') %mark_r_start;

r_component = r_start <: (r_start | component)+ $err(err_malformed_r_component) >mark %set_r_component;

q_start = ('?=') %mark_q_start;

q_component = q_start <: (q_start | component)+ $err(err_malformed_q_component) >mark %set_q_component;

rq_components = (r_component :>> q_component? | q_component);

fragment = (pchar | '/' | '?')*;

# The mark is set on the fragment, so the stored f-component has no '#'.
f_component = '#' fragment >mark %set_f_component;

nss_rfc8141 = (pchar >mark (pchar | '/')*) $err(err_nss_8141) %set_nss;

# 2 to 32 chars: alnum at both ends, alnum or dashes in between.
nid_rfc8141 = (alnum >mark (alnum | '-'){0,30} alnum) $err(err_nid_8141) %set_nid;

# The prefix followed by a dash and a letter or a zero.
# The upper-case range is A-Z: the former A-z range also spanned the
# punctuation chars sitting between 'Z' and 'a' in ASCII.
informal_id = pre ('-' [a-zA-Z0] %to(informal_nid_match));

nid_rfc8141_not_urn = (nid_rfc8141 - informal_id?);

# Entry point for RFC 8141-only parsing; rfc8141_type runs at end of input.
rfc8141_only := pre ':' @err(err_pre) nid_rfc8141_not_urn ':' nss_rfc8141 rq_components? f_component? %eof(rfc8141_type);

### 8141 END
# Error recovery: consume everything up to a line break, then hand control
# back to the main machine.
fail := (any - [\n\r])* @err{ fgoto main; };

# Default entry point: the RFC 2141 grammar.
main := urn;
}%%
@@ -103,6 +303,7 @@ main := (pre ':' (nid - pre %err(err_urn)) $err(err_nid) ':' nss >mark %set_nss)
type Machine interface {
// Error exposes the machine's error.
// NOTE(review): the implementation is not visible here — presumably the error of the last Parse call; confirm.
Error() error
// Parse parses the input bytes and returns the resulting URN, or an error.
Parse(input []byte) (*URN, error)
// WithParsingMode selects the parsing mode the machine uses.
WithParsingMode(ParsingMode)
}
type machine struct {
@@ -110,12 +311,24 @@ type machine struct {
cs int
p, pe, eof, pb int
err error
tolower []int
startParsingAt int
parsingMode ParsingMode
parsingModeSet bool
}
// NewMachine creates a new FSM able to parse RFC 2141 strings.
func NewMachine() Machine {
m := &machine{}
func NewMachine(options ...Option) Machine {
m := &machine{
parsingModeSet: false,
}
for _, o := range options {
o(m)
}
// Set default parsing mode
if !m.parsingModeSet {
m.WithParsingMode(DefaultParsingMode)
}
%% access m.;
%% variable p m.p;
@@ -137,7 +350,7 @@ func (m *machine) text() []byte {
return m.data[m.pb:m.p]
}
// Parse parses the input byte array as a RFC 2141 string.
// Parse parses the input byte array as a RFC 2141 or RFC7643 string.
func (m *machine) Parse(input []byte) (*URN, error) {
m.data = input
m.p = 0
@@ -145,10 +358,11 @@ func (m *machine) Parse(input []byte) (*URN, error) {
m.pe = len(input)
m.eof = len(input)
m.err = nil
m.tolower = []int{}
output := &URN{}
m.cs = m.startParsingAt
output := &URN{
tolower: []int{},
}
%% write init;
%% write exec;
if m.cs < first_final || m.cs == en_fail {
@@ -157,3 +371,16 @@ func (m *machine) Parse(input []byte) (*URN, error) {
return output, nil
}
// WithParsingMode selects the grammar the machine parses with.
//
// It stores the mode, points startParsingAt at the matching entry state
// (left untouched for a mode other than the three handled below), and flags
// the mode as explicitly set so that NewMachine does not apply the default.
func (m *machine) WithParsingMode(x ParsingMode) {
	m.parsingMode = x
	m.parsingModeSet = true

	switch x {
	case RFC7643Only:
		m.startParsingAt = en_scim_only
	case RFC8141Only:
		m.startParsingAt = en_rfc8141_only
	case RFC2141Only:
		m.startParsingAt = en_main
	}
}

View File

@@ -15,18 +15,24 @@ clean:
.PHONY: images
images: docs/urn.png
.PHONY: snake2camel
snake2camel:
@cd ./tools/snake2camel; go build -o ../../snake2camel .
.PHONY: removecomments
removecomments:
@cd ./tools/removecomments; go build -o ../../removecomments .
machine.go: machine.go.rl
machine.go: snake2camel
machine.go: removecomments
machine.go:
$(RAGEL) -Z -G2 -e -o $@ $<
$(RAGEL) -Z -G1 -e -o $@ $<
@./removecomments $@
$(MAKE) -s file=$@ snake2camel
@./snake2camel $@
$(GOFMT) $@
docs/urn.dot: machine.go.rl
@@ -41,13 +47,5 @@ bench: *_test.go machine.go
go test -bench=. -benchmem -benchtime=5s ./...
.PHONY: tests
tests: *_test.go
tests: *_test.go
$(GO_TEST) ./...
.PHONY: snake2camel
snake2camel:
@awk -i inplace '{ \
while ( match($$0, /(.*)([a-z]+[0-9]*)_([a-zA-Z0-9])(.*)/, cap) ) \
$$0 = cap[1] cap[2] toupper(cap[3]) cap[4]; \
print \
}' $(file)

View File

@@ -16,10 +16,18 @@ const errInvalidURN = "invalid URN: %s"
//
// Details at https://tools.ietf.org/html/rfc2141.
type URN struct {
prefix string // Static prefix. Equal to "urn" when empty.
ID string // Namespace identifier
SS string // Namespace specific string
norm string // Normalized namespace specific string
prefix string // Static prefix. Equal to "urn" when empty.
ID string // Namespace identifier (NID)
SS string // Namespace specific string (NSS)
norm string // Normalized namespace specific string
kind Kind
scim *SCIM
rComponent string // RFC8141
qComponent string // RFC8141
fComponent string // RFC8141
rStart bool // RFC8141
qStart bool // RFC8141
tolower []int
}
// Normalize turns the receiving URN into its norm version.
@@ -30,12 +38,21 @@ func (u *URN) Normalize() *URN {
prefix: "urn",
ID: strings.ToLower(u.ID),
SS: u.norm,
// rComponent: u.rComponent,
// qComponent: u.qComponent,
// fComponent: u.fComponent,
}
}
// Equal checks the lexical equivalence of the current URN with another one.
//
// A nil argument is never equal. Only the normalized prefix, ID, and SS are
// compared field by field: the URN struct holds a slice, so whole-struct
// comparison with == is not available.
func (u *URN) Equal(x *URN) bool {
	if x == nil {
		return false
	}
	nu := u.Normalize()
	nx := x.Normalize()

	return nu.prefix == nx.prefix && nu.ID == nx.ID && nu.SS == nx.SS
}
// String reassembles the URN into a valid URN string.
@@ -51,14 +68,23 @@ func (u *URN) String() string {
res += "urn"
}
res += u.prefix + ":" + u.ID + ":" + u.SS
if u.rComponent != "" {
res += "?+" + u.rComponent
}
if u.qComponent != "" {
res += "?=" + u.qComponent
}
if u.fComponent != "" {
res += "#" + u.fComponent
}
}
return res
}
// Parse is responsible to create an URN instance from a byte array matching the correct URN syntax.
func Parse(u []byte) (*URN, bool) {
urn, err := NewMachine().Parse(u)
// Parse is responsible to create an URN instance from a byte array matching the correct URN syntax (RFC 2141).
func Parse(u []byte, options ...Option) (*URN, bool) {
urn, err := NewMachine(options...).Parse(u)
if err != nil {
return nil, false
}
@@ -71,7 +97,7 @@ func (u URN) MarshalJSON() ([]byte, error) {
return json.Marshal(u.String())
}
// MarshalJSON unmarshals a URN from JSON string form (e.g. `"urn:oid:1.2.3.4"`).
// UnmarshalJSON unmarshals a URN from JSON string form (e.g. `"urn:oid:1.2.3.4"`).
func (u *URN) UnmarshalJSON(bytes []byte) error {
var str string
if err := json.Unmarshal(bytes, &str); err != nil {
@@ -82,5 +108,34 @@ func (u *URN) UnmarshalJSON(bytes []byte) error {
} else {
*u = *value
}
return nil
}
}
// IsSCIM reports whether the URN kind is RFC7643 (a SCIM URN).
func (u *URN) IsSCIM() bool {
return u.kind == RFC7643
}
// SCIM returns the SCIM details of the URN.
// It returns nil when the URN kind is not RFC7643.
func (u *URN) SCIM() *SCIM {
	if u.kind == RFC7643 {
		return u.scim
	}

	return nil
}
// RFC returns the kind of the URN.
func (u *URN) RFC() Kind {
return u.kind
}
// FComponent returns the f-component of the URN (RFC 8141).
// String() re-adds the leading "#" when the value is not empty.
func (u *URN) FComponent() string {
return u.fComponent
}
// QComponent returns the q-component of the URN (RFC 8141).
// String() re-adds the leading "?=" when the value is not empty.
func (u *URN) QComponent() string {
return u.qComponent
}
// RComponent returns the r-component of the URN (RFC 8141).
// String() re-adds the leading "?+" when the value is not empty.
func (u *URN) RComponent() string {
return u.rComponent
}