diff --git a/dataframe.go b/dataframe.go index 2a5621d32aff3d290b07249873d954a02e4659b1..5f548688bef7f65e2e9892fcefbe9c99eb43367d 100644 --- a/dataframe.go +++ b/dataframe.go @@ -2,7 +2,6 @@ package pandas import ( "fmt" - "reflect" "sort" "strconv" "strings" @@ -498,152 +497,6 @@ func WithComments(b rune) LoadOption { } } -// LoadStructs creates a new DataFrame from arbitrary struct slices. -// -// LoadStructs will ignore unexported fields inside an struct. Note also that -// unless otherwise specified the column names will correspond with the name of -// the field. -// -// You can configure each field with the `dataframe:"name[,type]"` struct -// tag. If the name on the tag is the empty string `""` the field name will be -// used instead. If the name is `"-"` the field will be ignored. -// -// Examples: -// -// // field will be ignored -// field int -// -// // Field will be ignored -// Field int `dataframe:"-"` -// -// // Field will be parsed with column name Field and type int -// Field int -// -// // Field will be parsed with column name `field_column` and type int. -// Field int `dataframe:"field_column"` -// -// // Field will be parsed with column name `field` and type string. -// Field int `dataframe:"field,string"` -// -// // Field will be parsed with column name `Field` and type string. -// Field int `dataframe:",string"` -// -// If the struct tags and the given LoadOptions contradict each other, the later -// will have preference over the former. 
-func LoadStructs(i interface{}, options ...LoadOption) DataFrame { - if i == nil { - return DataFrame{Err: fmt.Errorf("load: can't create DataFrame from value")} - } - - // Set the default load options - cfg := loadOptions{ - defaultType: SERIES_TYPE_STRING, - detectTypes: true, - hasHeader: true, - nanValues: []string{"NA", "NaN", ""}, - } - - // Set any custom load options - for _, option := range options { - option(&cfg) - } - - tpy, val := reflect.TypeOf(i), reflect.ValueOf(i) - switch tpy.Kind() { - case reflect.Slice: - if tpy.Elem().Kind() != reflect.Struct { - return DataFrame{Err: fmt.Errorf( - "load: type %s (%s %s) is not supported, must be []struct", tpy.Name(), tpy.Elem().Kind(), tpy.Kind())} - } - if val.Len() == 0 { - return DataFrame{Err: fmt.Errorf("load: can't create DataFrame from empty slice")} - } - - numFields := val.Index(0).Type().NumField() - var columns []Series - for j := 0; j < numFields; j++ { - // Extract field metadata - if !val.Index(0).Field(j).CanInterface() { - continue - } - field := val.Index(0).Type().Field(j) - fieldName := field.Name - fieldType := field.Type.String() - - // Process struct tags - fieldTags := field.Tag.Get("dataframe") - if fieldTags == "-" { - continue - } - tagOpts := strings.Split(fieldTags, ",") - if len(tagOpts) > 2 { - return DataFrame{Err: fmt.Errorf("malformed struct tag on field %s: %s", fieldName, fieldTags)} - } - if len(tagOpts) > 0 { - if name := strings.TrimSpace(tagOpts[0]); name != "" { - fieldName = name - } - if len(tagOpts) == 2 { - if tagType := strings.TrimSpace(tagOpts[1]); tagType != "" { - fieldType = tagType - } - } - } - - // Handle `types` option - var t Type - if cfgtype, ok := cfg.types[fieldName]; ok { - t = cfgtype - } else { - // Handle `detectTypes` option - if cfg.detectTypes { - // Parse field type - parsedType, err := parseType(fieldType) - if err != nil { - return DataFrame{Err: err} - } - t = parsedType - } else { - t = cfg.defaultType - } - } - - // Create Series for 
this field - elements := make([]interface{}, val.Len()) - for i := 0; i < val.Len(); i++ { - fieldValue := val.Index(i).Field(j) - elements[i] = fieldValue.Interface() - - // Handle `nanValues` option - if findInStringSlice(fmt.Sprint(elements[i]), cfg.nanValues) != -1 { - elements[i] = nil - } - } - - // Handle `hasHeader` option - if !cfg.hasHeader { - tmp := make([]interface{}, 1) - tmp[0] = fieldName - elements = append(tmp, elements...) - fieldName = "" - } - if t == SERIES_TYPE_STRING { - columns = append(columns, NewSeriesString(fieldName, elements)) - } else if t == SERIES_TYPE_BOOL { - columns = append(columns, NewSeriesBool(fieldName, elements)) - } else if t == SERIES_TYPE_INT { - columns = append(columns, NewSeriesInt64(fieldName, elements)) - } else { - // 默认float - columns = append(columns, NewSeriesFloat64(fieldName, elements)) - } - } - return NewDataFrame(columns...) - } - return DataFrame{Err: fmt.Errorf( - "load: type %s (%s) is not supported, must be []struct", tpy.Name(), tpy.Kind())} -} - func parseType(s string) (Type, error) { switch s { case "float", "float64", "float32": diff --git a/dataframe_csv.go b/dataframe_csv.go new file mode 100644 index 0000000000000000000000000000000000000000..ade134ca195bab80cd4b77755341609a9fac9671 --- /dev/null +++ b/dataframe_csv.go @@ -0,0 +1,135 @@ +package pandas + +import ( + "encoding/csv" + "github.com/mymmsc/gox/api" + "github.com/mymmsc/gox/logger" + "github.com/mymmsc/gox/util/homedir" + "io" + "os" +) + +// ReadCSV reads a CSV file from a io.Reader and builds a DataFrame with the +// resulting records. 
+//
+// in may be either an io.Reader, which is read directly, or a string holding
+// a file name. A file name is passed through homedir.Expand, opened with
+// os.Open, and handed to api.CloseQuietly when ReadCSV returns.
+//
+// The delimiter, lazyQuotes and comment load options configure the CSV
+// reader; all options are then forwarded to LoadRecords.
+//
+// Every failure is reported through the Err field of the returned DataFrame:
+// a file name that cannot be expanded or opened, an input that is neither an
+// io.Reader nor a non-empty file name (os.ErrInvalid), or a CSV parse error.
+func ReadCSV(in any, options ...LoadOption) DataFrame {
+	var reader io.Reader
+	switch param := in.(type) {
+	case io.Reader:
+		reader = param
+	case string:
+		// An empty file name is caught by the nil-reader check below.
+		if IsEmpty(param) {
+			break
+		}
+		path, err := homedir.Expand(param)
+		if err != nil {
+			logger.Errorf("%s, error=%+v\n", param, err)
+			return DataFrame{Err: err}
+		}
+		csvFile, err := os.Open(path)
+		if err != nil {
+			logger.Errorf("%s, error=%+v\n", param, err)
+			return DataFrame{Err: err}
+		}
+		defer api.CloseQuietly(csvFile)
+		reader = csvFile
+	}
+	// csv.NewReader accepts a nil reader and only fails, with a nil
+	// dereference, on the first read; reject unusable input up front instead.
+	if reader == nil {
+		return DataFrame{Err: os.ErrInvalid}
+	}
+
+	// Defaults for the CSV-level options; callers override them via options.
+	cfg := loadOptions{
+		delimiter:  ',',
+		lazyQuotes: false,
+		comment:    0,
+	}
+	for _, option := range options {
+		option(&cfg)
+	}
+
+	csvReader := csv.NewReader(reader)
+	csvReader.Comma = cfg.delimiter
+	csvReader.LazyQuotes = cfg.lazyQuotes
+	csvReader.Comment = cfg.comment
+
+	records, err := csvReader.ReadAll()
+	if err != nil {
+		return DataFrame{Err: err}
+	}
+	return LoadRecords(records, options...)
+}
+
+// WriteOption is the type used to configure the writing of elements.
+type WriteOption func(*writeOptions)
+
+// writeOptions collects the settings that the WriteOption functions modify.
+type writeOptions struct {
+	// Specifies whether the header is also written
+	writeHeader bool
+}
+
+// WriteHeader sets the writeHeader option for writeOptions.
+func WriteHeader(b bool) WriteOption {
+	return func(c *writeOptions) {
+		c.writeHeader = b
+	}
+}
+
+// WriteCSV writes the DataFrame to the given io.Writer as a CSV file.
+//
+// out may be either an io.Writer, which is written to directly, or a string
+// holding a file name. A file name is passed through homedir.Expand, created
+// (or truncated) with os.Create, and handed to api.CloseQuietly when WriteCSV
+// returns.
+//
+// The header row is written unless WriteHeader(false) is given.
+//
+// WriteCSV returns df.Err without touching the destination when the frame is
+// in an error state, os.ErrInvalid when out is neither an io.Writer nor a
+// non-empty file name, and otherwise any error from expanding or creating the
+// file or from writing the records.
+func (df DataFrame) WriteCSV(out any, options ...WriteOption) error {
+	// Do not create or truncate anything for a frame that failed to build.
+	if df.Err != nil {
+		return df.Err
+	}
+
+	var writer io.Writer
+	switch param := out.(type) {
+	case io.Writer:
+		writer = param
+	case string:
+		// An empty file name is caught by the nil-writer check below.
+		if IsEmpty(param) {
+			break
+		}
+		path, err := homedir.Expand(param)
+		if err != nil {
+			return err
+		}
+		csvFile, err := os.Create(path)
+		if err != nil {
+			return err
+		}
+		defer api.CloseQuietly(csvFile)
+		writer = csvFile
+	}
+	// A nil writer would make csv.Writer panic on its first write.
+	if writer == nil {
+		return os.ErrInvalid
+	}
+
+	// Set the default write options
+	cfg := writeOptions{
+		writeHeader: true,
+	}
+
+	// Set any custom write options
+	for _, option := range options {
+		option(&cfg)
+	}
+
+	records := df.Records()
+	// Drop the first record (the header) on request; the length guard keeps
+	// an empty record set from panicking on the slice expression.
+	if !cfg.writeHeader && len(records) > 0 {
+		records = records[1:]
+	}
+
+	return csv.NewWriter(writer).WriteAll(records)
+}
+
+// oldToCSV writes the DataFrame to the named file in CSV format. It is kept
+// as a thin wrapper around WriteCSV, which performs the file name expansion
+// and file creation itself when given a string.
+func (df DataFrame) oldToCSV(filename string, options ...WriteOption) error {
+	return df.WriteCSV(filename, options...)
+}
diff --git a/dataframe_records.go b/dataframe_records.go
new file mode 100644
index 0000000000000000000000000000000000000000..798a8717682c3600407c4dff2035a56414c242fb
--- /dev/null
+++ b/dataframe_records.go
@@ -0,0 +1,130 @@
+package pandas
+
+import (
+	"fmt"
+	"strconv"
+)
+
+// LoadRecords creates a new DataFrame based on the given records.
+//
+// records is a row-major table of strings. With the hasHeader option (the
+// default) the first record supplies the column names and is removed from the
+// data; the names option, when set, replaces those names and must have exactly
+// as many entries as the first record.
+//
+// Cells that match one of the nanValues are replaced by "NaN". A column's
+// type is taken from the types option when it names the column; otherwise it
+// is detected with findType when detectTypes is set, falling back to
+// defaultType.
+//
+// Problems are reported through the Err field of the returned DataFrame:
+// empty input, a header without data rows, a names/column count mismatch, a
+// data row with fewer fields than there are columns, or columns of differing
+// length.
+func LoadRecords(records [][]string, options ...LoadOption) DataFrame {
+	// Set the default load options
+	cfg := loadOptions{
+		defaultType: SERIES_TYPE_STRING,
+		detectTypes: true,
+		hasHeader:   true,
+		nanValues:   []string{"NA", "NaN", ""},
+	}
+
+	// Set any custom load options
+	for _, option := range options {
+		option(&cfg)
+	}
+
+	if len(records) == 0 {
+		return DataFrame{Err: fmt.Errorf("load records: empty DataFrame")}
+	}
+	if cfg.hasHeader && len(records) <= 1 {
+		return DataFrame{Err: fmt.Errorf("load records: empty DataFrame")}
+	}
+	if cfg.names != nil && len(cfg.names) != len(records[0]) {
+		if len(cfg.names) > len(records[0]) {
+			return DataFrame{Err: fmt.Errorf("load records: too many column names")}
+		}
+		return DataFrame{Err: fmt.Errorf("load records: not enough column names")}
+	}
+
+	// Extract headers
+	headers := make([]string, len(records[0]))
+	if cfg.hasHeader {
+		headers = records[0]
+		records = records[1:]
+	}
+	if cfg.names != nil {
+		headers = cfg.names
+	}
+
+	// The column-wise copy below indexes every row by column position, so a
+	// short row would panic with an index-out-of-range error. Report it as a
+	// load error instead.
+	for r, record := range records {
+		if len(record) < len(headers) {
+			return DataFrame{Err: fmt.Errorf(
+				"load records: row %d has %d fields, expected %d", r, len(record), len(headers))}
+		}
+	}
+
+	// Transpose the rows into per-column string slices and pick each
+	// column's type.
+	types := make([]Type, len(headers))
+	rawcols := make([][]string, len(headers))
+	for i, colname := range headers {
+		rawcol := make([]string, len(records))
+		for j := range records {
+			rawcol[j] = records[j][i]
+			if findInStringSlice(rawcol[j], cfg.nanValues) != -1 {
+				rawcol[j] = "NaN"
+			}
+		}
+		rawcols[i] = rawcol
+
+		t, ok := cfg.types[colname]
+		if !ok {
+			t = cfg.defaultType
+			if cfg.detectTypes {
+				if l, err := findType(rawcol); err == nil {
+					t = l
+				}
+			}
+		}
+		types[i] = t
+	}
+
+	columns := make([]Series, len(headers))
+	for i, colname := range headers {
+		cols := rawcols[i]
+		col := NewSeries(types[i], colname, cols)
+		//if col.Err != nil {
+		//	return DataFrame{Err: col.Err}
+		//}
+		columns[i] = *col
+	}
+	nrows, ncols, err := checkColumnsDimensions(columns...)
+	if err != nil {
+		return DataFrame{Err: err}
+	}
+	df := DataFrame{
+		columns: columns,
+		ncols:   ncols,
+		nrows:   nrows,
+	}
+
+	// Let fixColnames adjust the collected names in place, then apply the
+	// result to the columns.
+	colnames := df.Names()
+	fixColnames(colnames)
+	for i, colname := range colnames {
+		df.columns[i].Rename(colname)
+	}
+	return df
+}
+
+// findType inspects the string values of one column and picks a series type
+// for it. Empty strings and "NaN" are skipped. Each remaining value is
+// classified as an int (strconv.Atoi succeeds), a float (strconv.ParseFloat
+// succeeds), a bool (exactly "true" or "false"), or else a string.
+//
+// The result is the widest class seen, checked in the order string, bool,
+// float, int. When no value could be classified, findType returns
+// SERIES_TYPE_STRING together with an error.
+func findType(arr []string) (Type, error) {
+	var hasFloats, hasInts, hasBools, hasStrings bool
+	for _, str := range arr {
+		if str == "" || str == "NaN" {
+			continue
+		}
+		if _, err := strconv.Atoi(str); err == nil {
+			hasInts = true
+			continue
+		}
+		if _, err := strconv.ParseFloat(str, 64); err == nil {
+			hasFloats = true
+			continue
+		}
+		if str == "true" || str == "false" {
+			hasBools = true
+			continue
+		}
+		hasStrings = true
+	}
+
+	switch {
+	case hasStrings:
+		return SERIES_TYPE_STRING, nil
+	case hasBools:
+		return SERIES_TYPE_BOOL, nil
+	case hasFloats:
+		return SERIES_TYPE_FLOAT, nil
+	case hasInts:
+		return SERIES_TYPE_INT, nil
+	default:
+		return SERIES_TYPE_STRING, fmt.Errorf("couldn't detect type")
+	}
+}
diff --git a/dataframe_struct.go b/dataframe_struct.go
new file mode 100644
index 0000000000000000000000000000000000000000..6bd0a5ac51222b54f8cce96bedba378c3ec91eb1
--- /dev/null
+++ b/dataframe_struct.go
@@ -0,0 +1,153 @@
+package pandas
+
+import (
+	"fmt"
+	"reflect"
+	"strings"
+)
+
+// LoadStructs creates a new DataFrame from arbitrary struct slices.
+//
+// LoadStructs will ignore unexported fields inside a struct. Note also that
+// unless otherwise specified the column names will correspond with the name of
+// the field.
+//
+// You can configure each field with the `dataframe:"name[,type]"` struct
+// tag. If the name on the tag is the empty string `""` the field name will be
+// used instead. If the name is `"-"` the field will be ignored.
+//
+// Examples:
+//
+//	// field will be ignored
+//	field int
+//
+//	// Field will be ignored
+//	Field int `dataframe:"-"`
+//
+//	// Field will be parsed with column name Field and type int
+//	Field int
+//
+//	// Field will be parsed with column name `field_column` and type int.
+//	Field int `dataframe:"field_column"`
+//
+//	// Field will be parsed with column name `field` and type string.
+//	Field int `dataframe:"field,string"`
+//
+//	// Field will be parsed with column name `Field` and type string.
+//	Field int `dataframe:",string"`
+//
+// If the struct tags and the given LoadOptions contradict each other, the
+// latter will have preference over the former.
+//
+// Errors (nil input, a value that is not a slice of structs, an empty slice,
+// a malformed struct tag, or a field type parseType rejects) are reported
+// through the Err field of the returned DataFrame.
+func LoadStructs(i interface{}, options ...LoadOption) DataFrame {
+	if i == nil {
+		return DataFrame{Err: fmt.Errorf("load: can't create DataFrame from value")}
+	}
+
+	// Set the default load options
+	cfg := loadOptions{
+		defaultType: SERIES_TYPE_STRING,
+		detectTypes: true,
+		hasHeader:   true,
+		nanValues:   []string{"NA", "NaN", ""},
+	}
+
+	// Set any custom load options
+	for _, option := range options {
+		option(&cfg)
+	}
+
+	tpy, val := reflect.TypeOf(i), reflect.ValueOf(i)
+	if tpy.Kind() != reflect.Slice {
+		return DataFrame{Err: fmt.Errorf(
+			"load: type %s (%s) is not supported, must be []struct", tpy.Name(), tpy.Kind())}
+	}
+	if tpy.Elem().Kind() != reflect.Struct {
+		return DataFrame{Err: fmt.Errorf(
+			"load: type %s (%s %s) is not supported, must be []struct", tpy.Name(), tpy.Elem().Kind(), tpy.Kind())}
+	}
+	if val.Len() == 0 {
+		return DataFrame{Err: fmt.Errorf("load: can't create DataFrame from empty slice")}
+	}
+
+	structType := val.Index(0).Type()
+	numFields := structType.NumField()
+	var columns []Series
+	for j := 0; j < numFields; j++ {
+		// Skip fields whose value cannot be read through Interface
+		// (unexported fields).
+		if !val.Index(0).Field(j).CanInterface() {
+			continue
+		}
+		field := structType.Field(j)
+		fieldName := field.Name
+		fieldType := field.Type.String()
+
+		// Process struct tags
+		fieldTags := field.Tag.Get("dataframe")
+		if fieldTags == "-" {
+			continue
+		}
+		// strings.Split always yields at least one element, so tagOpts[0]
+		// is safe to read.
+		tagOpts := strings.Split(fieldTags, ",")
+		if len(tagOpts) > 2 {
+			return DataFrame{Err: fmt.Errorf("malformed struct tag on field %s: %s", fieldName, fieldTags)}
+		}
+		if name := strings.TrimSpace(tagOpts[0]); name != "" {
+			fieldName = name
+		}
+		if len(tagOpts) == 2 {
+			if tagType := strings.TrimSpace(tagOpts[1]); tagType != "" {
+				fieldType = tagType
+			}
+		}
+
+		// Column type: the `types` option wins; otherwise the (possibly
+		// tag-overridden) field type is parsed when `detectTypes` is set,
+		// and the default type is used when it is not.
+		t, ok := cfg.types[fieldName]
+		if !ok {
+			t = cfg.defaultType
+			if cfg.detectTypes {
+				parsedType, err := parseType(fieldType)
+				if err != nil {
+					return DataFrame{Err: err}
+				}
+				t = parsedType
+			}
+		}
+
+		// Create Series for this field
+		elements := make([]interface{}, val.Len())
+		for row := 0; row < val.Len(); row++ {
+			elements[row] = val.Index(row).Field(j).Interface()
+
+			// Handle `nanValues` option
+			if findInStringSlice(fmt.Sprint(elements[row]), cfg.nanValues) != -1 {
+				elements[row] = nil
+			}
+		}
+
+		// Handle `hasHeader` option: without a header the field name
+		// becomes the first element and the column itself is unnamed.
+		if !cfg.hasHeader {
+			tmp := make([]interface{}, 1)
+			tmp[0] = fieldName
+			elements = append(tmp, elements...)
+			fieldName = ""
+		}
+		switch t {
+		case SERIES_TYPE_STRING:
+			columns = append(columns, NewSeriesString(fieldName, elements))
+		case SERIES_TYPE_BOOL:
+			columns = append(columns, NewSeriesBool(fieldName, elements))
+		case SERIES_TYPE_INT:
+			columns = append(columns, NewSeriesInt64(fieldName, elements))
+		default:
+			// Every other type falls back to a float64 series.
+			columns = append(columns, NewSeriesFloat64(fieldName, elements))
+		}
+	}
+	return NewDataFrame(columns...)
+}
diff --git a/series_frame.go b/series_frame.go
index 8784e0b75d2acf4eb5de882c3b55e4326c349e86..013efdac08eadf8c400b55728d40dd9ec43bb0fd 100644
--- a/series_frame.go
+++ b/series_frame.go
@@ -11,80 +11,21 @@ type SeriesFrame struct {
 	lock     sync.RWMutex
 	name     string
 	nilCount int
-	elements any
+	//elements any
 }
 
-func NewSeriesFrame(name string, vals ...interface{}) *SeriesFrame {
-	series := SeriesFloat64{
-		SeriesFrame: SeriesFrame{
-			name:         name,
-			nilCount:     0,
-			valFormatter: DefaultValueFormatter,
-		},
-		Data: []float64{},
-	}
-
-	series.Data = make([]float64, 0) // Warning: filled with 0.0 (not NaN)
-	size := len(series.Data)
-	for idx, v := range vals {
-		if fs, ok := v.([]float64); ok {
-			for idx, v := range fs {
-				val := AnyToFloat64(v)
-				if isNaN(val) {
-					series.nilCount++
-				}
-				if idx < size {
-					series.Data[idx] = val
-				} else {
-					series.Data = append(series.Data, val)
-				}
-			}
-			continue
-		} else if fs, ok := v.([]any); ok {
-			for idx, v := range fs {
-				val := AnyToFloat64(v)
-				if isNaN(val) {
-					series.nilCount++
-				}
-				if idx < size {
-					series.Data[idx] = val
-				} else {
-					series.Data = append(series.Data, val)
-				}
-			}
-			continue
-		}
-
-		val := AnyToFloat64(v)
-		if isNaN(val) {
-			series.nilCount++
-		}
-
-		if idx < size {
-			series.Data[idx] = val
-		} else {
-			series.Data = append(series.Data, val)
-		}
-	}
-
-	var lVals int
-	if len(vals) > 0 {
-		if fs, ok := vals[0].([]float64); ok {
-			lVals = len(fs)
-		} else {
-			lVals = len(vals)
-		}
-	}
-
-	if lVals < size {
-		series.nilCount = series.nilCount + size - lVals
-		// Fill with NaN
-		for i := lVals; i < size; i++ {
-			series.Data[i] = nan()
-		}
+// NewSeries builds a series of the requested type t from vals and returns a
+// pointer to it. SERIES_TYPE_BOOL, SERIES_TYPE_INT and SERIES_TYPE_STRING map
+// to NewSeriesBool, NewSeriesInt64 and NewSeriesString; any other type falls
+// back to NewSeriesFloat64.
+func NewSeries(t Type, name string, vals ...interface{}) *Series {
+	var series Series
+	switch t {
+	case SERIES_TYPE_BOOL:
+		series = NewSeriesBool(name, vals...)
+	case SERIES_TYPE_INT:
+		series = NewSeriesInt64(name, vals...)
+	case SERIES_TYPE_STRING:
+		series = NewSeriesString(name, vals...)
+	default:
+		series = NewSeriesFloat64(name, vals...)
 	}
-	series.SeriesFrame.elements = series.Data
-	return &series.SeriesFrame
+	return &series
 }
 
 func Shift[T ~int64 | ~float64 | ~bool | ~string](s *Series, periods int, cbNan func() T) *Series {
@@ -123,3 +64,68 @@ func Shift[T ~int64 | ~float64 | ~bool | ~string](s *Series, periods int, cbNan
 	_ = naVals
 	return &d
 }
+
+// Name is not implemented yet; calling it panics.
+func (sf *SeriesFrame) Name() string {
+	//TODO implement me
+	panic("implement me")
+}
+
+// Rename is not implemented yet; calling it panics.
+func (sf *SeriesFrame) Rename(n string) {
+	//TODO implement me
+	panic("implement me")
+}
+
+// Type is not implemented yet; calling it panics.
+func (sf *SeriesFrame) Type() Type {
+	//TODO implement me
+	panic("implement me")
+}
+
+// Len is not implemented yet; calling it panics.
+func (sf *SeriesFrame) Len() int {
+	//TODO implement me
+	panic("implement me")
+}
+
+// Values is not implemented yet; calling it panics.
+func (sf *SeriesFrame) Values() any {
+	//TODO implement me
+	panic("implement me")
+}
+
+// Empty is not implemented yet; calling it panics.
+func (sf *SeriesFrame) Empty() Series {
+	//TODO implement me
+	panic("implement me")
+}
+
+// Records is not implemented yet; calling it panics.
+func (sf *SeriesFrame) Records() []string {
+	//TODO implement me
+	panic("implement me")
+}
+
+// Subset is not implemented yet; calling it panics.
+func (sf *SeriesFrame) Subset(start, end int) *Series {
+	//TODO implement me
+	panic("implement me")
+}
+
+// Repeat is not implemented yet; calling it panics.
+func (sf *SeriesFrame) Repeat(x any, repeats int) *Series {
+	//TODO implement me
+	panic("implement me")
+}
+
+// Shift is not implemented yet; calling it panics.
+func (sf *SeriesFrame) Shift(periods int) *Series {
+	//TODO implement me
+	panic("implement me")
+}
+
+// Rolling is not implemented yet; calling it panics.
+func (sf *SeriesFrame) Rolling(window int) RollingWindow {
+	//TODO implement me
+	panic("implement me")
+}
+
+// Mean is not implemented yet; calling it panics.
+func (sf *SeriesFrame) Mean() float64 {
+	//TODO implement me
+	panic("implement me")
+}
+
+// StdDev is not implemented yet; calling it panics.
+func (sf *SeriesFrame) StdDev() float64 {
+	//TODO implement me
+	panic("implement me")
+}
diff --git a/series_frame_test.go b/series_frame_test.go
index e9d3c919b8a514fb2f5b5c25dc8524017c46bdd0..11ae574e8a78d550adadfce9af8b153cd705fed9 100644
--- a/series_frame_test.go
+++ b/series_frame_test.go
@@ -7,7 +7,12 @@ import (
 )
 
 func TestNewSeriesFrame(t
*testing.T) { + + //sf := NewSeries(SERIES_TYPE_STRING, "x", []string{"1", "2", "3"}) + //fmt.Println(sf) + type args struct { + t string name string vals []interface{} } @@ -20,8 +25,8 @@ func TestNewSeriesFrame(t *testing.T) { } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - if got := NewSeriesFrame(tt.args.name, tt.args.vals...); !reflect.DeepEqual(got, tt.want) { - t.Errorf("NewSeriesFrame() = %v, want %v", got, tt.want) + if got := NewSeries(tt.args.t, tt.args.name, tt.args.vals...); !reflect.DeepEqual(got, tt.want) { + t.Errorf("NewSeries() = %v, want %v", got, tt.want) } }) } @@ -29,11 +34,11 @@ func TestNewSeriesFrame(t *testing.T) { func TestSeriesFrame(t *testing.T) { data := []float64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10} - s1 := NewSeriesFrame("x", data) + s1 := NewSeries(SERIES_TYPE_FLOAT, "x", data) fmt.Printf("%+v\n", s1) var d1 any d1 = data - s2 := NewSeriesFrame("x", d1) + s2 := NewSeries(SERIES_TYPE_FLOAT, "x", d1) fmt.Printf("%+v\n", s2) } diff --git a/tests/series1_test.go b/tests/series1_test.go index 15fa3a189fae24f51b1d2e6d712a5757cde4c13f..8c1f953dd1d558cea6338119a3bcc4c5aea66452 100644 --- a/tests/series1_test.go +++ b/tests/series1_test.go @@ -1,8 +1,10 @@ package tests import ( + "bytes" "encoding/json" "fmt" + "gitee.com/quant1x/pandas" "strings" "testing" ) @@ -19,8 +21,14 @@ Country,Date,Age,Amount,Id "United States",2012-02-01,32,321.31,54320 Spain,2012-02-01,66,555.42,00241 ` - //df := dframe.ReadCSV(strings.NewReader(csvStr)) - //fmt.Println(df) + df := pandas.ReadCSV(strings.NewReader(csvStr)) + fmt.Println(df) + filename := "tutorials.csv" + _ = df.WriteCSV(filename) + buf := new(bytes.Buffer) + _ = df.WriteCSV(buf) + df = pandas.ReadCSV(filename) + fmt.Println(df) //df.SetNames("a", "b", "c", "d", "e") //s1 := df.Col("d") //fmt.Println(s1) @@ -29,8 +37,8 @@ Spain,2012-02-01,66,555.42,00241 //ma5 := closes.Rolling(5).Mean() //dframe.NewSeries(closes, dframe.Float, "") //fmt.Println(ma5) - _ = csvStr + } type T1 struct 
{ diff --git a/type_string.go b/type_string.go index 3925629cfe62daac7849841b92ccb8594b0559c3..30c8d3d0d5ee5a2a4eaf5a7dbc9d986dad6a2949 100644 --- a/type_string.go +++ b/type_string.go @@ -1,6 +1,7 @@ package pandas import ( + "github.com/mymmsc/gox/logger" "strconv" "strings" ) @@ -75,6 +76,7 @@ func AnyToString(v any) string { case int32: return []string{strconv.FormatInt(int64(val), 10)}[0] default: + logger.Errorf("%s, error=The type is not recognized\n", v) _ = v.(string) // Intentionally panic return Nil2String }