|
Вопрос - как искать другие комбинации параметров Набор данных, о котором идёт речь - это статья S. H. Holt, J. C. Miller, P. Petocz. An insulin index of foods: the insulin demand generated by 1000-kJ portions of common foods, The American Journal of Clinical Nutrition, Volume 66, Issue 5, November 1997, Pages 1264–1276, откуда Вы пытаетесь найти зависимость между регрессором Glucose AUC (  ) и предикторами Fat, Protein, Sugar, Starch, Fiber, Water (  )? Я выгрузил набор данных из статьи:
- X <- structure(c(0.4, 0.3, 0, 0.6, 14.4, 11.9, 13.4, 10.9, 5.4, 9.4,
- 5.3, 13.4, 0, 20.1, 16.2, 13, 20, 17.9, 4.6, 1.7, 1, 2.1, 2.6,
- 5.4, 0.5, 2.1, 0.8, 1.6, 1, 8.7, 2.1, 2.1, 2.2, 3.1, 6.1, 6.2,
- 2.9, 3.2, 4.7, 1.3, 6.9, 6.1, 4.3, 4.3, 2.4, 5.8, 2.9, 11.8,
- 5.2, 5.3, 9.6, 2.7, 4.6, 15, 19.6, 19.4, 16.1, 56.3, 8.5, 7.6,
- 9.4, 5, 5.2, 7.8, 11.3, 10, 3.9, 8.4, 15.3, 8.7, 9.7, 10.7, 10.9,
- 11.7, 56.9, 47.2, 56.5, 50.6, 3.1, 20.1, 8.9, 18.7, 1.3, 36.7,
- 37.6, 25.8, 44.6, 1.7, 0.2, 2.1, 0.1, 0.5, 4.2, 16.1, 0, 1.8,
- 1.7, 2.4, 0.1, 0.5, 2, 0.7, 3.1, 1.1, 10.2, 14, 31.1, 13.7, 17.1,
- 7.5, 13.9, 0, 8.4, 2.2, 0, 18.6, 10.5, 17, 16.2, 40.2, 1.1, 0,
- 0, 11.5, 3.7, 22.1, 25.3, 0, 0, 24.9, 23.2, 0, 44.1, 43.7, 37.6,
- 56, 52.6, 47.1, 47.8, 45.9, 35.4, 36.1, 27.2, 17, 29.1, 19.8,
- 29, 29.4, 3.6, 6.1, 9.1, 12.5, 1.8, 0.7, 1.4, 1, 1.6, 1.7, 0.5,
- 0, 0, 2.4, 2.4, 6.2, 0, 0, 11.4, 16.8, 0, 3.3, 6.6, 6.5, 0.4,
- 1.4, 3.5, 10.9, 9.2, 3.5, 1.5, 1.4, 2.6, 3.2, 6.6, 4.7, 14.1,
- 317, 210.1, 360.9, 539.4, 13.5, 10.7, 16.1, 2.1, 2.2, 3.5, 187,
- 74.2, 12.2, 0.6, 1.1, 1.7, 20.9, 119.4, 222, 267.1, 250, 36.1,
- 40.3, 41.4, 140, 93.9, 134.8, 132.6, 290.8, 33.8, 110.9, 111.2,
- 115, 119.1, 114.1, 333.7, 111), .Dim = c(37L, 6L), .Dimnames = list(
- c("Fruit Grapes", "Fruit Bananas", "Fruit Apples", "Fruit Oranges",
- "Bakery products Croissants", "Bakery products Cake", "Bakery products Doughnuts",
- "Bakery products Cookies", "Bakery products Crackers", "Snacks and confectionery Mars Bar",
- "Snacks and confectionery Yogurt", "Snacks and confectionery Ice cream",
- "Snacks and confectionery Jellybeans", "Snacks and confectionery Peanuts",
- "Snacks and confectionery Potato chips", "Snacks and confectionery Popcorn",
- "Protein-rich foods Cheese", "Protein-rich foods Eggs", "Protein-rich foods Lentils",
- "Protein-rich foods Baked beans", "Protein-rich foods Fish",
- "Carbohydrate-rich foods White bread", "Carbohydrate-rich foods Whole-meal bread",
- "Carbohydrate-rich foods Grain bread", "Carbohydrate-rich foods White rice",
- "Carbohydrate-rich foods Brown rice", "Carbohydrate-rich foods White pasta",
- "Carbohydrate-rich foods Brown pasta", "Carbohydrate-rich foods Potatoes",
- "Carbohydrate-rich foods French fries", "Breakfast cereals Cornflakes",
- "Breakfast cereals Special K", "Breakfast cereals Honeysmacks",
- "Breakfast cereals Sustain", "Breakfast cereals Muesli",
- "Breakfast cereals Porridge", "Breakfast cereals All-Bran"
- ), c("Fat", "Protein", "Sugar", "Starch", "Fiber", "Water"
- )))
-
- y <- structure(c(126, 133, 83, 66, 89, 61, 78, 92, 139, 98, 88, 93,
- 161, 20, 77, 71, 42, 36, 63, 110, 29, 120, 106, 68, 129, 113,
- 50, 74, 148, 70, 110, 106, 91, 93, 65, 80, 59, 12293, 12445,
- 8919, 9345, 13097, 14305, 12445, 15223, 14673, 16682, 15611,
- 12348, 22860, 3047, 8195, 6537, 5994, 4744, 9268, 20106, 9350,
- 12882, 11203, 6659, 8143, 6240, 4456, 4535, 13930, 7643, 8768,
- 8038, 9102, 8938, 6034, 5093, 4299, 113, 108, 118, 166, 483,
- 178, 113, 200, 331, 218, 167, 172, 133, 214, 169, 109, 268, 135,
- 307, 183, 775, 112, 122, 106, 69, 58, 156, 67, 120, 146, 88,
- 95, 108, 102, 118, 74, 87, 216, 224, 152, 185, 604, 467, 480,
- 436, 354, 441, 415, 479, 407, 564, 367, 239, 64257, 9340, 325,
- 504, NA, 281, 247, 166, 145, 117, 91, 93, 284, 209, 189, 195,
- 189, 209, 163, 139, 99, 31, 45, 20, 15, 215, 223, 191, 298, 253,
- 309, 65, 103, 260, 80, 186, 139, 106, 30, 37, 57, 28, 137, 111,
- 62, 40, 42, 22, 21, 38, 82, 52, 47, 53, 53, 34, 13, 25, 74, 79,
- 50, 39, 74, 56, 63, 74, 118, 79, 62, 70, 118, 12, 52, 62, 55,
- 42, 62, 114, 28, 100, 97, 60, 110, 104, 46, 68, 141, 71, 76,
- 70, 60, 66, 43, 60, 40, 82, 81, 59, 60, 79, 82, 74, 92, 87, 122,
- 115, 89, 160, 20, 61, 54, 45, 31, 58, 120, 59, 100, 96, 56, 79,
- 62, 40, 40, 121, 74, 75, 66, 67, 71, 46, 40, 32), .Dim = c(37L,
- 7L), .Dimnames = list(c("Fruit Grapes", "Fruit Bananas", "Fruit Apples",
- "Fruit Oranges", "Bakery products Croissants", "Bakery products Cake",
- "Bakery products Doughnuts", "Bakery products Cookies", "Bakery products Crackers",
- "Snacks and confectionery Mars Bar", "Snacks and confectionery Yogurt",
- "Snacks and confectionery Ice cream", "Snacks and confectionery Jellybeans",
- "Snacks and confectionery Peanuts", "Snacks and confectionery Potato chips",
- "Snacks and confectionery Popcorn", "Protein-rich foods Cheese",
- "Protein-rich foods Eggs", "Protein-rich foods Lentils", "Protein-rich foods Baked beans",
- "Protein-rich foods Fish", "Carbohydrate-rich foods White bread",
- "Carbohydrate-rich foods Whole-meal bread", "Carbohydrate-rich foods Grain bread",
- "Carbohydrate-rich foods White rice", "Carbohydrate-rich foods Brown rice",
- "Carbohydrate-rich foods White pasta", "Carbohydrate-rich foods Brown pasta",
- "Carbohydrate-rich foods Potatoes", "Carbohydrate-rich foods French fries",
- "Breakfast cereals Cornflakes", "Breakfast cereals Special K",
- "Breakfast cereals Honeysmacks", "Breakfast cereals Sustain",
- "Breakfast cereals Muesli", "Breakfast cereals Porridge", "Breakfast cereals All-Bran"
- ), c("Glucose AUC, mol • min/L", "Insulin AUC, pmol • min/L",
- "Insulin AUC: glucose AUC", "Insulin AUC per g carbohydrate, pmol • min • L-1 • g-1",
- "Insulin AUC per g serving weight, pmol • min • L-1 • g-1",
- "Glucose score, %", "Insulin score, %")))
-
- ySEM <- structure(c(14, 12, 7, 11, 6, 11, 14, 12, 26, 10, 23, 17, 18,
- 7, 15, 12, 10, 11, 17, 14, 14, 13, 14, 9, 16, 13, 11, 7, 24,
- 11, 11, 14, 10, 8, 12, 9, 9, 1190, 1353, 910, 1074, 2978, 3472,
- 2402, 382, 2686, 1896, 1808, 1867, 368, 828, 1577, 679, 1590,
- 1017, 2174, 3776, 2055, 1901, 1420, 837, 683, 616, 453, 574,
- 1467, 713, 623, 635, 1506, 757, 813, 493, 612, 19, 22, 18, 23,
- 244, 54, 21, 57, 104, 65, 33, 38, 27, 88, 78, 32, 153, 92, 103,
- 44, 502, 15, 20, 12, 5, 5, 48, 10, 19, 29, 5, 14, 12, 9, 18,
- 11, 15, 21, 24, 15, 21, 137, 113, 93, 110, 65, 50, 48, 72, 64,
- 153, 71, 25, 15013, 1845, 68, 87, NA, 41, 31, 21, 12, 11, 9,
- 12, 30, 19, 13, 15, 31, 18, 22, 13, 14, 3, 5, 2, 2, 49, 54, 37,
- 75, 46, 35, 7, 16, 41, 22, 36, 14, 27, 6, 9, 11, 6, 20, 14, 8,
- 3, 4, 2, 3, 4, 8, 4, 4, 9, 4, 5, 1, 3, 9, 10, 6, 7, 9, 14, 12,
- 11, 24, 13, 15, 19, 18, 4, 9, 16, 18, 16, 22, 18, 13, 0, 17,
- 12, 15, 18, 10, 10, 35, 16, 11, 9, 7, 6, 7, 12, 7, 6, 5, 4, 3,
- 14, 12, 9, 15, 12, 15, 13, 13, 16, 5, 14, 9, 13, 6, 12, 19, 18,
- 0, 12, 6, 12, 11, 5, 5, 11, 12, 8, 5, 6, 6, 5, 4, 4), .Dim = c(37L,
- 7L), .Dimnames = list(c("Fruit Grapes", "Fruit Bananas", "Fruit Apples",
- "Fruit Oranges", "Bakery products Croissants", "Bakery products Cake",
- "Bakery products Doughnuts", "Bakery products Cookies", "Bakery products Crackers",
- "Snacks and confectionery Mars Bar", "Snacks and confectionery Yogurt",
- "Snacks and confectionery Ice cream", "Snacks and confectionery Jellybeans",
- "Snacks and confectionery Peanuts", "Snacks and confectionery Potato chips",
- "Snacks and confectionery Popcorn", "Protein-rich foods Cheese",
- "Protein-rich foods Eggs", "Protein-rich foods Lentils", "Protein-rich foods Baked beans",
- "Protein-rich foods Fish", "Carbohydrate-rich foods White bread",
- "Carbohydrate-rich foods Whole-meal bread", "Carbohydrate-rich foods Grain bread",
- "Carbohydrate-rich foods White rice", "Carbohydrate-rich foods Brown rice",
- "Carbohydrate-rich foods White pasta", "Carbohydrate-rich foods Brown pasta",
- "Carbohydrate-rich foods Potatoes", "Carbohydrate-rich foods French fries",
- "Breakfast cereals Cornflakes", "Breakfast cereals Special K",
- "Breakfast cereals Honeysmacks", "Breakfast cereals Sustain",
- "Breakfast cereals Muesli", "Breakfast cereals Porridge", "Breakfast cereals All-Bran"
- ), c("Glucose AUC, mol • min/L", "Insulin AUC, pmol • min/L",
- "Insulin AUC: glucose AUC", "Insulin AUC per g carbohydrate, pmol • min • L-1 • g-1",
- "Insulin AUC per g serving weight, pmol • min • L-1 • g-1",
- "Glucose score, %", "Insulin score, %")))
И попробовал выполнить похожие действия:
- # chosen regressor
- n <- "Glucose AUC, mol • min/L"
-
- # the linear model is relatively easy to fit
- lin.model <- lm(y[,n] ~ X + 0, weights = ySEM[,n])
- print(summary(lin.model))
-
- # nonlinear regression is complicated:
- # - need starting values for all parameters
- # - no guarantees we won't end up in a local optimum
-
- # for technical reasons we'll have to remove "special characters" from variable names
- colnames(X) <- sub(', g', '', colnames(X))
- nonlin.data <- as.data.frame(cbind(X, y = y[,n]))
-
- # starting values from the linear model (power = 1)
- nonlin.start <- list()
- nonlin.start[paste0(colnames(X), '_slope')] <- coef(lin.model)
- nonlin.start[paste0(colnames(X), '_power')] <- 1
-
- nonlin.model <- nlsLM(
- formula = as.formula(paste( # some dark magic to build a formula from variable names
- 'y ~', paste(
- colnames(X), '_slope * ', colnames(X), ' ^ ', colnames(X), '_power',
- collapse = ' + ', sep = ''
- )
- )),
- data = nonlin.data, start = nonlin.start, weights = ySEM[,n],
- # some more dark magic to make sure that we don't stop before reaching convergence
- control = nls.lm.control(maxiter = 256, maxfev = 65536)
- )
- print(summary(nonlin.model))
-
- # print RMSE from both models
- print(sqrt(mean((predict(nonlin.model) - y[,n])^2)))
- print(sqrt(mean((predict(lin.model) - y[,n])^2)))
-
- # take one sample out of the model and see how parameters change
- lapply(seq_len(nrow(X)), function(i) {
- suppressWarnings(nlsLM(
- formula = as.formula(paste(
- 'y ~', paste(
- colnames(X), '_slope * ', colnames(X), ' ^ ', colnames(X), '_power',
- collapse = ' + ', sep = ''
- )
- )),
- data = nonlin.data[-i,], start = nonlin.start, weights = ySEM[-i,n],
- control = nls.lm.control(maxiter = 1024, maxfev = 65536)
- ))
- }) -> LOOCV
-
- # some models still don't converge, skip them
- print(summary(t(sapply(Filter(function(x) x[['convInfo']][['isConv']], LOOCV), coef))))
Не знаю, какие  даёт предложенная Вами процедура, но у меня на этом наборе данных получается  для нелинейной регрессии и  - для линейной. Вроде бы, это соответсвует найденному Вами улучшению в  при переходе к нелинейной модели. К сожалению, найденная при помощи метода Левенберга-Маркуардта формула выглядит как  А перекрёстная проверка методом вытаскивания даже по одному образцу из обучающего набора (которая, как верно заметил Andrey_Kireew, даёт чересчур оптимистичный результат) показывает разброс для предсказанных значений параметров на порядки, ошибку на обучающем наборе ![$ \in [ 17,9 ; 20,4 ] $ $ \in [ 17,9 ; 20,4 ] $](https://dxdy-03.korotkov.co.uk/f/a/8/1/a816a3b83cb6b04095b4037f78ddc7da82.png) и на исключённом образце ![$ \in [ 0,60; 68 ] $ $ \in [ 0,60; 68 ] $](https://dxdy-03.korotkov.co.uk/f/e/3/a/e3a0d51253d0d53a3a65fac9cbfc7b5182.png) . Всё это говорит о переобученности модели; в ней слишком много степеней свободы, и она знает слишком много способов описать набор данных - одинаково хорошо.
|
|